HTML reader: extract submodules
Reducing module size should reduce memory use during compilation. This is preparatory work to tackle support for more table features.
This commit is contained in:
parent
75c881e2d9
commit
f9258371dd
5 changed files with 345 additions and 239 deletions
|
@ -599,6 +599,9 @@ library
|
|||
Text.Pandoc.Readers.Docx.Parse.Styles,
|
||||
Text.Pandoc.Readers.Docx.Util,
|
||||
Text.Pandoc.Readers.Docx.Fields,
|
||||
Text.Pandoc.Readers.HTML.Parsing,
|
||||
Text.Pandoc.Readers.HTML.TagCategories,
|
||||
Text.Pandoc.Readers.HTML.Types,
|
||||
Text.Pandoc.Readers.LaTeX.Parsing,
|
||||
Text.Pandoc.Readers.LaTeX.Lang,
|
||||
Text.Pandoc.Readers.LaTeX.SIunitx,
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{-# LANGUAGE LambdaCase #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||
{-# LANGUAGE LambdaCase #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE ViewPatterns #-}
|
||||
{- |
|
||||
|
@ -29,7 +28,8 @@ import Control.Applicative ((<|>))
|
|||
import Control.Arrow (first)
|
||||
import Control.Monad (guard, mplus, msum, mzero, unless, void)
|
||||
import Control.Monad.Except (throwError)
|
||||
import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT)
|
||||
import Control.Monad.Reader (ask, asks, lift, local, runReaderT)
|
||||
import Data.ByteString.Base64 (encode)
|
||||
import Data.Char (isAlphaNum, isLetter)
|
||||
import Data.Default (Default (..), def)
|
||||
import Data.Foldable (for_)
|
||||
|
@ -40,17 +40,19 @@ import Data.Monoid (First (..))
|
|||
import qualified Data.Set as Set
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
import Network.URI (URI, nonStrictRelativeTo, parseURIReference)
|
||||
import Network.URI (nonStrictRelativeTo, parseURIReference)
|
||||
import Text.HTML.TagSoup
|
||||
import Text.HTML.TagSoup.Match
|
||||
import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines)
|
||||
import Text.Pandoc.Builder (Blocks, Inlines, trimInlines)
|
||||
import qualified Text.Pandoc.Builder as B
|
||||
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
|
||||
import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
|
||||
import qualified Text.Pandoc.UTF8 as UTF8
|
||||
import Text.Pandoc.Definition
|
||||
import Text.Pandoc.Readers.HTML.Parsing
|
||||
import Text.Pandoc.Readers.HTML.TagCategories
|
||||
import Text.Pandoc.Readers.HTML.Types
|
||||
import Text.Pandoc.Readers.LaTeX (rawLaTeXInline)
|
||||
import Text.Pandoc.Readers.LaTeX.Types (Macro)
|
||||
import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
|
||||
import Text.Pandoc.Error
|
||||
import Text.Pandoc.Logging
|
||||
|
@ -66,7 +68,6 @@ import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
|
|||
import Text.Pandoc.Walk
|
||||
import Text.Parsec.Error
|
||||
import Text.TeXMath (readMathML, writeTeX)
|
||||
import Data.ByteString.Base64 (encode)
|
||||
|
||||
-- | Convert HTML-formatted string to 'Pandoc' document.
|
||||
readHtml :: PandocMonad m
|
||||
|
@ -105,32 +106,12 @@ replaceNotes' noteTbl (RawInline (Format "noteref") ref) =
|
|||
maybe (Str "") (Note . B.toList) $ lookup ref noteTbl
|
||||
replaceNotes' _ x = x
|
||||
|
||||
data HTMLState =
|
||||
HTMLState
|
||||
{ parserState :: ParserState,
|
||||
noteTable :: [(Text, Blocks)],
|
||||
baseHref :: Maybe URI,
|
||||
identifiers :: Set.Set Text,
|
||||
logMessages :: [LogMessage],
|
||||
macros :: M.Map Text Macro,
|
||||
readerOpts :: ReaderOptions
|
||||
}
|
||||
|
||||
data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
|
||||
, inChapter :: Bool -- ^ Set if in chapter section
|
||||
, inPlain :: Bool -- ^ Set if in pPlain
|
||||
}
|
||||
|
||||
setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
|
||||
setInChapter = local (\s -> s {inChapter = True})
|
||||
|
||||
setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
|
||||
setInPlain = local (\s -> s {inPlain = True})
|
||||
|
||||
type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m)
|
||||
|
||||
type TagParser m = HTMLParser m [Tag Text]
|
||||
|
||||
pHtml :: PandocMonad m => TagParser m Blocks
|
||||
pHtml = try $ do
|
||||
(TagOpen "html" attr) <- lookAhead pAny
|
||||
|
@ -681,22 +662,6 @@ inline = choice
|
|||
, pRawHtmlInline
|
||||
]
|
||||
|
||||
pLocation :: PandocMonad m => TagParser m ()
|
||||
pLocation = do
|
||||
(TagPosition r c) <- pSat isTagPosition
|
||||
setPosition $ newPos "input" r c
|
||||
|
||||
pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
|
||||
pSat f = do
|
||||
pos <- getPosition
|
||||
token tshow (const pos) (\x -> if f x then Just x else Nothing)
|
||||
|
||||
pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
|
||||
pSatisfy f = try $ optional pLocation >> pSat f
|
||||
|
||||
pAny :: PandocMonad m => TagParser m (Tag Text)
|
||||
pAny = pSatisfy (const True)
|
||||
|
||||
pSelfClosing :: PandocMonad m
|
||||
=> (Text -> Bool) -> ([Attribute Text] -> Bool)
|
||||
-> TagParser m (Tag Text)
|
||||
|
@ -924,49 +889,6 @@ pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines)
|
|||
-> TagParser m Inlines
|
||||
pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
|
||||
|
||||
pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a
|
||||
pInTags tagtype parser = pInTags' tagtype (const True) parser
|
||||
|
||||
pInTags' :: (PandocMonad m, Monoid a)
|
||||
=> Text
|
||||
-> (Tag Text -> Bool)
|
||||
-> TagParser m a
|
||||
-> TagParser m a
|
||||
pInTags' tagtype tagtest parser = try $ do
|
||||
pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
|
||||
mconcat <$> manyTill parser (pCloses tagtype <|> eof)
|
||||
|
||||
-- parses p, preceded by an opening tag (optional if tagsOptional)
|
||||
-- and followed by a closing tag (optional if tagsOptional)
|
||||
pInTag :: PandocMonad m => Bool -> Text -> TagParser m a -> TagParser m a
|
||||
pInTag tagsOptional tagtype p = try $ do
|
||||
skipMany pBlank
|
||||
(if tagsOptional then optional else void) $ pSatisfy (matchTagOpen tagtype [])
|
||||
skipMany pBlank
|
||||
x <- p
|
||||
skipMany pBlank
|
||||
(if tagsOptional then optional else void) $ pSatisfy (matchTagClose tagtype)
|
||||
skipMany pBlank
|
||||
return x
|
||||
|
||||
pCloses :: PandocMonad m => Text -> TagParser m ()
|
||||
pCloses tagtype = try $ do
|
||||
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
|
||||
case t of
|
||||
(TagClose t') | t' == tagtype -> void pAny
|
||||
(TagOpen t' _) | t' `closes` tagtype -> return ()
|
||||
(TagClose "ul") | tagtype == "li" -> return ()
|
||||
(TagClose "ol") | tagtype == "li" -> return ()
|
||||
(TagClose "dl") | tagtype == "dd" -> return ()
|
||||
(TagClose "table") | tagtype == "td" -> return ()
|
||||
(TagClose "table") | tagtype == "th" -> return ()
|
||||
(TagClose "table") | tagtype == "tr" -> return ()
|
||||
(TagClose "td") | tagtype `Set.member` blockHtmlTags -> return ()
|
||||
(TagClose "th") | tagtype `Set.member` blockHtmlTags -> return ()
|
||||
(TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags
|
||||
-> return () -- see #3794
|
||||
_ -> mzero
|
||||
|
||||
pTagText :: PandocMonad m => TagParser m Inlines
|
||||
pTagText = try $ do
|
||||
(TagText str) <- pSatisfy isTagText
|
||||
|
@ -975,14 +897,10 @@ pTagText = try $ do
|
|||
parsed <- lift $ lift $
|
||||
flip runReaderT qu $ runParserT (many pTagContents) st "text" str
|
||||
case parsed of
|
||||
Left _ -> throwError $ PandocParseError $ "Could not parse `" <> str <> "'"
|
||||
Left _ -> throwError $ PandocParseError $
|
||||
"Could not parse `" <> str <> "'"
|
||||
Right result -> return $ mconcat result
|
||||
|
||||
pBlank :: PandocMonad m => TagParser m ()
|
||||
pBlank = try $ do
|
||||
(TagText str) <- pSatisfy isTagText
|
||||
guard $ T.all isSpace str
|
||||
|
||||
type InlinesParser m = HTMLParser m Text
|
||||
|
||||
pTagContents :: PandocMonad m => InlinesParser m Inlines
|
||||
|
@ -1077,54 +995,6 @@ pSpace = many1 (satisfy isSpace) >>= \xs ->
|
|||
then return B.softbreak
|
||||
else return B.space
|
||||
|
||||
--
|
||||
-- Constants
|
||||
--
|
||||
|
||||
eitherBlockOrInline :: Set.Set Text
|
||||
eitherBlockOrInline = Set.fromList
|
||||
["audio", "applet", "button", "iframe", "embed",
|
||||
"del", "ins", "progress", "map", "area", "noscript", "script",
|
||||
"object", "svg", "video", "source"]
|
||||
|
||||
blockHtmlTags :: Set.Set Text
|
||||
blockHtmlTags = Set.fromList
|
||||
["?xml", "!DOCTYPE", "address", "article", "aside",
|
||||
"blockquote", "body", "canvas",
|
||||
"caption", "center", "col", "colgroup", "dd", "details",
|
||||
"dir", "div", "dl", "dt", "fieldset", "figcaption", "figure",
|
||||
"footer", "form", "h1", "h2", "h3", "h4",
|
||||
"h5", "h6", "head", "header", "hgroup", "hr", "html",
|
||||
"isindex", "main", "menu", "meta", "noframes", "nav",
|
||||
"ol", "output", "p", "pre",
|
||||
"section", "summary", "table", "tbody", "textarea",
|
||||
"thead", "tfoot", "ul", "dd",
|
||||
"dt", "frameset", "li", "tbody", "td", "tfoot",
|
||||
"th", "thead", "tr", "script", "style"]
|
||||
|
||||
-- We want to allow raw docbook in markdown documents, so we
|
||||
-- include docbook block tags here too.
|
||||
blockDocBookTags :: Set.Set Text
|
||||
blockDocBookTags = Set.fromList
|
||||
["calloutlist", "bibliolist", "glosslist", "itemizedlist",
|
||||
"orderedlist", "segmentedlist", "simplelist",
|
||||
"variablelist", "caution", "important", "note", "tip",
|
||||
"warning", "address", "literallayout", "programlisting",
|
||||
"programlistingco", "screen", "screenco", "screenshot",
|
||||
"synopsis", "example", "informalexample", "figure",
|
||||
"informalfigure", "table", "informaltable", "para",
|
||||
"simpara", "formalpara", "equation", "informalequation",
|
||||
"figure", "screenshot", "mediaobject", "qandaset",
|
||||
"procedure", "task", "cmdsynopsis", "funcsynopsis",
|
||||
"classsynopsis", "blockquote", "epigraph", "msgset",
|
||||
"sidebar", "title"]
|
||||
|
||||
epubTags :: Set.Set Text
|
||||
epubTags = Set.fromList ["case", "switch", "default"]
|
||||
|
||||
blockTags :: Set.Set Text
|
||||
blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags]
|
||||
|
||||
class NamedTag a where
|
||||
getTagName :: a -> Maybe Text
|
||||
|
||||
|
@ -1162,47 +1032,6 @@ isTextTag = tagText (const True)
|
|||
isCommentTag :: Tag a -> Bool
|
||||
isCommentTag = tagComment (const True)
|
||||
|
||||
-- taken from HXT and extended
|
||||
-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
|
||||
closes :: Text -> Text -> Bool
|
||||
_ `closes` "body" = False
|
||||
_ `closes` "html" = False
|
||||
"body" `closes` "head" = True
|
||||
"a" `closes` "a" = True
|
||||
"li" `closes` "li" = True
|
||||
"th" `closes` t | t `elem` ["th","td"] = True
|
||||
"td" `closes` t | t `elem` ["th","td"] = True
|
||||
"tr" `closes` t | t `elem` ["th","td","tr"] = True
|
||||
"dd" `closes` t | t `elem` ["dt", "dd"] = True
|
||||
"dt" `closes` t | t `elem` ["dt","dd"] = True
|
||||
"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
|
||||
"optgroup" `closes` "optgroup" = True
|
||||
"optgroup" `closes` "option" = True
|
||||
"option" `closes` "option" = True
|
||||
-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags
|
||||
x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
|
||||
"dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
|
||||
"h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section",
|
||||
"table", "ul"] = True
|
||||
_ `closes` "meta" = True
|
||||
"form" `closes` "form" = True
|
||||
"label" `closes` "label" = True
|
||||
"map" `closes` "map" = True
|
||||
"object" `closes` "object" = True
|
||||
_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
|
||||
t `closes` "select" | t /= "option" = True
|
||||
"thead" `closes` "colgroup" = True
|
||||
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
|
||||
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
|
||||
t `closes` t2 |
|
||||
t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] &&
|
||||
t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main"
|
||||
t1 `closes` t2 |
|
||||
t1 `Set.member` blockTags &&
|
||||
t2 `Set.notMember` blockTags &&
|
||||
t2 `Set.notMember` eitherBlockOrInline = True
|
||||
_ `closes` _ = False
|
||||
|
||||
--- parsers for use in markdown, textile readers
|
||||
|
||||
-- | Matches a stretch of HTML in balanced tags.
|
||||
|
@ -1347,13 +1176,6 @@ stripPrefix' s =
|
|||
if T.null t then s else T.drop 1 t
|
||||
where (_, t) = T.span (/= ':') s
|
||||
|
||||
isSpace :: Char -> Bool
|
||||
isSpace ' ' = True
|
||||
isSpace '\t' = True
|
||||
isSpace '\n' = True
|
||||
isSpace '\r' = True
|
||||
isSpace _ = False
|
||||
|
||||
-- Utilities
|
||||
|
||||
-- | Adjusts a url according to the document's base URL.
|
||||
|
@ -1364,41 +1186,6 @@ canonicalizeUrl url = do
|
|||
(Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
|
||||
_ -> url
|
||||
|
||||
|
||||
-- Instances
|
||||
|
||||
instance HasMacros HTMLState where
|
||||
extractMacros = macros
|
||||
updateMacros f st = st{ macros = f $ macros st }
|
||||
|
||||
instance HasIdentifierList HTMLState where
|
||||
extractIdentifierList = identifiers
|
||||
updateIdentifierList f s = s{ identifiers = f (identifiers s) }
|
||||
|
||||
instance HasLogMessages HTMLState where
|
||||
addLogMessage m s = s{ logMessages = m : logMessages s }
|
||||
getLogMessages = reverse . logMessages
|
||||
|
||||
-- This signature should be more general
|
||||
-- MonadReader HTMLLocal m => HasQuoteContext st m
|
||||
instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where
|
||||
getQuoteContext = asks quoteContext
|
||||
withQuoteContext q = local (\s -> s{quoteContext = q})
|
||||
|
||||
instance HasReaderOptions HTMLState where
|
||||
extractReaderOptions = extractReaderOptions . parserState
|
||||
|
||||
instance HasMeta HTMLState where
|
||||
setMeta s b st = st {parserState = setMeta s b $ parserState st}
|
||||
deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
|
||||
|
||||
instance Default HTMLLocal where
|
||||
def = HTMLLocal NoQuote False False
|
||||
|
||||
instance HasLastStrPosition HTMLState where
|
||||
setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
|
||||
getLastStrPos = getLastStrPos . parserState
|
||||
|
||||
-- For now we need a special version here; the one in Shared has String type
|
||||
renderTags' :: [Tag Text] -> Text
|
||||
renderTags' = renderTagsOptions
|
||||
|
@ -1411,21 +1198,6 @@ renderTags' = renderTagsOptions
|
|||
-- EPUB Specific
|
||||
--
|
||||
--
|
||||
sectioningContent :: [Text]
|
||||
sectioningContent = ["article", "aside", "nav", "section"]
|
||||
|
||||
|
||||
groupingContent :: [Text]
|
||||
groupingContent = ["p", "hr", "pre", "blockquote", "ol"
|
||||
, "ul", "li", "dl", "dt", "dt", "dd"
|
||||
, "figure", "figcaption", "div", "main"]
|
||||
|
||||
matchTagClose :: Text -> (Tag Text -> Bool)
|
||||
matchTagClose t = (~== TagClose t)
|
||||
|
||||
matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool)
|
||||
matchTagOpen t as = (~== TagOpen t as)
|
||||
|
||||
{-
|
||||
|
||||
types :: [(String, ([String], Int))]
|
||||
|
|
156
src/Text/Pandoc/Readers/HTML/Parsing.hs
Normal file
156
src/Text/Pandoc/Readers/HTML/Parsing.hs
Normal file
|
@ -0,0 +1,156 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{- |
|
||||
Module : Text.Pandoc.Readers.HTML.Parsing
|
||||
Copyright : Copyright (C) 2006-2020 John MacFarlane
|
||||
License : GNU GPL, version 2 or above
|
||||
|
||||
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
||||
Stability : alpha
|
||||
Portability : portable
|
||||
|
||||
Parsing functions and utilities.
|
||||
-}
|
||||
module Text.Pandoc.Readers.HTML.Parsing
|
||||
( pInTags
|
||||
, pInTags'
|
||||
, pInTag
|
||||
, pAny
|
||||
, pCloses
|
||||
, pSatisfy
|
||||
, pBlank
|
||||
, matchTagClose
|
||||
, matchTagOpen
|
||||
, isSpace
|
||||
)
|
||||
where
|
||||
|
||||
import Control.Monad (guard, void, mzero)
|
||||
import Data.Text (Text)
|
||||
import Text.HTML.TagSoup
|
||||
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
|
||||
import Text.Pandoc.Parsing
|
||||
( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
|
||||
, skipMany, setPosition, token, try)
|
||||
import Text.Pandoc.Readers.HTML.TagCategories
|
||||
import Text.Pandoc.Readers.HTML.Types
|
||||
import Text.Pandoc.Shared (tshow)
|
||||
import qualified Data.Set as Set
|
||||
import qualified Data.Text as T
|
||||
|
||||
pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a
|
||||
pInTags tagtype parser = pInTags' tagtype (const True) parser
|
||||
|
||||
pInTags' :: (PandocMonad m, Monoid a)
|
||||
=> Text
|
||||
-> (Tag Text -> Bool)
|
||||
-> TagParser m a
|
||||
-> TagParser m a
|
||||
pInTags' tagtype tagtest parser = try $ do
|
||||
pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
|
||||
mconcat <$> manyTill parser (pCloses tagtype <|> eof)
|
||||
|
||||
-- parses p, preceded by an opening tag (optional if tagsOptional)
|
||||
-- and followed by a closing tag (optional if tagsOptional)
|
||||
pInTag :: PandocMonad m => Bool -> Text -> TagParser m a -> TagParser m a
|
||||
pInTag tagsOptional tagtype p = try $ do
|
||||
skipMany pBlank
|
||||
(if tagsOptional then optional else void) $ pSatisfy (matchTagOpen tagtype [])
|
||||
skipMany pBlank
|
||||
x <- p
|
||||
skipMany pBlank
|
||||
(if tagsOptional then optional else void) $ pSatisfy (matchTagClose tagtype)
|
||||
skipMany pBlank
|
||||
return x
|
||||
|
||||
pCloses :: PandocMonad m => Text -> TagParser m ()
|
||||
pCloses tagtype = try $ do
|
||||
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
|
||||
case t of
|
||||
(TagClose t') | t' == tagtype -> void pAny
|
||||
(TagOpen t' _) | t' `closes` tagtype -> return ()
|
||||
(TagClose "ul") | tagtype == "li" -> return ()
|
||||
(TagClose "ol") | tagtype == "li" -> return ()
|
||||
(TagClose "dl") | tagtype == "dd" -> return ()
|
||||
(TagClose "table") | tagtype == "td" -> return ()
|
||||
(TagClose "table") | tagtype == "th" -> return ()
|
||||
(TagClose "table") | tagtype == "tr" -> return ()
|
||||
(TagClose "td") | tagtype `Set.member` blockHtmlTags -> return ()
|
||||
(TagClose "th") | tagtype `Set.member` blockHtmlTags -> return ()
|
||||
(TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags
|
||||
-> return () -- see #3794
|
||||
_ -> mzero
|
||||
|
||||
pBlank :: PandocMonad m => TagParser m ()
|
||||
pBlank = try $ do
|
||||
(TagText str) <- pSatisfy isTagText
|
||||
guard $ T.all isSpace str
|
||||
|
||||
pLocation :: PandocMonad m => TagParser m ()
|
||||
pLocation = do
|
||||
(TagPosition r c) <- pSat isTagPosition
|
||||
setPosition $ newPos "input" r c
|
||||
|
||||
pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
|
||||
pSat f = do
|
||||
pos <- getPosition
|
||||
token tshow (const pos) (\x -> if f x then Just x else Nothing)
|
||||
|
||||
pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
|
||||
pSatisfy f = try $ optional pLocation >> pSat f
|
||||
|
||||
matchTagClose :: Text -> (Tag Text -> Bool)
|
||||
matchTagClose t = (~== TagClose t)
|
||||
|
||||
matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool)
|
||||
matchTagOpen t as = (~== TagOpen t as)
|
||||
|
||||
pAny :: PandocMonad m => TagParser m (Tag Text)
|
||||
pAny = pSatisfy (const True)
|
||||
|
||||
isSpace :: Char -> Bool
|
||||
isSpace ' ' = True
|
||||
isSpace '\t' = True
|
||||
isSpace '\n' = True
|
||||
isSpace '\r' = True
|
||||
isSpace _ = False
|
||||
|
||||
-- taken from HXT and extended
|
||||
-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
|
||||
closes :: Text -> Text -> Bool
|
||||
_ `closes` "body" = False
|
||||
_ `closes` "html" = False
|
||||
"body" `closes` "head" = True
|
||||
"a" `closes` "a" = True
|
||||
"li" `closes` "li" = True
|
||||
"th" `closes` t | t `elem` ["th","td"] = True
|
||||
"td" `closes` t | t `elem` ["th","td"] = True
|
||||
"tr" `closes` t | t `elem` ["th","td","tr"] = True
|
||||
"dd" `closes` t | t `elem` ["dt", "dd"] = True
|
||||
"dt" `closes` t | t `elem` ["dt","dd"] = True
|
||||
"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
|
||||
"optgroup" `closes` "optgroup" = True
|
||||
"optgroup" `closes` "option" = True
|
||||
"option" `closes` "option" = True
|
||||
-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags
|
||||
x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
|
||||
"dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
|
||||
"h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section",
|
||||
"table", "ul"] = True
|
||||
_ `closes` "meta" = True
|
||||
"form" `closes` "form" = True
|
||||
"label" `closes` "label" = True
|
||||
"map" `closes` "map" = True
|
||||
"object" `closes` "object" = True
|
||||
_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
|
||||
t `closes` "select" | t /= "option" = True
|
||||
"thead" `closes` "colgroup" = True
|
||||
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
|
||||
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
|
||||
t `closes` t2 |
|
||||
t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] &&
|
||||
t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main"
|
||||
t1 `closes` t2 |
|
||||
t1 `Set.member` blockTags &&
|
||||
t2 `Set.notMember` blockTags &&
|
||||
t2 `Set.notMember` eitherBlockOrInline = True
|
||||
_ `closes` _ = False
|
78
src/Text/Pandoc/Readers/HTML/TagCategories.hs
Normal file
78
src/Text/Pandoc/Readers/HTML/TagCategories.hs
Normal file
|
@ -0,0 +1,78 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{- |
|
||||
Module : Text.Pandoc.Readers.HTML.TagCategories
|
||||
Copyright : Copyright (C) 2006-2020 John MacFarlane
|
||||
License : GNU GPL, version 2 or above
|
||||
|
||||
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
||||
Stability : alpha
|
||||
Portability : portable
|
||||
|
||||
Categories of tags.
|
||||
-}
|
||||
module Text.Pandoc.Readers.HTML.TagCategories
|
||||
( blockHtmlTags
|
||||
, blockDocBookTags
|
||||
, eitherBlockOrInline
|
||||
, epubTags
|
||||
, blockTags
|
||||
, sectioningContent
|
||||
, groupingContent
|
||||
)
|
||||
where
|
||||
|
||||
import Data.Set (Set, fromList, unions)
|
||||
import Data.Text (Text)
|
||||
|
||||
eitherBlockOrInline :: Set Text
|
||||
eitherBlockOrInline = fromList
|
||||
["audio", "applet", "button", "iframe", "embed",
|
||||
"del", "ins", "progress", "map", "area", "noscript", "script",
|
||||
"object", "svg", "video", "source"]
|
||||
|
||||
blockHtmlTags :: Set Text
|
||||
blockHtmlTags = fromList
|
||||
["?xml", "!DOCTYPE", "address", "article", "aside",
|
||||
"blockquote", "body", "canvas",
|
||||
"caption", "center", "col", "colgroup", "dd", "details",
|
||||
"dir", "div", "dl", "dt", "fieldset", "figcaption", "figure",
|
||||
"footer", "form", "h1", "h2", "h3", "h4",
|
||||
"h5", "h6", "head", "header", "hgroup", "hr", "html",
|
||||
"isindex", "main", "menu", "meta", "noframes", "nav",
|
||||
"ol", "output", "p", "pre",
|
||||
"section", "summary", "table", "tbody", "textarea",
|
||||
"thead", "tfoot", "ul", "dd",
|
||||
"dt", "frameset", "li", "tbody", "td", "tfoot",
|
||||
"th", "thead", "tr", "script", "style"]
|
||||
|
||||
-- We want to allow raw docbook in markdown documents, so we
|
||||
-- include docbook block tags here too.
|
||||
blockDocBookTags :: Set Text
|
||||
blockDocBookTags = fromList
|
||||
["calloutlist", "bibliolist", "glosslist", "itemizedlist",
|
||||
"orderedlist", "segmentedlist", "simplelist",
|
||||
"variablelist", "caution", "important", "note", "tip",
|
||||
"warning", "address", "literallayout", "programlisting",
|
||||
"programlistingco", "screen", "screenco", "screenshot",
|
||||
"synopsis", "example", "informalexample", "figure",
|
||||
"informalfigure", "table", "informaltable", "para",
|
||||
"simpara", "formalpara", "equation", "informalequation",
|
||||
"figure", "screenshot", "mediaobject", "qandaset",
|
||||
"procedure", "task", "cmdsynopsis", "funcsynopsis",
|
||||
"classsynopsis", "blockquote", "epigraph", "msgset",
|
||||
"sidebar", "title"]
|
||||
|
||||
epubTags :: Set Text
|
||||
epubTags = fromList ["case", "switch", "default"]
|
||||
|
||||
blockTags :: Set Text
|
||||
blockTags = unions [blockHtmlTags, blockDocBookTags, epubTags]
|
||||
|
||||
sectioningContent :: [Text]
|
||||
sectioningContent = ["article", "aside", "nav", "section"]
|
||||
|
||||
|
||||
groupingContent :: [Text]
|
||||
groupingContent = ["p", "hr", "pre", "blockquote", "ol"
|
||||
, "ul", "li", "dl", "dt", "dt", "dd"
|
||||
, "figure", "figcaption", "div", "main"]
|
97
src/Text/Pandoc/Readers/HTML/Types.hs
Normal file
97
src/Text/Pandoc/Readers/HTML/Types.hs
Normal file
|
@ -0,0 +1,97 @@
|
|||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||
{- |
|
||||
Module : Text.Pandoc.Readers.HTML.Types
|
||||
Copyright : Copyright (C) 2006-2020 John MacFarlane
|
||||
License : GNU GPL, version 2 or above
|
||||
|
||||
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
||||
Stability : alpha
|
||||
Portability : portable
|
||||
|
||||
Types for pandoc's HTML reader.
|
||||
-}
|
||||
module Text.Pandoc.Readers.HTML.Types
|
||||
( TagParser
|
||||
, HTMLParser
|
||||
, HTMLState (..)
|
||||
, HTMLLocal (..)
|
||||
)
|
||||
where
|
||||
|
||||
import Control.Monad.Reader (ReaderT, asks, local)
|
||||
import Data.Default (Default (def))
|
||||
import Data.Map (Map)
|
||||
import Data.Set (Set)
|
||||
import Data.Text (Text)
|
||||
import Network.URI (URI)
|
||||
import Text.HTML.TagSoup
|
||||
import Text.Pandoc.Builder (Blocks, HasMeta (..))
|
||||
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
|
||||
import Text.Pandoc.Logging (LogMessage)
|
||||
import Text.Pandoc.Options (ReaderOptions)
|
||||
import Text.Pandoc.Parsing
|
||||
( HasIdentifierList (..), HasLastStrPosition (..), HasLogMessages (..)
|
||||
, HasMacros (..), HasQuoteContext (..), HasReaderOptions (..)
|
||||
, ParserT, ParserState, QuoteContext (NoQuote)
|
||||
)
|
||||
import Text.Pandoc.Readers.LaTeX.Types (Macro)
|
||||
|
||||
-- | HTML parser type
|
||||
type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m)
|
||||
|
||||
-- | HTML parser, expecting @Tag Text@ as tokens.
|
||||
type TagParser m = HTMLParser m [Tag Text]
|
||||
|
||||
-- | Global HTML parser state
|
||||
data HTMLState = HTMLState
|
||||
{ parserState :: ParserState
|
||||
, noteTable :: [(Text, Blocks)]
|
||||
, baseHref :: Maybe URI
|
||||
, identifiers :: Set Text
|
||||
, logMessages :: [LogMessage]
|
||||
, macros :: Map Text Macro
|
||||
, readerOpts :: ReaderOptions
|
||||
}
|
||||
|
||||
-- | Local HTML parser state
|
||||
data HTMLLocal = HTMLLocal
|
||||
{ quoteContext :: QuoteContext
|
||||
, inChapter :: Bool -- ^ Set if in chapter section
|
||||
, inPlain :: Bool -- ^ Set if in pPlain
|
||||
}
|
||||
|
||||
|
||||
-- Instances
|
||||
|
||||
instance HasMacros HTMLState where
|
||||
extractMacros = macros
|
||||
updateMacros f st = st{ macros = f $ macros st }
|
||||
|
||||
instance HasIdentifierList HTMLState where
|
||||
extractIdentifierList = identifiers
|
||||
updateIdentifierList f s = s{ identifiers = f (identifiers s) }
|
||||
|
||||
instance HasLogMessages HTMLState where
|
||||
addLogMessage m s = s{ logMessages = m : logMessages s }
|
||||
getLogMessages = reverse . logMessages
|
||||
|
||||
-- This signature should be more general
|
||||
-- MonadReader HTMLLocal m => HasQuoteContext st m
|
||||
instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where
|
||||
getQuoteContext = asks quoteContext
|
||||
withQuoteContext q = local (\s -> s{quoteContext = q})
|
||||
|
||||
instance HasReaderOptions HTMLState where
|
||||
extractReaderOptions = extractReaderOptions . parserState
|
||||
|
||||
instance HasMeta HTMLState where
|
||||
setMeta s b st = st {parserState = setMeta s b $ parserState st}
|
||||
deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
|
||||
|
||||
instance Default HTMLLocal where
|
||||
def = HTMLLocal NoQuote False False
|
||||
|
||||
instance HasLastStrPosition HTMLState where
|
||||
setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
|
||||
getLastStrPos = getLastStrPos . parserState
|
Loading…
Add table
Reference in a new issue