From f9258371dd20e0a9569c04923188a91f6c2e489e Mon Sep 17 00:00:00 2001
From: Albert Krewinkel <albert@zeitkraut.de>
Date: Sun, 22 Nov 2020 22:30:47 +0100
Subject: [PATCH] HTML reader: extract submodules

Reducing module size should reduce memory use during compilation.

This is preparatory work to tackle support for more table features.
---
 pandoc.cabal                                  |   3 +
 src/Text/Pandoc/Readers/HTML.hs               | 250 +-----------------
 src/Text/Pandoc/Readers/HTML/Parsing.hs       | 156 +++++++++++
 src/Text/Pandoc/Readers/HTML/TagCategories.hs |  78 ++++++
 src/Text/Pandoc/Readers/HTML/Types.hs         |  97 +++++++
 5 files changed, 345 insertions(+), 239 deletions(-)
 create mode 100644 src/Text/Pandoc/Readers/HTML/Parsing.hs
 create mode 100644 src/Text/Pandoc/Readers/HTML/TagCategories.hs
 create mode 100644 src/Text/Pandoc/Readers/HTML/Types.hs

diff --git a/pandoc.cabal b/pandoc.cabal
index a065d7459..c3ef00dcd 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -599,6 +599,9 @@ library
                    Text.Pandoc.Readers.Docx.Parse.Styles,
                    Text.Pandoc.Readers.Docx.Util,
                    Text.Pandoc.Readers.Docx.Fields,
+                   Text.Pandoc.Readers.HTML.Parsing,
+                   Text.Pandoc.Readers.HTML.TagCategories,
+                   Text.Pandoc.Readers.HTML.Types,
                    Text.Pandoc.Readers.LaTeX.Parsing,
                    Text.Pandoc.Readers.LaTeX.Lang,
                    Text.Pandoc.Readers.LaTeX.SIunitx,
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 7eab27cef..9e84bedab 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1,7 +1,6 @@
-{-# LANGUAGE LambdaCase            #-}
-{-# LANGUAGE FlexibleContexts      #-}
+{-# LANGUAGE FlexibleContexts     #-}
 {-# LANGUAGE FlexibleInstances     #-}
-{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE LambdaCase            #-}
 {-# LANGUAGE OverloadedStrings     #-}
 {-# LANGUAGE ViewPatterns          #-}
 {- |
@@ -29,7 +28,8 @@ import Control.Applicative ((<|>))
 import Control.Arrow (first)
 import Control.Monad (guard, mplus, msum, mzero, unless, void)
 import Control.Monad.Except (throwError)
-import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT)
+import Control.Monad.Reader (ask, asks, lift, local, runReaderT)
+import Data.ByteString.Base64 (encode)
 import Data.Char (isAlphaNum, isLetter)
 import Data.Default (Default (..), def)
 import Data.Foldable (for_)
@@ -40,17 +40,19 @@ import Data.Monoid (First (..))
 import qualified Data.Set as Set
 import Data.Text (Text)
 import qualified Data.Text as T
-import Network.URI (URI, nonStrictRelativeTo, parseURIReference)
+import Network.URI (nonStrictRelativeTo, parseURIReference)
 import Text.HTML.TagSoup
 import Text.HTML.TagSoup.Match
-import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines)
+import Text.Pandoc.Builder (Blocks, Inlines, trimInlines)
 import qualified Text.Pandoc.Builder as B
 import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
 import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
 import qualified Text.Pandoc.UTF8 as UTF8
 import Text.Pandoc.Definition
+import Text.Pandoc.Readers.HTML.Parsing
+import Text.Pandoc.Readers.HTML.TagCategories
+import Text.Pandoc.Readers.HTML.Types
 import Text.Pandoc.Readers.LaTeX (rawLaTeXInline)
-import Text.Pandoc.Readers.LaTeX.Types (Macro)
 import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
 import Text.Pandoc.Error
 import Text.Pandoc.Logging
@@ -66,7 +68,6 @@ import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
 import Text.Pandoc.Walk
 import Text.Parsec.Error
 import Text.TeXMath (readMathML, writeTeX)
-import Data.ByteString.Base64 (encode)
 
 -- | Convert HTML-formatted string to 'Pandoc' document.
 readHtml :: PandocMonad m
@@ -105,32 +106,12 @@ replaceNotes' noteTbl (RawInline (Format "noteref") ref) =
   maybe (Str "") (Note . B.toList) $ lookup ref noteTbl
 replaceNotes' _ x = x
 
-data HTMLState =
-  HTMLState
-  {  parserState :: ParserState,
-     noteTable   :: [(Text, Blocks)],
-     baseHref    :: Maybe URI,
-     identifiers :: Set.Set Text,
-     logMessages :: [LogMessage],
-     macros      :: M.Map Text Macro,
-     readerOpts  :: ReaderOptions
-  }
-
-data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
-                           , inChapter    :: Bool -- ^ Set if in chapter section
-                           , inPlain      :: Bool -- ^ Set if in pPlain
-                           }
-
 setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
 setInChapter = local (\s -> s {inChapter = True})
 
 setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
 setInPlain = local (\s -> s {inPlain = True})
 
-type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m)
-
-type TagParser m = HTMLParser m [Tag Text]
-
 pHtml :: PandocMonad m => TagParser m Blocks
 pHtml = try $ do
   (TagOpen "html" attr) <- lookAhead pAny
@@ -681,22 +662,6 @@ inline = choice
            , pRawHtmlInline
            ]
 
-pLocation :: PandocMonad m => TagParser m ()
-pLocation = do
-  (TagPosition r c) <- pSat isTagPosition
-  setPosition $ newPos "input" r c
-
-pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
-pSat f = do
-  pos <- getPosition
-  token tshow (const pos) (\x -> if f x then Just x else Nothing)
-
-pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
-pSatisfy f = try $ optional pLocation >> pSat f
-
-pAny :: PandocMonad m => TagParser m (Tag Text)
-pAny = pSatisfy (const True)
-
 pSelfClosing :: PandocMonad m
              => (Text -> Bool) -> ([Attribute Text] -> Bool)
              -> TagParser m (Tag Text)
@@ -924,49 +889,6 @@ pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines)
                -> TagParser m Inlines
 pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
 
-pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a
-pInTags tagtype parser = pInTags' tagtype (const True) parser
-
-pInTags' :: (PandocMonad m, Monoid a)
-         => Text
-         -> (Tag Text -> Bool)
-         -> TagParser m a
-         -> TagParser m a
-pInTags' tagtype tagtest parser = try $ do
-  pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
-  mconcat <$> manyTill parser (pCloses tagtype <|> eof)
-
--- parses p, preceded by an opening tag (optional if tagsOptional)
--- and followed by a closing tag (optional if tagsOptional)
-pInTag :: PandocMonad m => Bool -> Text -> TagParser m a -> TagParser m a
-pInTag tagsOptional tagtype p = try $ do
-  skipMany pBlank
-  (if tagsOptional then optional else void) $ pSatisfy (matchTagOpen tagtype [])
-  skipMany pBlank
-  x <- p
-  skipMany pBlank
-  (if tagsOptional then optional else void) $ pSatisfy (matchTagClose tagtype)
-  skipMany pBlank
-  return x
-
-pCloses :: PandocMonad m => Text -> TagParser m ()
-pCloses tagtype = try $ do
-  t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
-  case t of
-       (TagClose t') | t' == tagtype -> void pAny
-       (TagOpen t' _) | t' `closes` tagtype -> return ()
-       (TagClose "ul") | tagtype == "li" -> return ()
-       (TagClose "ol") | tagtype == "li" -> return ()
-       (TagClose "dl") | tagtype == "dd" -> return ()
-       (TagClose "table") | tagtype == "td" -> return ()
-       (TagClose "table") | tagtype == "th" -> return ()
-       (TagClose "table") | tagtype == "tr" -> return ()
-       (TagClose "td") | tagtype `Set.member` blockHtmlTags -> return ()
-       (TagClose "th") | tagtype `Set.member` blockHtmlTags -> return ()
-       (TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags
-                                            -> return () -- see #3794
-       _ -> mzero
-
 pTagText :: PandocMonad m => TagParser m Inlines
 pTagText = try $ do
   (TagText str) <- pSatisfy isTagText
@@ -975,14 +897,10 @@ pTagText = try $ do
   parsed <- lift $ lift $
             flip runReaderT qu $ runParserT (many pTagContents) st "text" str
   case parsed of
-       Left _        -> throwError $ PandocParseError $ "Could not parse `" <> str <> "'"
+       Left _        -> throwError $ PandocParseError $
+                        "Could not parse `" <> str <> "'"
        Right result  -> return $ mconcat result
 
-pBlank :: PandocMonad m => TagParser m ()
-pBlank = try $ do
-  (TagText str) <- pSatisfy isTagText
-  guard $ T.all isSpace str
-
 type InlinesParser m = HTMLParser m Text
 
 pTagContents :: PandocMonad m => InlinesParser m Inlines
@@ -1077,54 +995,6 @@ pSpace = many1 (satisfy isSpace) >>= \xs ->
                then return B.softbreak
                else return B.space
 
---
--- Constants
---
-
-eitherBlockOrInline :: Set.Set Text
-eitherBlockOrInline = Set.fromList
-  ["audio", "applet", "button", "iframe", "embed",
-   "del", "ins", "progress", "map", "area", "noscript", "script",
-   "object", "svg", "video", "source"]
-
-blockHtmlTags :: Set.Set Text
-blockHtmlTags = Set.fromList
-   ["?xml", "!DOCTYPE", "address", "article", "aside",
-    "blockquote", "body", "canvas",
-    "caption", "center", "col", "colgroup", "dd", "details",
-    "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure",
-    "footer", "form", "h1", "h2", "h3", "h4",
-    "h5", "h6", "head", "header", "hgroup", "hr", "html",
-    "isindex", "main", "menu", "meta", "noframes", "nav",
-    "ol", "output", "p", "pre",
-    "section", "summary", "table", "tbody", "textarea",
-    "thead", "tfoot", "ul", "dd",
-    "dt", "frameset", "li", "tbody", "td", "tfoot",
-    "th", "thead", "tr", "script", "style"]
-
--- We want to allow raw docbook in markdown documents, so we
--- include docbook block tags here too.
-blockDocBookTags :: Set.Set Text
-blockDocBookTags = Set.fromList
-   ["calloutlist", "bibliolist", "glosslist", "itemizedlist",
-    "orderedlist", "segmentedlist", "simplelist",
-    "variablelist", "caution", "important", "note", "tip",
-    "warning", "address", "literallayout", "programlisting",
-    "programlistingco", "screen", "screenco", "screenshot",
-    "synopsis", "example", "informalexample", "figure",
-    "informalfigure", "table", "informaltable", "para",
-    "simpara", "formalpara", "equation", "informalequation",
-    "figure", "screenshot", "mediaobject", "qandaset",
-    "procedure", "task", "cmdsynopsis", "funcsynopsis",
-    "classsynopsis", "blockquote", "epigraph", "msgset",
-    "sidebar", "title"]
-
-epubTags :: Set.Set Text
-epubTags = Set.fromList ["case", "switch", "default"]
-
-blockTags :: Set.Set Text
-blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags]
-
 class NamedTag a where
   getTagName :: a -> Maybe Text
 
@@ -1162,47 +1032,6 @@ isTextTag = tagText (const True)
 isCommentTag :: Tag a -> Bool
 isCommentTag = tagComment (const True)
 
--- taken from HXT and extended
--- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
-closes :: Text -> Text -> Bool
-_ `closes` "body" = False
-_ `closes` "html" = False
-"body" `closes` "head" = True
-"a" `closes` "a" = True
-"li" `closes` "li" = True
-"th" `closes` t | t `elem` ["th","td"] = True
-"td" `closes` t | t `elem` ["th","td"] = True
-"tr" `closes` t | t `elem` ["th","td","tr"] = True
-"dd" `closes` t | t `elem` ["dt", "dd"] = True
-"dt" `closes` t | t `elem` ["dt","dd"] = True
-"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
-"optgroup" `closes` "optgroup" = True
-"optgroup" `closes` "option" = True
-"option" `closes` "option" = True
--- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags
-x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
-   "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
-   "h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section",
-   "table", "ul"] = True
-_ `closes` "meta" = True
-"form" `closes` "form" = True
-"label" `closes` "label" = True
-"map" `closes` "map" = True
-"object" `closes` "object" = True
-_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
-t `closes` "select" | t /= "option" = True
-"thead" `closes` "colgroup" = True
-"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
-"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
-t `closes` t2 |
-   t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] &&
-   t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main"
-t1 `closes` t2 |
-   t1 `Set.member` blockTags &&
-   t2 `Set.notMember` blockTags &&
-   t2 `Set.notMember` eitherBlockOrInline = True
-_ `closes` _ = False
-
 --- parsers for use in markdown, textile readers
 
 -- | Matches a stretch of HTML in balanced tags.
@@ -1347,13 +1176,6 @@ stripPrefix' s =
   if T.null t then s else T.drop 1 t
   where (_, t) = T.span (/= ':') s
 
-isSpace :: Char -> Bool
-isSpace ' '  = True
-isSpace '\t' = True
-isSpace '\n' = True
-isSpace '\r' = True
-isSpace _    = False
-
 -- Utilities
 
 -- | Adjusts a url according to the document's base URL.
@@ -1364,41 +1186,6 @@ canonicalizeUrl url = do
                 (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
                 _                   -> url
 
-
--- Instances
-
-instance HasMacros HTMLState where
-  extractMacros        = macros
-  updateMacros f st    = st{ macros = f $ macros st }
-
-instance HasIdentifierList HTMLState where
-  extractIdentifierList = identifiers
-  updateIdentifierList f s = s{ identifiers = f (identifiers s) }
-
-instance HasLogMessages HTMLState where
-  addLogMessage m s = s{ logMessages = m : logMessages s }
-  getLogMessages = reverse . logMessages
-
--- This signature should be more general
--- MonadReader HTMLLocal m => HasQuoteContext st m
-instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where
-  getQuoteContext = asks quoteContext
-  withQuoteContext q = local (\s -> s{quoteContext = q})
-
-instance HasReaderOptions HTMLState where
-    extractReaderOptions = extractReaderOptions . parserState
-
-instance HasMeta HTMLState where
-  setMeta s b st = st {parserState = setMeta s b $ parserState st}
-  deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
-
-instance Default HTMLLocal where
-  def = HTMLLocal NoQuote False False
-
-instance HasLastStrPosition HTMLState where
-  setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
-  getLastStrPos = getLastStrPos . parserState
-
 -- For now we need a special version here; the one in Shared has String type
 renderTags' :: [Tag Text] -> Text
 renderTags' = renderTagsOptions
@@ -1411,21 +1198,6 @@ renderTags' = renderTagsOptions
 -- EPUB Specific
 --
 --
-sectioningContent :: [Text]
-sectioningContent = ["article", "aside", "nav", "section"]
-
-
-groupingContent :: [Text]
-groupingContent = ["p", "hr", "pre", "blockquote", "ol"
-                  , "ul", "li", "dl", "dt", "dt", "dd"
-                  , "figure", "figcaption", "div", "main"]
-
-matchTagClose :: Text -> (Tag Text -> Bool)
-matchTagClose t = (~== TagClose t)
-
-matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool)
-matchTagOpen t as = (~== TagOpen t as)
-
 {-
 
 types :: [(String, ([String], Int))]
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
new file mode 100644
index 000000000..7fda066b5
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -0,0 +1,156 @@
+{-# LANGUAGE OverloadedStrings     #-}
+{- |
+   Module      : Text.Pandoc.Readers.HTML.Parsing
+   Copyright   : Copyright (C) 2006-2020 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+
+Parsing functions and utilities.
+-}
+module Text.Pandoc.Readers.HTML.Parsing
+  ( pInTags
+  , pInTags'
+  , pInTag
+  , pAny
+  , pCloses
+  , pSatisfy
+  , pBlank
+  , matchTagClose
+  , matchTagOpen
+  , isSpace
+  )
+where
+
+import Control.Monad (guard, void, mzero)
+import Data.Text (Text)
+import Text.HTML.TagSoup
+import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
+import Text.Pandoc.Parsing
+  ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
+  , skipMany, setPosition, token, try)
+import Text.Pandoc.Readers.HTML.TagCategories
+import Text.Pandoc.Readers.HTML.Types
+import Text.Pandoc.Shared (tshow)
+import qualified Data.Set as Set
+import qualified Data.Text as T
+
+pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a
+pInTags tagtype parser = pInTags' tagtype (const True) parser
+
+pInTags' :: (PandocMonad m, Monoid a)
+         => Text
+         -> (Tag Text -> Bool)
+         -> TagParser m a
+         -> TagParser m a
+pInTags' tagtype tagtest parser = try $ do
+  pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
+  mconcat <$> manyTill parser (pCloses tagtype <|> eof)
+
+-- parses p, preceded by an opening tag (optional if tagsOptional)
+-- and followed by a closing tag (optional if tagsOptional)
+pInTag :: PandocMonad m => Bool -> Text -> TagParser m a -> TagParser m a
+pInTag tagsOptional tagtype p = try $ do
+  skipMany pBlank
+  (if tagsOptional then optional else void) $ pSatisfy (matchTagOpen tagtype [])
+  skipMany pBlank
+  x <- p
+  skipMany pBlank
+  (if tagsOptional then optional else void) $ pSatisfy (matchTagClose tagtype)
+  skipMany pBlank
+  return x
+
+pCloses :: PandocMonad m => Text -> TagParser m ()
+pCloses tagtype = try $ do
+  t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
+  case t of
+       (TagClose t') | t' == tagtype -> void pAny
+       (TagOpen t' _) | t' `closes` tagtype -> return ()
+       (TagClose "ul") | tagtype == "li" -> return ()
+       (TagClose "ol") | tagtype == "li" -> return ()
+       (TagClose "dl") | tagtype == "dd" -> return ()
+       (TagClose "table") | tagtype == "td" -> return ()
+       (TagClose "table") | tagtype == "th" -> return ()
+       (TagClose "table") | tagtype == "tr" -> return ()
+       (TagClose "td") | tagtype `Set.member` blockHtmlTags -> return ()
+       (TagClose "th") | tagtype `Set.member` blockHtmlTags -> return ()
+       (TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags
+                                            -> return () -- see #3794
+       _ -> mzero
+
+pBlank :: PandocMonad m => TagParser m ()
+pBlank = try $ do
+  (TagText str) <- pSatisfy isTagText
+  guard $ T.all isSpace str
+
+pLocation :: PandocMonad m => TagParser m ()
+pLocation = do
+  (TagPosition r c) <- pSat isTagPosition
+  setPosition $ newPos "input" r c
+
+pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
+pSat f = do
+  pos <- getPosition
+  token tshow (const pos) (\x -> if f x then Just x else Nothing)
+
+pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
+pSatisfy f = try $ optional pLocation >> pSat f
+
+matchTagClose :: Text -> (Tag Text -> Bool)
+matchTagClose t = (~== TagClose t)
+
+matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool)
+matchTagOpen t as = (~== TagOpen t as)
+
+pAny :: PandocMonad m => TagParser m (Tag Text)
+pAny = pSatisfy (const True)
+
+isSpace :: Char -> Bool
+isSpace ' '  = True
+isSpace '\t' = True
+isSpace '\n' = True
+isSpace '\r' = True
+isSpace _    = False
+
+-- taken from HXT and extended
+-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
+closes :: Text -> Text -> Bool
+_ `closes` "body" = False
+_ `closes` "html" = False
+"body" `closes` "head" = True
+"a" `closes` "a" = True
+"li" `closes` "li" = True
+"th" `closes` t | t `elem` ["th","td"] = True
+"td" `closes` t | t `elem` ["th","td"] = True
+"tr" `closes` t | t `elem` ["th","td","tr"] = True
+"dd" `closes` t | t `elem` ["dt", "dd"] = True
+"dt" `closes` t | t `elem` ["dt","dd"] = True
+"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
+"optgroup" `closes` "optgroup" = True
+"optgroup" `closes` "option" = True
+"option" `closes` "option" = True
+-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags
+x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
+   "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
+   "h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section",
+   "table", "ul"] = True
+_ `closes` "meta" = True
+"form" `closes` "form" = True
+"label" `closes` "label" = True
+"map" `closes` "map" = True
+"object" `closes` "object" = True
+_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
+t `closes` "select" | t /= "option" = True
+"thead" `closes` "colgroup" = True
+"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
+"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
+t `closes` t2 |
+   t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] &&
+   t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main"
+t1 `closes` t2 |
+   t1 `Set.member` blockTags &&
+   t2 `Set.notMember` blockTags &&
+   t2 `Set.notMember` eitherBlockOrInline = True
+_ `closes` _ = False
diff --git a/src/Text/Pandoc/Readers/HTML/TagCategories.hs b/src/Text/Pandoc/Readers/HTML/TagCategories.hs
new file mode 100644
index 000000000..4f82a1831
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML/TagCategories.hs
@@ -0,0 +1,78 @@
+{-# LANGUAGE OverloadedStrings     #-}
+{- |
+   Module      : Text.Pandoc.Readers.HTML.TagCategories
+   Copyright   : Copyright (C) 2006-2020 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+
+Categories of tags.
+-}
+module Text.Pandoc.Readers.HTML.TagCategories
+  ( blockHtmlTags
+  , blockDocBookTags
+  , eitherBlockOrInline
+  , epubTags
+  , blockTags
+  , sectioningContent
+  , groupingContent
+  )
+where
+
+import Data.Set (Set, fromList, unions)
+import Data.Text (Text)
+
+eitherBlockOrInline :: Set Text
+eitherBlockOrInline = fromList
+  ["audio", "applet", "button", "iframe", "embed",
+   "del", "ins", "progress", "map", "area", "noscript", "script",
+   "object", "svg", "video", "source"]
+
+blockHtmlTags :: Set Text
+blockHtmlTags = fromList
+   ["?xml", "!DOCTYPE", "address", "article", "aside",
+    "blockquote", "body", "canvas",
+    "caption", "center", "col", "colgroup", "dd", "details",
+    "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure",
+    "footer", "form", "h1", "h2", "h3", "h4",
+    "h5", "h6", "head", "header", "hgroup", "hr", "html",
+    "isindex", "main", "menu", "meta", "noframes", "nav",
+    "ol", "output", "p", "pre",
+    "section", "summary", "table", "tbody", "textarea",
+    "thead", "tfoot", "ul", "dd",
+    "dt", "frameset", "li", "tbody", "td", "tfoot",
+    "th", "thead", "tr", "script", "style"]
+
+-- We want to allow raw docbook in markdown documents, so we
+-- include docbook block tags here too.
+blockDocBookTags :: Set Text
+blockDocBookTags = fromList
+   ["calloutlist", "bibliolist", "glosslist", "itemizedlist",
+    "orderedlist", "segmentedlist", "simplelist",
+    "variablelist", "caution", "important", "note", "tip",
+    "warning", "address", "literallayout", "programlisting",
+    "programlistingco", "screen", "screenco", "screenshot",
+    "synopsis", "example", "informalexample", "figure",
+    "informalfigure", "table", "informaltable", "para",
+    "simpara", "formalpara", "equation", "informalequation",
+    "figure", "screenshot", "mediaobject", "qandaset",
+    "procedure", "task", "cmdsynopsis", "funcsynopsis",
+    "classsynopsis", "blockquote", "epigraph", "msgset",
+    "sidebar", "title"]
+
+epubTags :: Set Text
+epubTags = fromList ["case", "switch", "default"]
+
+blockTags :: Set Text
+blockTags = unions [blockHtmlTags, blockDocBookTags, epubTags]
+
+sectioningContent :: [Text]
+sectioningContent = ["article", "aside", "nav", "section"]
+
+
+groupingContent :: [Text]
+groupingContent = ["p", "hr", "pre", "blockquote", "ol"
+                  , "ul", "li", "dl", "dt", "dt", "dd"
+                  , "figure", "figcaption", "div", "main"]
diff --git a/src/Text/Pandoc/Readers/HTML/Types.hs b/src/Text/Pandoc/Readers/HTML/Types.hs
new file mode 100644
index 000000000..a94eeb828
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML/Types.hs
@@ -0,0 +1,97 @@
+{-# LANGUAGE FlexibleInstances     #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{- |
+   Module      : Text.Pandoc.Readers.HTML.Types
+   Copyright   : Copyright (C) 2006-2020 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+
+Types for pandoc's HTML reader.
+-}
+module Text.Pandoc.Readers.HTML.Types
+  ( TagParser
+  , HTMLParser
+  , HTMLState (..)
+  , HTMLLocal (..)
+  )
+where
+
+import Control.Monad.Reader (ReaderT, asks, local)
+import Data.Default (Default (def))
+import Data.Map (Map)
+import Data.Set (Set)
+import Data.Text (Text)
+import Network.URI (URI)
+import Text.HTML.TagSoup
+import Text.Pandoc.Builder (Blocks, HasMeta (..))
+import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
+import Text.Pandoc.Logging (LogMessage)
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Parsing
+  ( HasIdentifierList (..), HasLastStrPosition (..), HasLogMessages (..)
+  , HasMacros (..), HasQuoteContext (..), HasReaderOptions (..)
+  , ParserT, ParserState, QuoteContext (NoQuote)
+  )
+import Text.Pandoc.Readers.LaTeX.Types (Macro)
+
+-- | HTML parser type
+type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m)
+
+-- | HTML parser, expecting @Tag Text@ as tokens.
+type TagParser m = HTMLParser m [Tag Text]
+
+-- | Global HTML parser state
+data HTMLState = HTMLState
+  { parserState :: ParserState
+  , noteTable   :: [(Text, Blocks)]
+  , baseHref    :: Maybe URI
+  , identifiers :: Set Text
+  , logMessages :: [LogMessage]
+  , macros      :: Map Text Macro
+  , readerOpts  :: ReaderOptions
+  }
+
+-- | Local HTML parser state
+data HTMLLocal = HTMLLocal
+  { quoteContext :: QuoteContext
+  , inChapter    :: Bool -- ^ Set if in chapter section
+  , inPlain      :: Bool -- ^ Set if in pPlain
+  }
+
+
+-- Instances
+
+instance HasMacros HTMLState where
+  extractMacros        = macros
+  updateMacros f st    = st{ macros = f $ macros st }
+
+instance HasIdentifierList HTMLState where
+  extractIdentifierList = identifiers
+  updateIdentifierList f s = s{ identifiers = f (identifiers s) }
+
+instance HasLogMessages HTMLState where
+  addLogMessage m s = s{ logMessages = m : logMessages s }
+  getLogMessages = reverse . logMessages
+
+-- This signature should be more general
+-- MonadReader HTMLLocal m => HasQuoteContext st m
+instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where
+  getQuoteContext = asks quoteContext
+  withQuoteContext q = local (\s -> s{quoteContext = q})
+
+instance HasReaderOptions HTMLState where
+    extractReaderOptions = extractReaderOptions . parserState
+
+instance HasMeta HTMLState where
+  setMeta s b st = st {parserState = setMeta s b $ parserState st}
+  deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
+
+instance Default HTMLLocal where
+  def = HTMLLocal NoQuote False False
+
+instance HasLastStrPosition HTMLState where
+  setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
+  getLastStrPos = getLastStrPos . parserState