pandoc/src/Text/Pandoc/Readers/HTML.hs
hftf 7f8a3c6cb7 Consistent underline for Readers ()
* Added underlineSpan builder function.  This can be easily updated if needed. The purpose is for Readers to transform underlines consistently.

* Docx Reader: Use underlineSpan and update test

* Org Reader: Use underlineSpan and add test

* Textile Reader: Use underlineSpan and add test case

* Txt2Tags Reader: Use underlineSpan and update test

* HTML Reader: Use underlineSpan and add test case
2017-10-27 18:45:00 -04:00

1286 lines
48 KiB
Haskell

{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses,
ViewPatterns, OverloadedStrings #-}
{-
Copyright (C) 2006-2017 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.HTML
Copyright : Copyright (C) 2006-2017 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Conversion of HTML to 'Pandoc' document.
-}
module Text.Pandoc.Readers.HTML ( readHtml
, htmlTag
, htmlInBalanced
, isInlineTag
, isBlockTag
, NamedTag(..)
, isTextTag
, isCommentTag
) where
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
import Text.Pandoc.Shared ( extractSpaces, addMetaField
, escapeURI, safeRead, crFilter, underlineSpan )
import Text.Pandoc.Options (
ReaderOptions(readerExtensions,readerStripComments), extensionEnabled,
Extension (Ext_epub_html_exts,
Ext_raw_html, Ext_native_divs, Ext_native_spans))
import Text.Pandoc.Logging
import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Walk
import qualified Data.Map as M
import Data.Foldable ( for_ )
import Data.Maybe ( fromMaybe, isJust, isNothing )
import Data.List.Split ( wordsBy )
import Data.List ( intercalate, isPrefixOf )
import Data.Char ( isDigit, isLetter, isAlphaNum )
import Control.Monad ( guard, mzero, void, unless, mplus, msum )
import Control.Arrow ((***))
import Control.Applicative ( (<|>) )
import Data.Monoid (First (..))
import Data.Text (Text)
import qualified Data.Text as T
import Text.TeXMath (readMathML, writeTeX)
import Data.Default (Default (..), def)
import Control.Monad.Reader (ask, asks, local, ReaderT, runReaderT, lift)
import Network.URI (URI, parseURIReference, nonStrictRelativeTo)
import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
import Data.Monoid ((<>))
import Text.Parsec.Error
import qualified Data.Set as Set
import Text.Pandoc.Error
import Text.Pandoc.Class (PandocMonad(..))
import Control.Monad.Except (throwError)
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: PandocMonad m
=> ReaderOptions -- ^ Reader options
-> Text -- ^ String to parse (assumes @'\n'@ line endings)
-> m Pandoc
readHtml opts inp = do
let tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True }
(crFilter inp)
parseDoc = do
blocks <- (fixPlains False) . mconcat <$> manyTill block eof
meta <- stateMeta . parserState <$> getState
bs' <- replaceNotes (B.toList blocks)
reportLogMessages
return $ Pandoc meta bs'
getError (errorMessages -> ms) = case ms of
[] -> ""
(m:_) -> messageString m
result <- flip runReaderT def $
runParserT parseDoc
(HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty [])
"source" tags
case result of
Right doc -> return doc
Left err -> throwError $ PandocParseError $ getError err
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
replaceNotes = walkM replaceNotes'
replaceNotes' :: PandocMonad m => Inline -> TagParser m Inline
replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes
where
getNotes = noteTable <$> getState
replaceNotes' x = return x
data HTMLState =
HTMLState
{ parserState :: ParserState,
noteTable :: [(String, Blocks)],
baseHref :: Maybe URI,
identifiers :: Set.Set String,
headerMap :: M.Map Inlines String,
logMessages :: [LogMessage]
}
data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
, inChapter :: Bool -- ^ Set if in chapter section
, inPlain :: Bool -- ^ Set if in pPlain
}
setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
setInChapter = local (\s -> s {inChapter = True})
setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
setInPlain = local (\s -> s {inPlain = True})
type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m)
type TagParser m = HTMLParser m [Tag Text]
pHtml :: PandocMonad m => TagParser m Blocks
pHtml = try $ do
(TagOpen "html" attr) <- lookAhead $ pAnyTag
for_ (lookup "lang" attr) $
updateState . B.setMeta "lang" . B.text . T.unpack
pInTags "html" block
pBody :: PandocMonad m => TagParser m Blocks
pBody = pInTags "body" block
pHead :: PandocMonad m => TagParser m Blocks
pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag)
where pTitle = pInTags "title" inline >>= setTitle . trimInlines
setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
pMetaTag = do
mt <- pSatisfy (matchTagOpen "meta" [])
let name = T.unpack $ fromAttrib "name" mt
if null name
then return mempty
else do
let content = T.unpack $ fromAttrib "content" mt
updateState $ \s ->
let ps = parserState s in
s{ parserState = ps{
stateMeta = addMetaField name (B.text content)
(stateMeta ps) } }
return mempty
pBaseTag = do
bt <- pSatisfy (matchTagOpen "base" [])
updateState $ \st -> st{ baseHref =
parseURIReference $ T.unpack $ fromAttrib "href" bt }
return mempty
block :: PandocMonad m => TagParser m Blocks
block = do
res <- choice
[ eSection
, eSwitch B.para block
, mempty <$ eFootnote
, mempty <$ eTOC
, mempty <$ eTitlePage
, pPara
, pHeader
, pBlockQuote
, pCodeBlock
, pList
, pHrule
, pTable
, pHtml
, pHead
, pBody
, pDiv
, pPlain
, pFigure
, pRawHtmlBlock
]
trace (take 60 $ show $ B.toList res)
return res
namespaces :: PandocMonad m => [(String, TagParser m Inlines)]
namespaces = [(mathMLNamespace, pMath True)]
mathMLNamespace :: String
mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
eSwitch :: (PandocMonad m, Monoid a)
=> (Inlines -> a)
-> TagParser m a
-> TagParser m a
eSwitch constructor parser = try $ do
guardEnabled Ext_epub_html_exts
pSatisfy (matchTagOpen "switch" [])
cases <- getFirst . mconcat <$>
manyTill (First <$> (eCase <* skipMany pBlank) )
(lookAhead $ try $ pSatisfy (matchTagOpen "default" []))
skipMany pBlank
fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank)
skipMany pBlank
pSatisfy (matchTagClose "switch")
return $ maybe fallback constructor cases
eCase :: PandocMonad m => TagParser m (Maybe Inlines)
eCase = do
skipMany pBlank
TagOpen _ attr' <- lookAhead $ pSatisfy $ (matchTagOpen "case" [])
let attr = toStringAttr attr'
case (flip lookup namespaces) =<< lookup "required-namespace" attr of
Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank))
Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (matchTagClose "case"))
eFootnote :: PandocMonad m => TagParser m ()
eFootnote = try $ do
let notes = ["footnote", "rearnote"]
guardEnabled Ext_epub_html_exts
(TagOpen tag attr') <- lookAhead $ pAnyTag
let attr = toStringAttr attr'
guard (maybe False (flip elem notes) (lookup "type" attr))
let ident = fromMaybe "" (lookup "id" attr)
content <- pInTags tag block
addNote ident content
addNote :: PandocMonad m => String -> Blocks -> TagParser m ()
addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)})
eNoteref :: PandocMonad m => TagParser m Inlines
eNoteref = try $ do
guardEnabled Ext_epub_html_exts
TagOpen tag attr' <- lookAhead $ pAnyTag
let attr = toStringAttr attr'
guard (maybe False (== "noteref") (lookup "type" attr))
let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr)
guard (not (null ident))
pInTags tag block
return $ B.rawInline "noteref" ident
-- Strip TOC if there is one, better to generate again
eTOC :: PandocMonad m => TagParser m ()
eTOC = try $ do
guardEnabled Ext_epub_html_exts
(TagOpen tag attr) <- lookAhead $ pAnyTag
guard (maybe False (== "toc") (lookup "type" attr))
void (pInTags tag block)
pList :: PandocMonad m => TagParser m Blocks
pList = pBulletList <|> pOrderedList <|> pDefinitionList
pBulletList :: PandocMonad m => TagParser m Blocks
pBulletList = try $ do
pSatisfy (matchTagOpen "ul" [])
let nonItem = pSatisfy (\t ->
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
not (matchTagClose "ul" t))
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
items <- manyTill (pListItem nonItem) (pCloses "ul")
return $ B.bulletList $ map (fixPlains True) items
pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks
pListItem nonItem = do
TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "li" [])
let attr = toStringAttr attr'
let addId ident bs = case B.toList bs of
(Plain ils:xs) -> B.fromList (Plain
[Span (ident, [], []) ils] : xs)
_ -> B.divWith (ident, [], []) bs
(maybe id addId (lookup "id" attr)) <$>
pInTags "li" block <* skipMany nonItem
parseListStyleType :: String -> ListNumberStyle
parseListStyleType "lower-roman" = LowerRoman
parseListStyleType "upper-roman" = UpperRoman
parseListStyleType "lower-alpha" = LowerAlpha
parseListStyleType "upper-alpha" = UpperAlpha
parseListStyleType "decimal" = Decimal
parseListStyleType _ = DefaultStyle
parseTypeAttr :: String -> ListNumberStyle
parseTypeAttr "i" = LowerRoman
parseTypeAttr "I" = UpperRoman
parseTypeAttr "a" = LowerAlpha
parseTypeAttr "A" = UpperAlpha
parseTypeAttr "1" = Decimal
parseTypeAttr _ = DefaultStyle
pOrderedList :: PandocMonad m => TagParser m Blocks
pOrderedList = try $ do
TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" [])
let attribs = toStringAttr attribs'
let (start, style) = (sta', sty')
where sta = fromMaybe "1" $
lookup "start" attribs
sta' = if all isDigit sta
then read sta
else 1
pickListStyle = pickStyleAttrProps ["list-style-type", "list-style"]
typeAttr = fromMaybe "" $ lookup "type" attribs
classAttr = fromMaybe "" $ lookup "class" attribs
styleAttr = fromMaybe "" $ lookup "style" attribs
listStyle = fromMaybe "" $ pickListStyle styleAttr
sty' = foldOrElse DefaultStyle
[ parseTypeAttr typeAttr
, parseListStyleType classAttr
, parseListStyleType listStyle
]
let nonItem = pSatisfy (\t ->
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
not (matchTagClose "ol" t))
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
items <- manyTill (pListItem nonItem) (pCloses "ol")
return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items
pDefinitionList :: PandocMonad m => TagParser m Blocks
pDefinitionList = try $ do
pSatisfy (matchTagOpen "dl" [])
items <- manyTill pDefListItem (pCloses "dl")
return $ B.definitionList items
pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks])
pDefListItem = try $ do
let nonItem = pSatisfy (\t -> not (matchTagOpen "dt" [] t) &&
not (matchTagOpen "dd" [] t) && not (matchTagClose "dl" t))
terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
skipMany nonItem
let term = foldl1 (\x y -> x <> B.linebreak <> y) terms
return (term, map (fixPlains True) defs)
fixPlains :: Bool -> Blocks -> Blocks
fixPlains inList bs = if any isParaish bs'
then B.fromList $ map plainToPara bs'
else bs
where isParaish (Para _) = True
isParaish (CodeBlock _ _) = True
isParaish (Header _ _ _) = True
isParaish (BlockQuote _) = True
isParaish (BulletList _) = not inList
isParaish (OrderedList _ _) = not inList
isParaish (DefinitionList _) = not inList
isParaish _ = False
plainToPara (Plain xs) = Para xs
plainToPara x = x
bs' = B.toList bs
pRawTag :: PandocMonad m => TagParser m Text
pRawTag = do
tag <- pAnyTag
let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"]
if tagOpen ignorable (const True) tag || tagClose ignorable tag
then return mempty
else return $ renderTags' [tag]
pDiv :: PandocMonad m => TagParser m Blocks
pDiv = try $ do
guardEnabled Ext_native_divs
let isDivLike "div" = True
isDivLike "section" = True
isDivLike "main" = True
isDivLike _ = False
TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True)
let attr = toStringAttr attr'
contents <- pInTags tag block
let (ident, classes, kvs) = mkAttr attr
let classes' = if tag == "section"
then "section":classes
else classes
kvs' = if tag == "main" && isNothing (lookup "role" kvs)
then ("role", "main"):kvs
else kvs
return $ B.divWith (ident, classes', kvs') contents
pRawHtmlBlock :: PandocMonad m => TagParser m Blocks
pRawHtmlBlock = do
raw <- T.unpack <$> (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag)
exts <- getOption readerExtensions
if extensionEnabled Ext_raw_html exts && not (null raw)
then return $ B.rawBlock "html" raw
else ignore raw
ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a
ignore raw = do
pos <- getPosition
-- raw can be null for tags like <!DOCTYPE>; see paRawTag
-- in this case we don't want a warning:
unless (null raw) $
logMessage $ SkippedContent raw pos
return mempty
pHtmlBlock :: PandocMonad m => Text -> TagParser m Text
pHtmlBlock t = try $ do
open <- pSatisfy (matchTagOpen t [])
contents <- manyTill pAnyTag (pSatisfy (matchTagClose t))
return $ renderTags' $ [open] <> contents <> [TagClose t]
-- Sets chapter context
eSection :: PandocMonad m => TagParser m Blocks
eSection = try $ do
let matchChapter as = maybe False (T.isInfixOf "chapter") (lookup "type" as)
let sectTag = tagOpen (`elem` sectioningContent) matchChapter
TagOpen tag _ <- lookAhead $ pSatisfy sectTag
setInChapter (pInTags tag block)
headerLevel :: PandocMonad m => Text -> TagParser m Int
headerLevel tagtype = do
case safeRead (T.unpack (T.drop 1 tagtype)) of
Just level ->
(try $ do
guardEnabled Ext_epub_html_exts
asks inChapter >>= guard
return (level - 1))
<|>
return level
Nothing -> fail "Could not retrieve header level"
eTitlePage :: PandocMonad m => TagParser m ()
eTitlePage = try $ do
let isTitlePage as = maybe False (T.isInfixOf "titlepage") (lookup "type" as)
let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section")
isTitlePage
TagOpen tag _ <- lookAhead $ pSatisfy groupTag
() <$ pInTags tag block
pHeader :: PandocMonad m => TagParser m Blocks
pHeader = try $ do
TagOpen tagtype attr' <- pSatisfy $
tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
(const True)
let attr = toStringAttr attr'
let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text)
[("class","title")]
level <- headerLevel tagtype
contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
let ident = fromMaybe "" $ lookup "id" attr
let classes = maybe [] words $ lookup "class" attr
let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
attr'' <- registerHeader (ident, classes, keyvals) contents
return $ if bodyTitle
then mempty -- skip a representation of the title in the body
else B.headerWith attr'' level contents
pHrule :: PandocMonad m => TagParser m Blocks
pHrule = do
pSelfClosing (=="hr") (const True)
return B.horizontalRule
pTable :: PandocMonad m => TagParser m Blocks
pTable = try $ do
TagOpen _ _ <- pSatisfy (matchTagOpen "table" [])
skipMany pBlank
caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
let pTh = option [] $ pInTags "tr" (pCell "th")
pTr = try $ skipMany pBlank >>
pInTags "tr" (pCell "td" <|> pCell "th")
pTBody = do pOptInTag "tbody" $ many1 pTr
head'' <- pOptInTag "thead" pTh
head' <- map snd <$>
(pOptInTag "tbody" $
if null head'' then pTh else return head'')
rowsLs <- many pTBody
rows' <- pOptInTag "tfoot" $ many pTr
TagClose _ <- pSatisfy (matchTagClose "table")
let rows'' = (concat rowsLs) <> rows'
let rows''' = map (map snd) rows''
-- let rows''' = map (map snd) rows''
-- fail on empty table
guard $ not $ null head' && null rows'''
let isSinglePlain x = case B.toList x of
[] -> True
[Plain _] -> True
_ -> False
let isSimple = all isSinglePlain $ concat (head':rows''')
let cols = length $ if null head' then head rows''' else head'
-- add empty cells to short rows
let addEmpties r = case cols - length r of
n | n > 0 -> r <> replicate n mempty
| otherwise -> r
let rows = map addEmpties rows'''
let aligns = case rows'' of
(cs:_) -> map fst cs
_ -> replicate cols AlignDefault
let widths = if null widths'
then if isSimple
then replicate cols 0
else replicate cols (1.0 / fromIntegral cols)
else widths'
return $ B.table caption (zip aligns widths) head' rows
pCol :: PandocMonad m => TagParser m Double
pCol = try $ do
TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
let attribs = toStringAttr attribs'
skipMany pBlank
optional $ pSatisfy (matchTagClose "col")
skipMany pBlank
return $ case lookup "width" attribs of
Nothing -> case lookup "style" attribs of
Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs ->
fromMaybe 0.0 $ safeRead ('0':'.':filter
(`notElem` (" \t\r\n%'\";" :: [Char])) xs)
_ -> 0.0
Just x | not (null x) && last x == '%' ->
fromMaybe 0.0 $ safeRead ('0':'.':init x)
_ -> 0.0
pColgroup :: PandocMonad m => TagParser m [Double]
pColgroup = try $ do
pSatisfy (matchTagOpen "colgroup" [])
skipMany pBlank
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
noColOrRowSpans :: Tag Text -> Bool
noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
where isNullOrOne x = case fromAttrib x t of
"" -> True
"1" -> True
_ -> False
pCell :: PandocMonad m => Text -> TagParser m [(Alignment, Blocks)]
pCell celltype = try $ do
skipMany pBlank
tag <- lookAhead $
pSatisfy (\t -> t ~== TagOpen celltype [] && noColOrRowSpans t)
let extractAlign' [] = ""
extractAlign' ("text-align":x:_) = x
extractAlign' (_:xs) = extractAlign' xs
let extractAlign = extractAlign' . wordsBy (`elem` [' ','\t',';',':'])
let align = case maybeFromAttrib "align" tag `mplus`
(extractAlign <$> maybeFromAttrib "style" tag) of
Just "left" -> AlignLeft
Just "right" -> AlignRight
Just "center" -> AlignCenter
_ -> AlignDefault
res <- pInTags' celltype noColOrRowSpans block
skipMany pBlank
return [(align, res)]
pBlockQuote :: PandocMonad m => TagParser m Blocks
pBlockQuote = do
contents <- pInTags "blockquote" block
return $ B.blockQuote $ fixPlains False contents
pPlain :: PandocMonad m => TagParser m Blocks
pPlain = do
contents <- setInPlain $ trimInlines . mconcat <$> many1 inline
if B.isNull contents
then return mempty
else return $ B.plain contents
pPara :: PandocMonad m => TagParser m Blocks
pPara = do
contents <- trimInlines <$> pInTags "p" inline
return $ B.para contents
pFigure :: PandocMonad m => TagParser m Blocks
pFigure = try $ do
TagOpen _ _ <- pSatisfy (matchTagOpen "figure" [])
skipMany pBlank
let pImg = (\x -> (Just x, Nothing)) <$>
(pOptInTag "p" pImage <* skipMany pBlank)
pCapt = (\x -> (Nothing, Just x)) <$>
(pInTags "figcaption" inline <* skipMany pBlank)
pSkip = (Nothing, Nothing) <$ pSatisfy (not . matchTagClose "figure")
res <- many (pImg <|> pCapt <|> pSkip)
let mbimg = msum $ map fst res
let mbcap = msum $ map snd res
TagClose _ <- pSatisfy (matchTagClose "figure")
let caption = fromMaybe mempty mbcap
case B.toList <$> mbimg of
Just [Image attr _ (url, tit)] ->
return $ B.para $ B.imageWith attr url ("fig:" ++ tit) caption
_ -> mzero
pCodeBlock :: PandocMonad m => TagParser m Blocks
pCodeBlock = try $ do
TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" [])
let attr = toStringAttr attr'
contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
let rawText = concatMap tagToString contents
-- drop leading newline if any
let result' = case rawText of
'\n':xs -> xs
_ -> rawText
-- drop trailing newline if any
let result = case reverse result' of
'\n':_ -> init result'
_ -> result'
return $ B.codeBlockWith (mkAttr attr) result
tagToString :: Tag Text -> String
tagToString (TagText s) = T.unpack s
tagToString (TagOpen "br" _) = "\n"
tagToString _ = ""
inline :: PandocMonad m => TagParser m Inlines
inline = choice
[ eNoteref
, eSwitch id inline
, pTagText
, pQ
, pEmph
, pStrong
, pSuperscript
, pSubscript
, pStrikeout
, pUnderline
, pLineBreak
, pLink
, pImage
, pCode
, pSpan
, pMath False
, pRawHtmlInline
]
pLocation :: PandocMonad m => TagParser m ()
pLocation = do
(TagPosition r c) <- pSat isTagPosition
setPosition $ newPos "input" r c
pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSat f = do
pos <- getPosition
token show (const pos) (\x -> if f x then Just x else Nothing)
pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy f = try $ optional pLocation >> pSat f
pAnyTag :: PandocMonad m => TagParser m (Tag Text)
pAnyTag = pSatisfy (const True)
pSelfClosing :: PandocMonad m
=> (Text -> Bool) -> ([Attribute Text] -> Bool)
-> TagParser m (Tag Text)
pSelfClosing f g = do
open <- pSatisfy (tagOpen f g)
optional $ pSatisfy (tagClose f)
return open
pQ :: PandocMonad m => TagParser m Inlines
pQ = do
context <- asks quoteContext
let quoteType = case context of
InDoubleQuote -> SingleQuote
_ -> DoubleQuote
let innerQuoteContext = if quoteType == SingleQuote
then InSingleQuote
else InDoubleQuote
let constructor = case quoteType of
SingleQuote -> B.singleQuoted
DoubleQuote -> B.doubleQuoted
withQuoteContext innerQuoteContext $
pInlinesInTags "q" constructor
pEmph :: PandocMonad m => TagParser m Inlines
pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph
pStrong :: PandocMonad m => TagParser m Inlines
pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong
pSuperscript :: PandocMonad m => TagParser m Inlines
pSuperscript = pInlinesInTags "sup" B.superscript
pSubscript :: PandocMonad m => TagParser m Inlines
pSubscript = pInlinesInTags "sub" B.subscript
pStrikeout :: PandocMonad m => TagParser m Inlines
pStrikeout = do
pInlinesInTags "s" B.strikeout <|>
pInlinesInTags "strike" B.strikeout <|>
pInlinesInTags "del" B.strikeout <|>
try (do pSatisfy (matchTagOpen "span" [("class","strikeout")])
contents <- mconcat <$> manyTill inline (pCloses "span")
return $ B.strikeout contents)
pUnderline :: PandocMonad m => TagParser m Inlines
pUnderline = pInlinesInTags "u" underlineSpan <|> pInlinesInTags "ins" underlineSpan
pLineBreak :: PandocMonad m => TagParser m Inlines
pLineBreak = do
pSelfClosing (=="br") (const True)
return B.linebreak
-- Unlike fromAttrib from tagsoup, this distinguishes
-- between a missing attribute and an attribute with empty content.
maybeFromAttrib :: String -> Tag Text -> Maybe String
maybeFromAttrib name (TagOpen _ attrs) =
T.unpack <$> lookup (T.pack name) attrs
maybeFromAttrib _ _ = Nothing
pLink :: PandocMonad m => TagParser m Inlines
pLink = try $ do
tag <- pSatisfy $ tagOpenLit "a" (const True)
let title = T.unpack $ fromAttrib "title" tag
-- take id from id attribute if present, otherwise name
let uid = maybe (T.unpack $ fromAttrib "name" tag) id $
maybeFromAttrib "id" tag
let cls = words $ T.unpack $ fromAttrib "class" tag
lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")
-- check for href; if href, then a link, otherwise a span
case maybeFromAttrib "href" tag of
Nothing ->
return $ B.spanWith (uid, cls, []) lab
Just url' -> do
mbBaseHref <- baseHref <$> getState
let url = case (parseURIReference url', mbBaseHref) of
(Just rel, Just bs) ->
show (rel `nonStrictRelativeTo` bs)
_ -> url'
return $ B.linkWith (uid, cls, []) (escapeURI url) title lab
pImage :: PandocMonad m => TagParser m Inlines
pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
mbBaseHref <- baseHref <$> getState
let url' = T.unpack $ fromAttrib "src" tag
let url = case (parseURIReference url', mbBaseHref) of
(Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
_ -> url'
let title = T.unpack $ fromAttrib "title" tag
let alt = T.unpack $ fromAttrib "alt" tag
let uid = T.unpack $ fromAttrib "id" tag
let cls = words $ T.unpack $ fromAttrib "class" tag
let getAtt k = case fromAttrib k tag of
"" -> []
v -> [(T.unpack k, T.unpack v)]
let kvs = concat $ map getAtt ["width", "height", "sizes", "srcset"]
return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt)
pCode :: PandocMonad m => TagParser m Inlines
pCode = try $ do
(TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
let attr = toStringAttr attr'
result <- manyTill pAnyTag (pCloses open)
return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ T.unpack $
innerText result
pSpan :: PandocMonad m => TagParser m Inlines
pSpan = try $ do
guardEnabled Ext_native_spans
TagOpen _ attr' <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
let attr = toStringAttr attr'
contents <- pInTags "span" inline
let isSmallCaps = fontVariant == "small-caps" || "smallcaps" `elem` classes
where styleAttr = fromMaybe "" $ lookup "style" attr
fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr
classes = fromMaybe [] $
words <$> lookup "class" attr
let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr)
return $ tag contents
pRawHtmlInline :: PandocMonad m => TagParser m Inlines
pRawHtmlInline = do
inplain <- asks inPlain
result <- pSatisfy (tagComment (const True))
<|> if inplain
then pSatisfy (not . isBlockTag)
else pSatisfy isInlineTag
exts <- getOption readerExtensions
let raw = T.unpack $ renderTags' [result]
if extensionEnabled Ext_raw_html exts
then return $ B.rawInline "html" raw
else ignore raw
mathMLToTeXMath :: String -> Either String String
mathMLToTeXMath s = writeTeX <$> readMathML s
toStringAttr :: [(Text, Text)] -> [(String, String)]
toStringAttr = map go
where go (x,y) = (T.unpack x, T.unpack y)
pMath :: PandocMonad m => Bool -> TagParser m Inlines
pMath inCase = try $ do
open@(TagOpen _ attr') <- pSatisfy $ tagOpen (=="math") (const True)
-- we'll assume math tags are MathML unless specially marked
-- otherwise...
let attr = toStringAttr attr'
unless inCase $
guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr))
contents <- manyTill pAnyTag (pSatisfy (matchTagClose "math"))
case mathMLToTeXMath (T.unpack $ renderTags $
[open] <> contents <> [TagClose "math"]) of
Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $
T.unpack $ innerText contents
Right [] -> return mempty
Right x -> return $ case lookup "display" attr of
Just "block" -> B.displayMath x
_ -> B.math x
pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines)
-> TagParser m Inlines
pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a
pInTags tagtype parser = pInTags' tagtype (const True) parser
pInTags' :: (PandocMonad m, Monoid a)
=> Text
-> (Tag Text -> Bool)
-> TagParser m a
-> TagParser m a
pInTags' tagtype tagtest parser = try $ do
pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
mconcat <$> manyTill parser (pCloses tagtype <|> eof)
-- parses p, preceeded by an optional opening tag
-- and followed by an optional closing tags
pOptInTag :: PandocMonad m => Text -> TagParser m a -> TagParser m a
pOptInTag tagtype p = try $ do
skipMany pBlank
optional $ pSatisfy (matchTagOpen tagtype [])
skipMany pBlank
x <- p
skipMany pBlank
optional $ pSatisfy (matchTagClose tagtype)
skipMany pBlank
return x
pCloses :: PandocMonad m => Text -> TagParser m ()
pCloses tagtype = try $ do
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
case t of
(TagClose t') | t' == tagtype -> pAnyTag >> return ()
(TagOpen t' _) | t' `closes` tagtype -> return ()
(TagClose "ul") | tagtype == "li" -> return ()
(TagClose "ol") | tagtype == "li" -> return ()
(TagClose "dl") | tagtype == "dd" -> return ()
(TagClose "table") | tagtype == "td" -> return ()
(TagClose "table") | tagtype == "tr" -> return ()
(TagClose "td") | tagtype `Set.member` blockHtmlTags -> return ()
(TagClose "th") | tagtype `Set.member` blockHtmlTags -> return ()
(TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags
-> return () -- see #3794
_ -> mzero
pTagText :: PandocMonad m => TagParser m Inlines
pTagText = try $ do
(TagText str) <- pSatisfy isTagText
st <- getState
qu <- ask
parsed <- lift $ lift $
flip runReaderT qu $ runParserT (many pTagContents) st "text" str
case parsed of
Left _ -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'"
Right result -> return $ mconcat result
pBlank :: PandocMonad m => TagParser m ()
pBlank = try $ do
(TagText str) <- pSatisfy isTagText
guard $ T.all isSpace str
type InlinesParser m = HTMLParser m Text
pTagContents :: PandocMonad m => InlinesParser m Inlines
pTagContents =
B.displayMath <$> mathDisplay
<|> B.math <$> mathInline
<|> pStr
<|> pSpace
<|> smartPunctuation pTagContents
<|> pSymbol
<|> pBad
pStr :: PandocMonad m => InlinesParser m Inlines
pStr = do
result <- many1 $ satisfy $ \c ->
not (isSpace c) && not (isSpecial c) && not (isBad c)
updateLastStrPos
return $ B.str result
isSpecial :: Char -> Bool
isSpecial '"' = True
isSpecial '\'' = True
isSpecial '.' = True
isSpecial '-' = True
isSpecial '$' = True
isSpecial '\8216' = True
isSpecial '\8217' = True
isSpecial '\8220' = True
isSpecial '\8221' = True
isSpecial _ = False
pSymbol :: PandocMonad m => InlinesParser m Inlines
pSymbol = satisfy isSpecial >>= return . B.str . (:[])
isBad :: Char -> Bool
isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
pBad :: PandocMonad m => InlinesParser m Inlines
pBad = do
c <- satisfy isBad
let c' = case c of
'\128' -> '\8364'
'\130' -> '\8218'
'\131' -> '\402'
'\132' -> '\8222'
'\133' -> '\8230'
'\134' -> '\8224'
'\135' -> '\8225'
'\136' -> '\710'
'\137' -> '\8240'
'\138' -> '\352'
'\139' -> '\8249'
'\140' -> '\338'
'\142' -> '\381'
'\145' -> '\8216'
'\146' -> '\8217'
'\147' -> '\8220'
'\148' -> '\8221'
'\149' -> '\8226'
'\150' -> '\8211'
'\151' -> '\8212'
'\152' -> '\732'
'\153' -> '\8482'
'\154' -> '\353'
'\155' -> '\8250'
'\156' -> '\339'
'\158' -> '\382'
'\159' -> '\376'
_ -> '?'
return $ B.str [c']
pSpace :: PandocMonad m => InlinesParser m Inlines
pSpace = many1 (satisfy isSpace) >>= \xs ->
if '\n' `elem` xs
then return B.softbreak
else return B.space
--
-- Constants
--
eitherBlockOrInline :: Set.Set Text
eitherBlockOrInline = Set.fromList
["audio", "applet", "button", "iframe", "embed",
"del", "ins", "progress", "map", "area", "noscript", "script",
"object", "svg", "video", "source"]
blockHtmlTags :: Set.Set Text
blockHtmlTags = Set.fromList
["?xml", "!DOCTYPE", "address", "article", "aside",
"blockquote", "body", "canvas",
"caption", "center", "col", "colgroup", "dd", "details",
"dir", "div", "dl", "dt", "fieldset", "figcaption", "figure",
"footer", "form", "h1", "h2", "h3", "h4",
"h5", "h6", "head", "header", "hgroup", "hr", "html",
"isindex", "main", "menu", "meta", "noframes", "ol", "output", "p", "pre",
"section", "table", "tbody", "textarea",
"thead", "tfoot", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr", "script", "style"]
-- We want to allow raw docbook in markdown documents, so we
-- include docbook block tags here too.
blockDocBookTags :: Set.Set Text
blockDocBookTags = Set.fromList
["calloutlist", "bibliolist", "glosslist", "itemizedlist",
"orderedlist", "segmentedlist", "simplelist",
"variablelist", "caution", "important", "note", "tip",
"warning", "address", "literallayout", "programlisting",
"programlistingco", "screen", "screenco", "screenshot",
"synopsis", "example", "informalexample", "figure",
"informalfigure", "table", "informaltable", "para",
"simpara", "formalpara", "equation", "informalequation",
"figure", "screenshot", "mediaobject", "qandaset",
"procedure", "task", "cmdsynopsis", "funcsynopsis",
"classsynopsis", "blockquote", "epigraph", "msgset",
"sidebar", "title"]
epubTags :: Set.Set Text
epubTags = Set.fromList ["case", "switch", "default"]
blockTags :: Set.Set Text
blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags]
class NamedTag a where
getTagName :: a -> Maybe Text
instance NamedTag (Tag Text) where
getTagName (TagOpen t _) = Just t
getTagName (TagClose t) = Just t
getTagName _ = Nothing
instance NamedTag (Tag String) where
getTagName (TagOpen t _) = Just (T.pack t)
getTagName (TagClose t) = Just (T.pack t)
getTagName _ = Nothing
isInlineTag :: NamedTag (Tag a) => Tag a -> Bool
isInlineTag t = isInlineTagName || isCommentTag t
where isInlineTagName = case getTagName t of
Just x -> x
`Set.notMember` blockTags
Nothing -> False
isBlockTag :: NamedTag (Tag a) => Tag a -> Bool
isBlockTag t = isBlockTagName || isTagComment t
where isBlockTagName =
case getTagName t of
Just x
| "?" `T.isPrefixOf` x -> True
| "!" `T.isPrefixOf` x -> True
| otherwise -> x `Set.member` blockTags
|| x `Set.member` eitherBlockOrInline
Nothing -> False
isTextTag :: Tag a -> Bool
isTextTag = tagText (const True)
isCommentTag :: Tag a -> Bool
isCommentTag = tagComment (const True)
-- taken from HXT and extended
-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
closes :: Text -> Text -> Bool
_ `closes` "body" = False
_ `closes` "html" = False
"body" `closes` "head" = True
"a" `closes` "a" = True
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"td" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
"dd" `closes` t | t `elem` ["dt", "dd"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
"optgroup" `closes` "optgroup" = True
"optgroup" `closes` "option" = True
"option" `closes` "option" = True
-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags
x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
"dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
"h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section",
"table", "ul"] = True
_ `closes` "meta" = True
"form" `closes` "form" = True
"label" `closes` "label" = True
"map" `closes` "map" = True
"object" `closes` "object" = True
_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
t `closes` "select" | t /= "option" = True
"thead" `closes` t | t `elem` ["colgroup"] = True
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
t `closes` t2 |
t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] &&
t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main"
t1 `closes` t2 |
t1 `Set.member` blockTags &&
t2 `Set.notMember` blockTags &&
t2 `Set.notMember` eitherBlockOrInline = True
_ `closes` _ = False
--- parsers for use in markdown, textile readers
-- | Matches a stretch of HTML in balanced tags.
htmlInBalanced :: (HasReaderOptions st, Monad m)
=> (Tag String -> Bool)
-> ParserT String st m String
htmlInBalanced f = try $ do
lookAhead (char '<')
inp <- getInput
let ts = canonicalizeTags $
parseTagsOptions parseOptions{ optTagWarning = True,
optTagPosition = True } inp
case ts of
(TagPosition sr sc : t@(TagOpen tn _) : rest) -> do
guard $ f t
guard $ not $ hasTagWarning (t : take 1 rest)
case htmlInBalanced' tn (t:rest) of
[] -> mzero
xs -> case reverse xs of
(TagClose _ : TagPosition er ec : _) -> do
let ls = er - sr
let cs = ec - sc
lscontents <- unlines <$> count ls anyLine
cscontents <- count cs anyChar
closetag <- do
x <- many (satisfy (/='>'))
char '>'
return (x <> ">")
return (lscontents <> cscontents <> closetag)
_ -> mzero
_ -> mzero
htmlInBalanced' :: String
-> [Tag String]
-> [Tag String]
htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts
where go :: Int -> [Tag String] -> Maybe [Tag String]
go n (t@(TagOpen tn' _):rest) | tn' == tagname =
(t :) <$> go (n + 1) rest
go 1 (t@(TagClose tn'):_) | tn' == tagname =
return [t]
go n (t@(TagClose tn'):rest) | tn' == tagname =
(t :) <$> go (n - 1) rest
go n (t:ts') = (t :) <$> go n ts'
go _ [] = mzero
hasTagWarning :: [Tag a] -> Bool
hasTagWarning (TagWarning _:_) = True
hasTagWarning _ = False
-- | Matches a tag meeting a certain condition.
htmlTag :: (HasReaderOptions st, Monad m)
=> (Tag String -> Bool)
-> ParserT [Char] st m (Tag String, String)
htmlTag f = try $ do
lookAhead (char '<')
inp <- getInput
let ts = canonicalizeTags $ parseTagsOptions
parseOptions{ optTagWarning = False
, optTagPosition = True }
(inp ++ " ") -- add space to ensure that
-- we get a TagPosition after the tag
(next, ln, col) <- case ts of
(TagPosition{} : next : TagPosition ln col : _)
| f next -> return (next, ln, col)
_ -> mzero
-- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
-- should NOT be parsed as an HTML tag, see #2277,
-- so we exclude . even though it's a valid character
-- in XML elemnet names
let isNameChar c = isAlphaNum c || c == ':' || c == '-' || c == '_'
let isName s = case s of
[] -> False
(c:cs) -> isLetter c && all isNameChar cs
let endAngle = try $ do char '>'
pos <- getPosition
guard $ (sourceLine pos == ln &&
sourceColumn pos >= col) ||
sourceLine pos > ln
let handleTag tagname = do
-- basic sanity check, since the parser is very forgiving
-- and finds tags in stuff like x<y)
guard $ isName tagname
guard $ not $ null tagname
-- <https://example.org> should NOT be a tag either.
-- tagsoup will parse it as TagOpen "https:" [("example.org","")]
guard $ last tagname /= ':'
char '<'
rendered <- manyTill anyChar endAngle
return (next, "<" ++ rendered ++ ">")
case next of
TagComment s
| "<!--" `isPrefixOf` inp -> do
char '<'
manyTill anyChar endAngle
stripComments <- getOption readerStripComments
if stripComments
then return (next, "")
else return (next, "<!--" <> s <> "-->")
| otherwise -> fail "bogus comment mode, HTML5 parse error"
TagOpen tagname attr -> do
guard $ all (isName . fst) attr
handleTag tagname
TagClose tagname ->
handleTag tagname
_ -> mzero
mkAttr :: [(String, String)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
where attribsId = fromMaybe "" $ lookup "id" attr
attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) <> epubTypes
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
-- Strip namespace prefixes
stripPrefixes :: [Tag Text] -> [Tag Text]
stripPrefixes = map stripPrefix
stripPrefix :: Tag Text -> Tag Text
stripPrefix (TagOpen s as) =
TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as)
stripPrefix (TagClose s) = TagClose (stripPrefix' s)
stripPrefix x = x
stripPrefix' :: Text -> Text
stripPrefix' s =
if T.null t then s else T.drop 1 t
where (_, t) = T.span (/= ':') s
isSpace :: Char -> Bool
isSpace ' ' = True
isSpace '\t' = True
isSpace '\n' = True
isSpace '\r' = True
isSpace _ = False
-- Instances
instance HasIdentifierList HTMLState where
extractIdentifierList = identifiers
updateIdentifierList f s = s{ identifiers = f (identifiers s) }
instance HasHeaderMap HTMLState where
extractHeaderMap = headerMap
updateHeaderMap f s = s{ headerMap = f (headerMap s) }
instance HasLogMessages HTMLState where
addLogMessage m s = s{ logMessages = m : logMessages s }
getLogMessages = reverse . logMessages
-- This signature should be more general
-- MonadReader HTMLLocal m => HasQuoteContext st m
instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where
getQuoteContext = asks quoteContext
withQuoteContext q = local (\s -> s{quoteContext = q})
instance HasReaderOptions HTMLState where
extractReaderOptions = extractReaderOptions . parserState
instance HasMeta HTMLState where
setMeta s b st = st {parserState = setMeta s b $ parserState st}
deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
instance Default HTMLLocal where
def = HTMLLocal NoQuote False False
instance HasLastStrPosition HTMLState where
setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
getLastStrPos = getLastStrPos . parserState
-- For now we need a special verison here; the one in Shared has String type
renderTags' :: [Tag Text] -> Text
renderTags' = renderTagsOptions
renderOptions{ optMinimize = matchTags ["hr", "br", "img",
"meta", "link"]
, optRawTag = matchTags ["script", "style"] }
where matchTags = \tags -> flip elem tags . T.toLower
-- EPUB Specific
--
--
sectioningContent :: [Text]
sectioningContent = ["article", "aside", "nav", "section"]
groupingContent :: [Text]
groupingContent = ["p", "hr", "pre", "blockquote", "ol"
, "ul", "li", "dl", "dt", "dt", "dd"
, "figure", "figcaption", "div", "main"]
matchTagClose :: Text -> (Tag Text -> Bool)
matchTagClose t = (~== TagClose t)
matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool)
matchTagOpen t as = (~== TagOpen t as)
{-
types :: [(String, ([String], Int))]
types = -- Document divisions
map (\s -> (s, (["section", "body"], 0)))
["volume", "part", "chapter", "division"]
<> -- Document section and components
[
("abstract", ([], 0))]
-}