Rewrote HTML reader to use Text throughout.
- Export new NamedTag class from HTML reader. - Effect on memory usage is modest (< 10%).
This commit is contained in:
parent
0ab26ac9eb
commit
49b738de4e
1 changed files with 192 additions and 135 deletions
|
@ -1,5 +1,5 @@
|
|||
{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses,
|
||||
ViewPatterns#-}
|
||||
ViewPatterns, OverloadedStrings #-}
|
||||
{-
|
||||
Copyright (C) 2006-2017 John MacFarlane <jgm@berkeley.edu>
|
||||
|
||||
|
@ -34,6 +34,7 @@ module Text.Pandoc.Readers.HTML ( readHtml
|
|||
, htmlInBalanced
|
||||
, isInlineTag
|
||||
, isBlockTag
|
||||
, NamedTag(..)
|
||||
, isTextTag
|
||||
, isCommentTag
|
||||
) where
|
||||
|
@ -43,7 +44,7 @@ import Text.HTML.TagSoup.Match
|
|||
import Text.Pandoc.Definition
|
||||
import qualified Text.Pandoc.Builder as B
|
||||
import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
|
||||
import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField
|
||||
import Text.Pandoc.Shared ( extractSpaces, addMetaField
|
||||
, escapeURI, safeRead )
|
||||
import Text.Pandoc.Options (ReaderOptions(readerExtensions), extensionEnabled,
|
||||
Extension (Ext_epub_html_exts,
|
||||
|
@ -53,13 +54,14 @@ import Text.Pandoc.Parsing hiding ((<|>))
|
|||
import Text.Pandoc.Walk
|
||||
import qualified Data.Map as M
|
||||
import Data.Maybe ( fromMaybe, isJust)
|
||||
import Data.List ( intercalate, isInfixOf, isPrefixOf )
|
||||
import Data.List ( intercalate, isPrefixOf )
|
||||
import Data.Char ( isDigit, isLetter, isAlphaNum )
|
||||
import Control.Monad ( guard, mzero, void, unless )
|
||||
import Control.Arrow ((***))
|
||||
import Control.Applicative ( (<|>) )
|
||||
import Data.Monoid (First (..))
|
||||
import Data.Text (Text, unpack)
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
import Text.TeXMath (readMathML, writeTeX)
|
||||
import Data.Default (Default (..), def)
|
||||
import Control.Monad.Reader (ask, asks, local, ReaderT, runReaderT, lift)
|
||||
|
@ -80,7 +82,7 @@ readHtml :: PandocMonad m
|
|||
readHtml opts inp = do
|
||||
let tags = stripPrefixes . canonicalizeTags $
|
||||
parseTagsOptions parseOptions{ optTagPosition = True }
|
||||
(unpack inp)
|
||||
inp
|
||||
parseDoc = do
|
||||
blocks <- (fixPlains False) . mconcat <$> manyTill block eof
|
||||
meta <- stateMeta . parserState <$> getState
|
||||
|
@ -130,7 +132,7 @@ setInPlain = local (\s -> s {inPlain = True})
|
|||
|
||||
type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m)
|
||||
|
||||
type TagParser m = HTMLParser m [Tag String]
|
||||
type TagParser m = HTMLParser m [Tag Text]
|
||||
|
||||
pBody :: PandocMonad m => TagParser m Blocks
|
||||
pBody = pInTags "body" block
|
||||
|
@ -140,12 +142,12 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag
|
|||
where pTitle = pInTags "title" inline >>= setTitle . trimInlines
|
||||
setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
|
||||
pMetaTag = do
|
||||
mt <- pSatisfy (~== TagOpen "meta" [])
|
||||
let name = fromAttrib "name" mt
|
||||
mt <- pSatisfy (matchTagOpen "meta" [])
|
||||
let name = T.unpack $ fromAttrib "name" mt
|
||||
if null name
|
||||
then return mempty
|
||||
else do
|
||||
let content = fromAttrib "content" mt
|
||||
let content = T.unpack $ fromAttrib "content" mt
|
||||
updateState $ \s ->
|
||||
let ps = parserState s in
|
||||
s{ parserState = ps{
|
||||
|
@ -153,9 +155,9 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag
|
|||
(stateMeta ps) } }
|
||||
return mempty
|
||||
pBaseTag = do
|
||||
bt <- pSatisfy (~== TagOpen "base" [])
|
||||
bt <- pSatisfy (matchTagOpen "base" [])
|
||||
updateState $ \st -> st{ baseHref =
|
||||
parseURIReference $ fromAttrib "href" bt }
|
||||
parseURIReference $ T.unpack $ fromAttrib "href" bt }
|
||||
return mempty
|
||||
|
||||
block :: PandocMonad m => TagParser m Blocks
|
||||
|
@ -195,29 +197,31 @@ eSwitch :: (PandocMonad m, Monoid a)
|
|||
-> TagParser m a
|
||||
eSwitch constructor parser = try $ do
|
||||
guardEnabled Ext_epub_html_exts
|
||||
pSatisfy (~== TagOpen "switch" [])
|
||||
pSatisfy (matchTagOpen "switch" [])
|
||||
cases <- getFirst . mconcat <$>
|
||||
manyTill (First <$> (eCase <* skipMany pBlank) )
|
||||
(lookAhead $ try $ pSatisfy (~== TagOpen "default" []))
|
||||
(lookAhead $ try $ pSatisfy (matchTagOpen "default" []))
|
||||
skipMany pBlank
|
||||
fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank)
|
||||
skipMany pBlank
|
||||
pSatisfy (~== TagClose "switch")
|
||||
pSatisfy (matchTagClose "switch")
|
||||
return $ maybe fallback constructor cases
|
||||
|
||||
eCase :: PandocMonad m => TagParser m (Maybe Inlines)
|
||||
eCase = do
|
||||
skipMany pBlank
|
||||
TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" [])
|
||||
TagOpen _ attr' <- lookAhead $ pSatisfy $ (matchTagOpen "case" [])
|
||||
let attr = toStringAttr attr'
|
||||
case (flip lookup namespaces) =<< lookup "required-namespace" attr of
|
||||
Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank))
|
||||
Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case"))
|
||||
Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (matchTagClose "case"))
|
||||
|
||||
eFootnote :: PandocMonad m => TagParser m ()
|
||||
eFootnote = try $ do
|
||||
let notes = ["footnote", "rearnote"]
|
||||
guardEnabled Ext_epub_html_exts
|
||||
(TagOpen tag attr) <- lookAhead $ pAnyTag
|
||||
(TagOpen tag attr') <- lookAhead $ pAnyTag
|
||||
let attr = toStringAttr attr'
|
||||
guard (maybe False (flip elem notes) (lookup "type" attr))
|
||||
let ident = fromMaybe "" (lookup "id" attr)
|
||||
content <- pInTags tag block
|
||||
|
@ -229,7 +233,8 @@ addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)
|
|||
eNoteref :: PandocMonad m => TagParser m Inlines
|
||||
eNoteref = try $ do
|
||||
guardEnabled Ext_epub_html_exts
|
||||
TagOpen tag attr <- lookAhead $ pAnyTag
|
||||
TagOpen tag attr' <- lookAhead $ pAnyTag
|
||||
let attr = toStringAttr attr'
|
||||
guard (maybe False (== "noteref") (lookup "type" attr))
|
||||
let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr)
|
||||
guard (not (null ident))
|
||||
|
@ -249,10 +254,10 @@ pList = pBulletList <|> pOrderedList <|> pDefinitionList
|
|||
|
||||
pBulletList :: PandocMonad m => TagParser m Blocks
|
||||
pBulletList = try $ do
|
||||
pSatisfy (~== TagOpen "ul" [])
|
||||
pSatisfy (matchTagOpen "ul" [])
|
||||
let nonItem = pSatisfy (\t ->
|
||||
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
|
||||
not (t ~== TagClose "ul"))
|
||||
not (matchTagClose "ul" t))
|
||||
-- note: if they have an <ol> or <ul> not in scope of a <li>,
|
||||
-- treat it as a list item, though it's not valid xhtml...
|
||||
skipMany nonItem
|
||||
|
@ -261,7 +266,8 @@ pBulletList = try $ do
|
|||
|
||||
pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks
|
||||
pListItem nonItem = do
|
||||
TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" [])
|
||||
TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "li" [])
|
||||
let attr = toStringAttr attr'
|
||||
let addId ident bs = case B.toList bs of
|
||||
(Plain ils:xs) -> B.fromList (Plain
|
||||
[Span (ident, [], []) ils] : xs)
|
||||
|
@ -287,7 +293,8 @@ parseTypeAttr _ = DefaultStyle
|
|||
|
||||
pOrderedList :: PandocMonad m => TagParser m Blocks
|
||||
pOrderedList = try $ do
|
||||
TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
|
||||
TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" [])
|
||||
let attribs = toStringAttr attribs'
|
||||
let (start, style) = (sta', sty')
|
||||
where sta = fromMaybe "1" $
|
||||
lookup "start" attribs
|
||||
|
@ -309,7 +316,7 @@ pOrderedList = try $ do
|
|||
]
|
||||
let nonItem = pSatisfy (\t ->
|
||||
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
|
||||
not (t ~== TagClose "ol"))
|
||||
not (matchTagClose "ol" t))
|
||||
-- note: if they have an <ol> or <ul> not in scope of a <li>,
|
||||
-- treat it as a list item, though it's not valid xhtml...
|
||||
skipMany nonItem
|
||||
|
@ -318,14 +325,14 @@ pOrderedList = try $ do
|
|||
|
||||
pDefinitionList :: PandocMonad m => TagParser m Blocks
|
||||
pDefinitionList = try $ do
|
||||
pSatisfy (~== TagOpen "dl" [])
|
||||
pSatisfy (matchTagOpen "dl" [])
|
||||
items <- manyTill pDefListItem (pCloses "dl")
|
||||
return $ B.definitionList items
|
||||
|
||||
pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks])
|
||||
pDefListItem = try $ do
|
||||
let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) &&
|
||||
not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl"))
|
||||
let nonItem = pSatisfy (\t -> not (matchTagOpen "dt" [] t) &&
|
||||
not (matchTagOpen "dd" [] t) && not (matchTagClose "dl" t))
|
||||
terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
|
||||
defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
|
||||
skipMany nonItem
|
||||
|
@ -348,12 +355,12 @@ fixPlains inList bs = if any isParaish bs'
|
|||
plainToPara x = x
|
||||
bs' = B.toList bs
|
||||
|
||||
pRawTag :: PandocMonad m => TagParser m String
|
||||
pRawTag :: PandocMonad m => TagParser m Text
|
||||
pRawTag = do
|
||||
tag <- pAnyTag
|
||||
let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"]
|
||||
if tagOpen ignorable (const True) tag || tagClose ignorable tag
|
||||
then return []
|
||||
then return mempty
|
||||
else return $ renderTags' [tag]
|
||||
|
||||
pDiv :: PandocMonad m => TagParser m Blocks
|
||||
|
@ -362,7 +369,8 @@ pDiv = try $ do
|
|||
let isDivLike "div" = True
|
||||
isDivLike "section" = True
|
||||
isDivLike _ = False
|
||||
TagOpen tag attr <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True)
|
||||
TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True)
|
||||
let attr = toStringAttr attr'
|
||||
contents <- pInTags tag block
|
||||
let (ident, classes, kvs) = mkAttr attr
|
||||
let classes' = if tag == "section"
|
||||
|
@ -372,7 +380,7 @@ pDiv = try $ do
|
|||
|
||||
pRawHtmlBlock :: PandocMonad m => TagParser m Blocks
|
||||
pRawHtmlBlock = do
|
||||
raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag
|
||||
raw <- T.unpack <$> (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag)
|
||||
exts <- getOption readerExtensions
|
||||
if extensionEnabled Ext_raw_html exts && not (null raw)
|
||||
then return $ B.rawBlock "html" raw
|
||||
|
@ -387,33 +395,35 @@ ignore raw = do
|
|||
logMessage $ SkippedContent raw pos
|
||||
return mempty
|
||||
|
||||
pHtmlBlock :: PandocMonad m => String -> TagParser m String
|
||||
pHtmlBlock :: PandocMonad m => Text -> TagParser m Text
|
||||
pHtmlBlock t = try $ do
|
||||
open <- pSatisfy (~== TagOpen t [])
|
||||
contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
|
||||
return $ renderTags' $ [open] ++ contents ++ [TagClose t]
|
||||
open <- pSatisfy (matchTagOpen t [])
|
||||
contents <- manyTill pAnyTag (pSatisfy (matchTagClose t))
|
||||
return $ renderTags' $ [open] <> contents <> [TagClose t]
|
||||
|
||||
-- Sets chapter context
|
||||
eSection :: PandocMonad m => TagParser m Blocks
|
||||
eSection = try $ do
|
||||
let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as)
|
||||
let matchChapter as = maybe False (T.isInfixOf "chapter") (lookup "type" as)
|
||||
let sectTag = tagOpen (`elem` sectioningContent) matchChapter
|
||||
TagOpen tag _ <- lookAhead $ pSatisfy sectTag
|
||||
setInChapter (pInTags tag block)
|
||||
|
||||
headerLevel :: PandocMonad m => String -> TagParser m Int
|
||||
headerLevel :: PandocMonad m => Text -> TagParser m Int
|
||||
headerLevel tagtype = do
|
||||
let level = read (drop 1 tagtype)
|
||||
(try $ do
|
||||
guardEnabled Ext_epub_html_exts
|
||||
asks inChapter >>= guard
|
||||
return (level - 1))
|
||||
<|>
|
||||
return level
|
||||
case safeRead (T.unpack (T.drop 1 tagtype)) of
|
||||
Just level ->
|
||||
(try $ do
|
||||
guardEnabled Ext_epub_html_exts
|
||||
asks inChapter >>= guard
|
||||
return (level - 1))
|
||||
<|>
|
||||
return level
|
||||
Nothing -> fail "Could not retrieve header level"
|
||||
|
||||
eTitlePage :: PandocMonad m => TagParser m ()
|
||||
eTitlePage = try $ do
|
||||
let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as)
|
||||
let isTitlePage as = maybe False (T.isInfixOf "titlepage") (lookup "type" as)
|
||||
let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section")
|
||||
isTitlePage
|
||||
TagOpen tag _ <- lookAhead $ pSatisfy groupTag
|
||||
|
@ -421,19 +431,21 @@ eTitlePage = try $ do
|
|||
|
||||
pHeader :: PandocMonad m => TagParser m Blocks
|
||||
pHeader = try $ do
|
||||
TagOpen tagtype attr <- pSatisfy $
|
||||
TagOpen tagtype attr' <- pSatisfy $
|
||||
tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
|
||||
(const True)
|
||||
let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
|
||||
let attr = toStringAttr attr'
|
||||
let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text)
|
||||
[("class","title")]
|
||||
level <- headerLevel tagtype
|
||||
contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
|
||||
let ident = fromMaybe "" $ lookup "id" attr
|
||||
let classes = maybe [] words $ lookup "class" attr
|
||||
let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
|
||||
attr' <- registerHeader (ident, classes, keyvals) contents
|
||||
attr'' <- registerHeader (ident, classes, keyvals) contents
|
||||
return $ if bodyTitle
|
||||
then mempty -- skip a representation of the title in the body
|
||||
else B.headerWith attr' level contents
|
||||
else B.headerWith attr'' level contents
|
||||
|
||||
pHrule :: PandocMonad m => TagParser m Blocks
|
||||
pHrule = do
|
||||
|
@ -442,7 +454,7 @@ pHrule = do
|
|||
|
||||
pTable :: PandocMonad m => TagParser m Blocks
|
||||
pTable = try $ do
|
||||
TagOpen _ _ <- pSatisfy (~== TagOpen "table" [])
|
||||
TagOpen _ _ <- pSatisfy (matchTagOpen "table" [])
|
||||
skipMany pBlank
|
||||
caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
|
||||
widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
|
||||
|
@ -456,8 +468,8 @@ pTable = try $ do
|
|||
else return head''
|
||||
rowsLs <- many pTBody
|
||||
rows' <- pOptInTag "tfoot" $ many pTr
|
||||
TagClose _ <- pSatisfy (~== TagClose "table")
|
||||
let rows'' = (concat rowsLs) ++ rows'
|
||||
TagClose _ <- pSatisfy (matchTagClose "table")
|
||||
let rows'' = (concat rowsLs) <> rows'
|
||||
-- fail on empty table
|
||||
guard $ not $ null head' && null rows''
|
||||
let isSinglePlain x = case B.toList x of
|
||||
|
@ -468,7 +480,7 @@ pTable = try $ do
|
|||
let cols = length $ if null head' then head rows'' else head'
|
||||
-- add empty cells to short rows
|
||||
let addEmpties r = case cols - length r of
|
||||
n | n > 0 -> r ++ replicate n mempty
|
||||
n | n > 0 -> r <> replicate n mempty
|
||||
| otherwise -> r
|
||||
let rows = map addEmpties rows''
|
||||
let aligns = replicate cols AlignDefault
|
||||
|
@ -481,15 +493,16 @@ pTable = try $ do
|
|||
|
||||
pCol :: PandocMonad m => TagParser m Double
|
||||
pCol = try $ do
|
||||
TagOpen _ attribs <- pSatisfy (~== TagOpen "col" [])
|
||||
TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
|
||||
let attribs = toStringAttr attribs'
|
||||
skipMany pBlank
|
||||
optional $ pSatisfy (~== TagClose "col")
|
||||
optional $ pSatisfy (matchTagClose "col")
|
||||
skipMany pBlank
|
||||
return $ case lookup "width" attribs of
|
||||
Nothing -> case lookup "style" attribs of
|
||||
Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs ->
|
||||
fromMaybe 0.0 $ safeRead ('0':'.':filter
|
||||
(`notElem` " \t\r\n%'\";") xs)
|
||||
(`notElem` (" \t\r\n%'\";" :: [Char])) xs)
|
||||
_ -> 0.0
|
||||
Just x | not (null x) && last x == '%' ->
|
||||
fromMaybe 0.0 $ safeRead ('0':'.':init x)
|
||||
|
@ -497,18 +510,18 @@ pCol = try $ do
|
|||
|
||||
pColgroup :: PandocMonad m => TagParser m [Double]
|
||||
pColgroup = try $ do
|
||||
pSatisfy (~== TagOpen "colgroup" [])
|
||||
pSatisfy (matchTagOpen "colgroup" [])
|
||||
skipMany pBlank
|
||||
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
|
||||
|
||||
noColOrRowSpans :: Tag String -> Bool
|
||||
noColOrRowSpans :: Tag Text -> Bool
|
||||
noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
|
||||
where isNullOrOne x = case fromAttrib x t of
|
||||
"" -> True
|
||||
"1" -> True
|
||||
_ -> False
|
||||
|
||||
pCell :: PandocMonad m => String -> TagParser m [Blocks]
|
||||
pCell :: PandocMonad m => Text -> TagParser m [Blocks]
|
||||
pCell celltype = try $ do
|
||||
skipMany pBlank
|
||||
res <- pInTags' celltype noColOrRowSpans block
|
||||
|
@ -534,7 +547,8 @@ pPara = do
|
|||
|
||||
pCodeBlock :: PandocMonad m => TagParser m Blocks
|
||||
pCodeBlock = try $ do
|
||||
TagOpen _ attr <- pSatisfy (~== TagOpen "pre" [])
|
||||
TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" [])
|
||||
let attr = toStringAttr attr'
|
||||
contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
|
||||
let rawText = concatMap tagToString contents
|
||||
-- drop leading newline if any
|
||||
|
@ -547,8 +561,8 @@ pCodeBlock = try $ do
|
|||
_ -> result'
|
||||
return $ B.codeBlockWith (mkAttr attr) result
|
||||
|
||||
tagToString :: Tag String -> String
|
||||
tagToString (TagText s) = s
|
||||
tagToString :: Tag Text -> String
|
||||
tagToString (TagText s) = T.unpack s
|
||||
tagToString (TagOpen "br" _) = "\n"
|
||||
tagToString _ = ""
|
||||
|
||||
|
@ -577,20 +591,20 @@ pLocation = do
|
|||
(TagPosition r c) <- pSat isTagPosition
|
||||
setPosition $ newPos "input" r c
|
||||
|
||||
pSat :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String)
|
||||
pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
|
||||
pSat f = do
|
||||
pos <- getPosition
|
||||
token show (const pos) (\x -> if f x then Just x else Nothing)
|
||||
|
||||
pSatisfy :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String)
|
||||
pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
|
||||
pSatisfy f = try $ optional pLocation >> pSat f
|
||||
|
||||
pAnyTag :: PandocMonad m => TagParser m (Tag String)
|
||||
pAnyTag :: PandocMonad m => TagParser m (Tag Text)
|
||||
pAnyTag = pSatisfy (const True)
|
||||
|
||||
pSelfClosing :: PandocMonad m
|
||||
=> (String -> Bool) -> ([Attribute String] -> Bool)
|
||||
-> TagParser m (Tag String)
|
||||
=> (Text -> Bool) -> ([Attribute Text] -> Bool)
|
||||
-> TagParser m (Tag Text)
|
||||
pSelfClosing f g = do
|
||||
open <- pSatisfy (tagOpen f g)
|
||||
optional $ pSatisfy (tagClose f)
|
||||
|
@ -628,7 +642,7 @@ pStrikeout = do
|
|||
pInlinesInTags "s" B.strikeout <|>
|
||||
pInlinesInTags "strike" B.strikeout <|>
|
||||
pInlinesInTags "del" B.strikeout <|>
|
||||
try (do pSatisfy (~== TagOpen "span" [("class","strikeout")])
|
||||
try (do pSatisfy (matchTagOpen "span" [("class","strikeout")])
|
||||
contents <- mconcat <$> manyTill inline (pCloses "span")
|
||||
return $ B.strikeout contents)
|
||||
|
||||
|
@ -639,17 +653,19 @@ pLineBreak = do
|
|||
|
||||
-- Unlike fromAttrib from tagsoup, this distinguishes
|
||||
-- between a missing attribute and an attribute with empty content.
|
||||
maybeFromAttrib :: String -> Tag String -> Maybe String
|
||||
maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs
|
||||
maybeFromAttrib :: String -> Tag Text -> Maybe String
|
||||
maybeFromAttrib name (TagOpen _ attrs) =
|
||||
T.unpack <$> lookup (T.pack name) attrs
|
||||
maybeFromAttrib _ _ = Nothing
|
||||
|
||||
pLink :: PandocMonad m => TagParser m Inlines
|
||||
pLink = try $ do
|
||||
tag <- pSatisfy $ tagOpenLit "a" (const True)
|
||||
let title = fromAttrib "title" tag
|
||||
let title = T.unpack $ fromAttrib "title" tag
|
||||
-- take id from id attribute if present, otherwise name
|
||||
let uid = maybe (fromAttrib "name" tag) id $ maybeFromAttrib "id" tag
|
||||
let cls = words $ fromAttrib "class" tag
|
||||
let uid = maybe (T.unpack $ fromAttrib "name" tag) id $
|
||||
maybeFromAttrib "id" tag
|
||||
let cls = words $ T.unpack $ fromAttrib "class" tag
|
||||
lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")
|
||||
-- check for href; if href, then a link, otherwise a span
|
||||
case maybeFromAttrib "href" tag of
|
||||
|
@ -667,30 +683,33 @@ pImage :: PandocMonad m => TagParser m Inlines
|
|||
pImage = do
|
||||
tag <- pSelfClosing (=="img") (isJust . lookup "src")
|
||||
mbBaseHref <- baseHref <$> getState
|
||||
let url' = fromAttrib "src" tag
|
||||
let url' = T.unpack $ fromAttrib "src" tag
|
||||
let url = case (parseURIReference url', mbBaseHref) of
|
||||
(Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
|
||||
_ -> url'
|
||||
let title = fromAttrib "title" tag
|
||||
let alt = fromAttrib "alt" tag
|
||||
let uid = fromAttrib "id" tag
|
||||
let cls = words $ fromAttrib "class" tag
|
||||
let title = T.unpack $ fromAttrib "title" tag
|
||||
let alt = T.unpack $ fromAttrib "alt" tag
|
||||
let uid = T.unpack $ fromAttrib "id" tag
|
||||
let cls = words $ T.unpack $ fromAttrib "class" tag
|
||||
let getAtt k = case fromAttrib k tag of
|
||||
"" -> []
|
||||
v -> [(k, v)]
|
||||
v -> [(T.unpack k, T.unpack v)]
|
||||
let kvs = concat $ map getAtt ["width", "height", "sizes", "srcset"]
|
||||
return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt)
|
||||
|
||||
pCode :: PandocMonad m => TagParser m Inlines
|
||||
pCode = try $ do
|
||||
(TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
|
||||
(TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
|
||||
let attr = toStringAttr attr'
|
||||
result <- manyTill pAnyTag (pCloses open)
|
||||
return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result
|
||||
return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ T.unpack $
|
||||
innerText result
|
||||
|
||||
pSpan :: PandocMonad m => TagParser m Inlines
|
||||
pSpan = try $ do
|
||||
guardEnabled Ext_native_spans
|
||||
TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
|
||||
TagOpen _ attr' <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
|
||||
let attr = toStringAttr attr'
|
||||
contents <- pInTags "span" inline
|
||||
let isSmallCaps = fontVariant == "small-caps" || "smallcaps" `elem` classes
|
||||
where styleAttr = fromMaybe "" $ lookup "style" attr
|
||||
|
@ -708,7 +727,7 @@ pRawHtmlInline = do
|
|||
then pSatisfy (not . isBlockTag)
|
||||
else pSatisfy isInlineTag
|
||||
exts <- getOption readerExtensions
|
||||
let raw = renderTags' [result]
|
||||
let raw = T.unpack $ renderTags' [result]
|
||||
if extensionEnabled Ext_raw_html exts
|
||||
then return $ B.rawInline "html" raw
|
||||
else ignore raw
|
||||
|
@ -716,32 +735,38 @@ pRawHtmlInline = do
|
|||
mathMLToTeXMath :: String -> Either String String
|
||||
mathMLToTeXMath s = writeTeX <$> readMathML s
|
||||
|
||||
toStringAttr :: [(Text, Text)] -> [(String, String)]
|
||||
toStringAttr = map go
|
||||
where go (x,y) = (T.unpack x, T.unpack y)
|
||||
|
||||
pMath :: PandocMonad m => Bool -> TagParser m Inlines
|
||||
pMath inCase = try $ do
|
||||
open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True)
|
||||
open@(TagOpen _ attr') <- pSatisfy $ tagOpen (=="math") (const True)
|
||||
-- we'll assume math tags are MathML unless specially marked
|
||||
-- otherwise...
|
||||
let attr = toStringAttr attr'
|
||||
unless inCase $
|
||||
guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr))
|
||||
contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math"))
|
||||
case mathMLToTeXMath (renderTags $ [open] ++ contents ++ [TagClose "math"]) of
|
||||
contents <- manyTill pAnyTag (pSatisfy (matchTagClose "math"))
|
||||
case mathMLToTeXMath (T.unpack $ renderTags $
|
||||
[open] <> contents <> [TagClose "math"]) of
|
||||
Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $
|
||||
innerText contents
|
||||
T.unpack $ innerText contents
|
||||
Right [] -> return mempty
|
||||
Right x -> return $ case lookup "display" attr of
|
||||
Just "block" -> B.displayMath x
|
||||
_ -> B.math x
|
||||
|
||||
pInlinesInTags :: PandocMonad m => String -> (Inlines -> Inlines)
|
||||
pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines)
|
||||
-> TagParser m Inlines
|
||||
pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
|
||||
|
||||
pInTags :: (PandocMonad m, Monoid a) => String -> TagParser m a -> TagParser m a
|
||||
pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a
|
||||
pInTags tagtype parser = pInTags' tagtype (const True) parser
|
||||
|
||||
pInTags' :: (PandocMonad m, Monoid a)
|
||||
=> String
|
||||
-> (Tag String -> Bool)
|
||||
=> Text
|
||||
-> (Tag Text -> Bool)
|
||||
-> TagParser m a
|
||||
-> TagParser m a
|
||||
pInTags' tagtype tagtest parser = try $ do
|
||||
|
@ -750,18 +775,18 @@ pInTags' tagtype tagtest parser = try $ do
|
|||
|
||||
-- parses p, preceeded by an optional opening tag
|
||||
-- and followed by an optional closing tags
|
||||
pOptInTag :: PandocMonad m => String -> TagParser m a -> TagParser m a
|
||||
pOptInTag :: PandocMonad m => Text -> TagParser m a -> TagParser m a
|
||||
pOptInTag tagtype p = try $ do
|
||||
skipMany pBlank
|
||||
optional $ pSatisfy (~== TagOpen tagtype [])
|
||||
optional $ pSatisfy (matchTagOpen tagtype [])
|
||||
skipMany pBlank
|
||||
x <- p
|
||||
skipMany pBlank
|
||||
optional $ pSatisfy (~== TagClose tagtype)
|
||||
optional $ pSatisfy (matchTagClose tagtype)
|
||||
skipMany pBlank
|
||||
return x
|
||||
|
||||
pCloses :: PandocMonad m => String -> TagParser m ()
|
||||
pCloses :: PandocMonad m => Text -> TagParser m ()
|
||||
pCloses tagtype = try $ do
|
||||
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
|
||||
case t of
|
||||
|
@ -782,15 +807,15 @@ pTagText = try $ do
|
|||
parsed <- lift $ lift $
|
||||
flip runReaderT qu $ runParserT (many pTagContents) st "text" str
|
||||
case parsed of
|
||||
Left _ -> throwError $ PandocParseError $ "Could not parse `" ++ str ++ "'"
|
||||
Left _ -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'"
|
||||
Right result -> return $ mconcat result
|
||||
|
||||
pBlank :: PandocMonad m => TagParser m ()
|
||||
pBlank = try $ do
|
||||
(TagText str) <- pSatisfy isTagText
|
||||
guard $ all isSpace str
|
||||
guard $ T.all isSpace str
|
||||
|
||||
type InlinesParser m = HTMLParser m String
|
||||
type InlinesParser m = HTMLParser m Text
|
||||
|
||||
pTagContents :: PandocMonad m => InlinesParser m Inlines
|
||||
pTagContents =
|
||||
|
@ -871,13 +896,13 @@ pSpace = many1 (satisfy isSpace) >>= \xs ->
|
|||
-- Constants
|
||||
--
|
||||
|
||||
eitherBlockOrInline :: Set.Set String
|
||||
eitherBlockOrInline :: Set.Set Text
|
||||
eitherBlockOrInline = Set.fromList
|
||||
["audio", "applet", "button", "iframe", "embed",
|
||||
"del", "ins", "progress", "map", "area", "noscript", "script",
|
||||
"object", "svg", "video", "source"]
|
||||
|
||||
blockHtmlTags :: Set.Set String
|
||||
blockHtmlTags :: Set.Set Text
|
||||
blockHtmlTags = Set.fromList
|
||||
["?xml", "!DOCTYPE", "address", "article", "aside",
|
||||
"blockquote", "body", "canvas",
|
||||
|
@ -893,7 +918,7 @@ blockHtmlTags = Set.fromList
|
|||
|
||||
-- We want to allow raw docbook in markdown documents, so we
|
||||
-- include docbook block tags here too.
|
||||
blockDocBookTags :: Set.Set String
|
||||
blockDocBookTags :: Set.Set Text
|
||||
blockDocBookTags = Set.fromList
|
||||
["calloutlist", "bibliolist", "glosslist", "itemizedlist",
|
||||
"orderedlist", "segmentedlist", "simplelist",
|
||||
|
@ -908,37 +933,52 @@ blockDocBookTags = Set.fromList
|
|||
"classsynopsis", "blockquote", "epigraph", "msgset",
|
||||
"sidebar", "title"]
|
||||
|
||||
epubTags :: Set.Set String
|
||||
epubTags :: Set.Set Text
|
||||
epubTags = Set.fromList ["case", "switch", "default"]
|
||||
|
||||
blockTags :: Set.Set String
|
||||
blockTags :: Set.Set Text
|
||||
blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags]
|
||||
|
||||
isInlineTag :: Tag String -> Bool
|
||||
isInlineTag t = tagOpen isInlineTagName (const True) t ||
|
||||
tagClose isInlineTagName t ||
|
||||
tagComment (const True) t
|
||||
where isInlineTagName x = x `Set.notMember` blockTags
|
||||
class NamedTag a where
|
||||
getTagName :: a -> Maybe Text
|
||||
|
||||
isBlockTag :: Tag String -> Bool
|
||||
isBlockTag t = tagOpen isBlockTagName (const True) t ||
|
||||
tagClose isBlockTagName t ||
|
||||
tagComment (const True) t
|
||||
where isBlockTagName ('?':_) = True
|
||||
isBlockTagName ('!':_) = True
|
||||
isBlockTagName x = x `Set.member` blockTags
|
||||
|| x `Set.member`
|
||||
eitherBlockOrInline
|
||||
instance NamedTag (Tag Text) where
|
||||
getTagName (TagOpen t _) = Just t
|
||||
getTagName (TagClose t) = Just t
|
||||
getTagName _ = Nothing
|
||||
|
||||
isTextTag :: Tag String -> Bool
|
||||
instance NamedTag (Tag String) where
|
||||
getTagName (TagOpen t _) = Just (T.pack t)
|
||||
getTagName (TagClose t) = Just (T.pack t)
|
||||
getTagName _ = Nothing
|
||||
|
||||
isInlineTag :: NamedTag (Tag a) => Tag a -> Bool
|
||||
isInlineTag t = isInlineTagName || isCommentTag t
|
||||
where isInlineTagName = case getTagName t of
|
||||
Just x -> x
|
||||
`Set.notMember` blockTags
|
||||
Nothing -> False
|
||||
|
||||
isBlockTag :: NamedTag (Tag a) => Tag a -> Bool
|
||||
isBlockTag t = isBlockTagName || isTagComment t
|
||||
where isBlockTagName =
|
||||
case getTagName t of
|
||||
Just x
|
||||
| "?" `T.isPrefixOf` x -> True
|
||||
| "!" `T.isPrefixOf` x -> True
|
||||
| otherwise -> x `Set.member` blockTags
|
||||
|| x `Set.member` eitherBlockOrInline
|
||||
Nothing -> False
|
||||
|
||||
isTextTag :: Tag a -> Bool
|
||||
isTextTag = tagText (const True)
|
||||
|
||||
isCommentTag :: Tag String -> Bool
|
||||
isCommentTag :: Tag a -> Bool
|
||||
isCommentTag = tagComment (const True)
|
||||
|
||||
-- taken from HXT and extended
|
||||
-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
|
||||
closes :: String -> String -> Bool
|
||||
closes :: Text -> Text -> Bool
|
||||
_ `closes` "body" = False
|
||||
_ `closes` "html" = False
|
||||
"body" `closes` "head" = True
|
||||
|
@ -1000,8 +1040,11 @@ htmlInBalanced f = try $ do
|
|||
let cs = ec - sc
|
||||
lscontents <- unlines <$> count ls anyLine
|
||||
cscontents <- count cs anyChar
|
||||
(_,closetag) <- htmlTag (~== TagClose tn)
|
||||
return (lscontents ++ cscontents ++ closetag)
|
||||
closetag <- do
|
||||
x <- many (satisfy (/='>'))
|
||||
char '>'
|
||||
return (x <> ">")
|
||||
return (lscontents <> cscontents <> closetag)
|
||||
_ -> mzero
|
||||
_ -> mzero
|
||||
|
||||
|
@ -1019,7 +1062,7 @@ htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts
|
|||
go n (t:ts') = (t :) <$> go n ts'
|
||||
go _ [] = mzero
|
||||
|
||||
hasTagWarning :: [Tag String] -> Bool
|
||||
hasTagWarning :: [Tag a] -> Bool
|
||||
hasTagWarning (TagWarning _:_) = True
|
||||
hasTagWarning _ = False
|
||||
|
||||
|
@ -1047,47 +1090,48 @@ htmlTag f = try $ do
|
|||
-- basic sanity check, since the parser is very forgiving
|
||||
-- and finds tags in stuff like x<y)
|
||||
guard $ isName tagname
|
||||
guard $ not $ null tagname
|
||||
-- <https://example.org> should NOT be a tag either.
|
||||
-- tagsoup will parse it as TagOpen "https:" [("example.org","")]
|
||||
guard $ last tagname /= ':'
|
||||
rendered <- manyTill anyChar (char '>')
|
||||
return (next, rendered ++ ">")
|
||||
return (next, rendered <> ">")
|
||||
case next of
|
||||
TagComment s
|
||||
| "<!--" `isPrefixOf` inp -> do
|
||||
count (length s + 4) anyChar
|
||||
skipMany (satisfy (/='>'))
|
||||
char '>'
|
||||
return (next, "<!--" ++ s ++ "-->")
|
||||
return (next, "<!--" <> s <> "-->")
|
||||
| otherwise -> fail "bogus comment mode, HTML5 parse error"
|
||||
TagOpen tagname attr -> do
|
||||
guard $ all (isName . fst) attr
|
||||
handleTag tagname
|
||||
TagClose tagname -> handleTag tagname
|
||||
TagClose tagname ->
|
||||
handleTag tagname
|
||||
_ -> mzero
|
||||
|
||||
mkAttr :: [(String, String)] -> Attr
|
||||
mkAttr attr = (attribsId, attribsClasses, attribsKV)
|
||||
where attribsId = fromMaybe "" $ lookup "id" attr
|
||||
attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes
|
||||
attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) <> epubTypes
|
||||
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
|
||||
epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
|
||||
|
||||
-- Strip namespace prefixes
|
||||
stripPrefixes :: [Tag String] -> [Tag String]
|
||||
stripPrefixes :: [Tag Text] -> [Tag Text]
|
||||
stripPrefixes = map stripPrefix
|
||||
|
||||
stripPrefix :: Tag String -> Tag String
|
||||
stripPrefix :: Tag Text -> Tag Text
|
||||
stripPrefix (TagOpen s as) =
|
||||
TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as)
|
||||
stripPrefix (TagClose s) = TagClose (stripPrefix' s)
|
||||
stripPrefix x = x
|
||||
|
||||
stripPrefix' :: String -> String
|
||||
stripPrefix' :: Text -> Text
|
||||
stripPrefix' s =
|
||||
case span (/= ':') s of
|
||||
(_, "") -> s
|
||||
(_, (_:ts)) -> ts
|
||||
if T.null t then s else T.drop 1 t
|
||||
where (_, t) = T.span (/= ':') s
|
||||
|
||||
isSpace :: Char -> Bool
|
||||
isSpace ' ' = True
|
||||
|
@ -1130,19 +1174,32 @@ instance HasLastStrPosition HTMLState where
|
|||
setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
|
||||
getLastStrPos = getLastStrPos . parserState
|
||||
|
||||
-- For now we need a special verison here; the one in Shared has String type
|
||||
renderTags' :: [Tag Text] -> Text
|
||||
renderTags' = renderTagsOptions
|
||||
renderOptions{ optMinimize = matchTags ["hr", "br", "img",
|
||||
"meta", "link"]
|
||||
, optRawTag = matchTags ["script", "style"] }
|
||||
where matchTags = \tags -> flip elem tags . T.toLower
|
||||
|
||||
|
||||
-- EPUB Specific
|
||||
--
|
||||
--
|
||||
sectioningContent :: [String]
|
||||
sectioningContent :: [Text]
|
||||
sectioningContent = ["article", "aside", "nav", "section"]
|
||||
|
||||
|
||||
groupingContent :: [String]
|
||||
groupingContent :: [Text]
|
||||
groupingContent = ["p", "hr", "pre", "blockquote", "ol"
|
||||
, "ul", "li", "dl", "dt", "dt", "dd"
|
||||
, "figure", "figcaption", "div", "main"]
|
||||
|
||||
matchTagClose :: Text -> (Tag Text -> Bool)
|
||||
matchTagClose t = (~== TagClose t)
|
||||
|
||||
matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool)
|
||||
matchTagOpen t as = (~== TagOpen t as)
|
||||
|
||||
{-
|
||||
|
||||
|
@ -1150,7 +1207,7 @@ types :: [(String, ([String], Int))]
|
|||
types = -- Document divisions
|
||||
map (\s -> (s, (["section", "body"], 0)))
|
||||
["volume", "part", "chapter", "division"]
|
||||
++ -- Document section and components
|
||||
<> -- Document section and components
|
||||
[
|
||||
("abstract", ([], 0))]
|
||||
-}
|
||||
|
|
Loading…
Add table
Reference in a new issue