82b3e0ab97
- The (non-exported) prelude is in prelude/Prelude.hs. - It exports Monoid and Applicative, like base 4.8 prelude, but works with older base versions. - It exports (<>) for mappend. - It hides 'catch' on older base versions. This allows us to remove many imports of Data.Monoid and Control.Applicative, and remove Text.Pandoc.Compat.Monoid. It should allow us to use -Wall again for ghc 7.10.
1049 lines
37 KiB
Haskell
1049 lines
37 KiB
Haskell
{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses,
|
|
ViewPatterns#-}
|
|
{-
|
|
Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu>
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
-}
|
|
|
|
{- |
|
|
Module : Text.Pandoc.Readers.HTML
|
|
Copyright : Copyright (C) 2006-2015 John MacFarlane
|
|
License : GNU GPL, version 2 or above
|
|
|
|
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
|
Stability : alpha
|
|
Portability : portable
|
|
|
|
Conversion of HTML to 'Pandoc' document.
|
|
-}
|
|
module Text.Pandoc.Readers.HTML ( readHtml
|
|
, htmlTag
|
|
, htmlInBalanced
|
|
, isInlineTag
|
|
, isBlockTag
|
|
, isTextTag
|
|
, isCommentTag
|
|
) where
|
|
|
|
import Text.HTML.TagSoup
|
|
import Text.HTML.TagSoup.Match
|
|
import Text.Pandoc.Definition
|
|
import qualified Text.Pandoc.Builder as B
|
|
import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
|
|
import Text.Pandoc.Shared ( extractSpaces, renderTags'
|
|
, escapeURI, safeRead, mapLeft )
|
|
import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace)
|
|
, Extension (Ext_epub_html_exts,
|
|
Ext_native_divs, Ext_native_spans))
|
|
import Text.Pandoc.Parsing hiding ((<|>))
|
|
import Text.Pandoc.Walk
|
|
import qualified Data.Map as M
|
|
import Data.Maybe ( fromMaybe, isJust)
|
|
import Data.List ( intercalate, isInfixOf, isPrefixOf, isSuffixOf )
|
|
import Data.Char ( isDigit )
|
|
import Control.Monad ( liftM, guard, when, mzero, void, unless )
|
|
import Control.Arrow ((***))
|
|
import Control.Applicative ( (<|>) )
|
|
import Data.Monoid (First (..))
|
|
import Text.Printf (printf)
|
|
import Debug.Trace (trace)
|
|
import Text.TeXMath (readMathML, writeTeX)
|
|
import Data.Default (Default (..), def)
|
|
import Control.Monad.Reader (Reader,ask, asks, local, runReader)
|
|
import Network.URI (isURI)
|
|
import Text.Pandoc.Error
|
|
import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
|
|
|
|
import Text.Parsec.Error
|
|
|
|
|
|
-- | Convert HTML-formatted string to 'Pandoc' document.
|
|
readHtml :: ReaderOptions -- ^ Reader options
|
|
-> String -- ^ String to parse (assumes @'\n'@ line endings)
|
|
-> Either PandocError Pandoc
|
|
readHtml opts inp =
|
|
mapLeft (ParseFailure . getError) . flip runReader def $
|
|
runParserT parseDoc
|
|
(HTMLState def{ stateOptions = opts } [] Nothing [] M.empty)
|
|
"source" tags
|
|
where tags = stripPrefixes . canonicalizeTags $
|
|
parseTagsOptions parseOptions{ optTagPosition = True } inp
|
|
parseDoc = do
|
|
blocks <- (fixPlains False) . mconcat <$> manyTill block eof
|
|
meta <- stateMeta . parserState <$> getState
|
|
bs' <- replaceNotes (B.toList blocks)
|
|
return $ Pandoc meta bs'
|
|
getError (errorMessages -> ms) = case ms of
|
|
[] -> ""
|
|
(m:_) -> messageString m
|
|
|
|
replaceNotes :: [Block] -> TagParser [Block]
|
|
replaceNotes = walkM replaceNotes'
|
|
|
|
replaceNotes' :: Inline -> TagParser Inline
|
|
replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes
|
|
where
|
|
getNotes = noteTable <$> getState
|
|
replaceNotes' x = return x
|
|
|
|
data HTMLState =
|
|
HTMLState
|
|
{ parserState :: ParserState,
|
|
noteTable :: [(String, Blocks)],
|
|
baseHref :: Maybe String,
|
|
identifiers :: [String],
|
|
headerMap :: M.Map Inlines String
|
|
}
|
|
|
|
data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
|
|
, inChapter :: Bool -- ^ Set if in chapter section
|
|
, inPlain :: Bool -- ^ Set if in pPlain
|
|
}
|
|
|
|
setInChapter :: HTMLParser s a -> HTMLParser s a
|
|
setInChapter = local (\s -> s {inChapter = True})
|
|
|
|
setInPlain :: HTMLParser s a -> HTMLParser s a
|
|
setInPlain = local (\s -> s {inPlain = True})
|
|
|
|
type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal)
|
|
|
|
type TagParser = HTMLParser [Tag String]
|
|
|
|
pBody :: TagParser Blocks
|
|
pBody = pInTags "body" block
|
|
|
|
pHead :: TagParser Blocks
|
|
pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag)
|
|
where pTitle = pInTags "title" inline >>= setTitle . trimInlines
|
|
setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
|
|
pMetaTag = do
|
|
mt <- pSatisfy (~== TagOpen "meta" [])
|
|
let name = fromAttrib "name" mt
|
|
if null name
|
|
then return mempty
|
|
else do
|
|
let content = fromAttrib "content" mt
|
|
updateState $ B.setMeta name (B.text content)
|
|
return mempty
|
|
pBaseTag = do
|
|
bt <- pSatisfy (~== TagOpen "base" [])
|
|
let baseH = fromAttrib "href" bt
|
|
if null baseH
|
|
then return mempty
|
|
else do
|
|
let baseH' = case reverse baseH of
|
|
'/':_ -> baseH
|
|
_ -> baseH ++ "/"
|
|
updateState $ \st -> st{ baseHref = Just baseH' }
|
|
return mempty
|
|
|
|
block :: TagParser Blocks
|
|
block = do
|
|
tr <- getOption readerTrace
|
|
pos <- getPosition
|
|
res <- choice
|
|
[ eSection
|
|
, eSwitch B.para block
|
|
, mempty <$ eFootnote
|
|
, mempty <$ eTOC
|
|
, mempty <$ eTitlePage
|
|
, pPara
|
|
, pHeader
|
|
, pBlockQuote
|
|
, pCodeBlock
|
|
, pList
|
|
, pHrule
|
|
, pTable
|
|
, pHead
|
|
, pBody
|
|
, pDiv
|
|
, pPlain
|
|
, pRawHtmlBlock
|
|
]
|
|
when tr $ trace (printf "line %d: %s" (sourceLine pos)
|
|
(take 60 $ show $ B.toList res)) (return ())
|
|
return res
|
|
|
|
namespaces :: [(String, TagParser Inlines)]
|
|
namespaces = [(mathMLNamespace, pMath True)]
|
|
|
|
mathMLNamespace :: String
|
|
mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
|
|
|
|
eSwitch :: Monoid a => (Inlines -> a) -> TagParser a -> TagParser a
|
|
eSwitch constructor parser = try $ do
|
|
guardEnabled Ext_epub_html_exts
|
|
pSatisfy (~== TagOpen "switch" [])
|
|
cases <- getFirst . mconcat <$>
|
|
manyTill (First <$> (eCase <* skipMany pBlank) )
|
|
(lookAhead $ try $ pSatisfy (~== TagOpen "default" []))
|
|
skipMany pBlank
|
|
fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank)
|
|
skipMany pBlank
|
|
pSatisfy (~== TagClose "switch")
|
|
return $ maybe fallback constructor cases
|
|
|
|
eCase :: TagParser (Maybe Inlines)
|
|
eCase = do
|
|
skipMany pBlank
|
|
TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" [])
|
|
case (flip lookup namespaces) =<< lookup "required-namespace" attr of
|
|
Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank))
|
|
Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case"))
|
|
|
|
eFootnote :: TagParser ()
|
|
eFootnote = try $ do
|
|
let notes = ["footnote", "rearnote"]
|
|
guardEnabled Ext_epub_html_exts
|
|
(TagOpen tag attr) <- lookAhead $ pAnyTag
|
|
guard (maybe False (flip elem notes) (lookup "type" attr))
|
|
let ident = fromMaybe "" (lookup "id" attr)
|
|
content <- pInTags tag block
|
|
addNote ident content
|
|
|
|
addNote :: String -> Blocks -> TagParser ()
|
|
addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)})
|
|
|
|
eNoteref :: TagParser Inlines
|
|
eNoteref = try $ do
|
|
guardEnabled Ext_epub_html_exts
|
|
TagOpen tag attr <- lookAhead $ pAnyTag
|
|
guard (maybe False (== "noteref") (lookup "type" attr))
|
|
let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr)
|
|
guard (not (null ident))
|
|
pInTags tag block
|
|
return $ B.rawInline "noteref" ident
|
|
|
|
-- Strip TOC if there is one, better to generate again
|
|
eTOC :: TagParser ()
|
|
eTOC = try $ do
|
|
guardEnabled Ext_epub_html_exts
|
|
(TagOpen tag attr) <- lookAhead $ pAnyTag
|
|
guard (maybe False (== "toc") (lookup "type" attr))
|
|
void (pInTags tag block)
|
|
|
|
pList :: TagParser Blocks
|
|
pList = pBulletList <|> pOrderedList <|> pDefinitionList
|
|
|
|
pBulletList :: TagParser Blocks
|
|
pBulletList = try $ do
|
|
pSatisfy (~== TagOpen "ul" [])
|
|
let nonItem = pSatisfy (\t ->
|
|
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
|
|
not (t ~== TagClose "ul"))
|
|
-- note: if they have an <ol> or <ul> not in scope of a <li>,
|
|
-- treat it as a list item, though it's not valid xhtml...
|
|
skipMany nonItem
|
|
items <- manyTill (pListItem nonItem) (pCloses "ul")
|
|
return $ B.bulletList $ map (fixPlains True) items
|
|
|
|
pListItem :: TagParser a -> TagParser Blocks
|
|
pListItem nonItem = do
|
|
TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" [])
|
|
let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr)
|
|
(liDiv <>) <$> pInTags "li" block <* skipMany nonItem
|
|
|
|
parseListStyleType :: String -> ListNumberStyle
|
|
parseListStyleType "lower-roman" = LowerRoman
|
|
parseListStyleType "upper-roman" = UpperRoman
|
|
parseListStyleType "lower-alpha" = LowerAlpha
|
|
parseListStyleType "upper-alpha" = UpperAlpha
|
|
parseListStyleType "decimal" = Decimal
|
|
parseListStyleType _ = DefaultStyle
|
|
|
|
parseTypeAttr :: String -> ListNumberStyle
|
|
parseTypeAttr "i" = LowerRoman
|
|
parseTypeAttr "I" = UpperRoman
|
|
parseTypeAttr "a" = LowerAlpha
|
|
parseTypeAttr "A" = UpperAlpha
|
|
parseTypeAttr "1" = Decimal
|
|
parseTypeAttr _ = DefaultStyle
|
|
|
|
pOrderedList :: TagParser Blocks
|
|
pOrderedList = try $ do
|
|
TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
|
|
let (start, style) = (sta', sty')
|
|
where sta = fromMaybe "1" $
|
|
lookup "start" attribs
|
|
sta' = if all isDigit sta
|
|
then read sta
|
|
else 1
|
|
|
|
pickListStyle = pickStyleAttrProps ["list-style-type", "list-style"]
|
|
|
|
typeAttr = fromMaybe "" $ lookup "type" attribs
|
|
classAttr = fromMaybe "" $ lookup "class" attribs
|
|
styleAttr = fromMaybe "" $ lookup "style" attribs
|
|
listStyle = fromMaybe "" $ pickListStyle styleAttr
|
|
|
|
sty' = foldOrElse DefaultStyle
|
|
[ parseTypeAttr typeAttr
|
|
, parseListStyleType classAttr
|
|
, parseListStyleType listStyle
|
|
]
|
|
let nonItem = pSatisfy (\t ->
|
|
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
|
|
not (t ~== TagClose "ol"))
|
|
-- note: if they have an <ol> or <ul> not in scope of a <li>,
|
|
-- treat it as a list item, though it's not valid xhtml...
|
|
skipMany nonItem
|
|
items <- manyTill (pListItem nonItem) (pCloses "ol")
|
|
return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items
|
|
|
|
pDefinitionList :: TagParser Blocks
|
|
pDefinitionList = try $ do
|
|
pSatisfy (~== TagOpen "dl" [])
|
|
items <- manyTill pDefListItem (pCloses "dl")
|
|
return $ B.definitionList items
|
|
|
|
pDefListItem :: TagParser (Inlines, [Blocks])
|
|
pDefListItem = try $ do
|
|
let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) &&
|
|
not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl"))
|
|
terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
|
|
defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
|
|
skipMany nonItem
|
|
let term = foldl1 (\x y -> x <> B.linebreak <> y) terms
|
|
return (term, map (fixPlains True) defs)
|
|
|
|
fixPlains :: Bool -> Blocks -> Blocks
|
|
fixPlains inList bs = if any isParaish bs'
|
|
then B.fromList $ map plainToPara bs'
|
|
else bs
|
|
where isParaish (Para _) = True
|
|
isParaish (CodeBlock _ _) = True
|
|
isParaish (Header _ _ _) = True
|
|
isParaish (BlockQuote _) = True
|
|
isParaish (BulletList _) = not inList
|
|
isParaish (OrderedList _ _) = not inList
|
|
isParaish (DefinitionList _) = not inList
|
|
isParaish _ = False
|
|
plainToPara (Plain xs) = Para xs
|
|
plainToPara x = x
|
|
bs' = B.toList bs
|
|
|
|
pRawTag :: TagParser String
|
|
pRawTag = do
|
|
tag <- pAnyTag
|
|
let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"]
|
|
if tagOpen ignorable (const True) tag || tagClose ignorable tag
|
|
then return []
|
|
else return $ renderTags' [tag]
|
|
|
|
pDiv :: TagParser Blocks
|
|
pDiv = try $ do
|
|
guardEnabled Ext_native_divs
|
|
let isDivLike "div" = True
|
|
isDivLike "section" = True
|
|
isDivLike _ = False
|
|
TagOpen tag attr <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True)
|
|
contents <- pInTags tag block
|
|
let (ident, classes, kvs) = mkAttr attr
|
|
let classes' = if tag == "section"
|
|
then "section":classes
|
|
else classes
|
|
return $ B.divWith (ident, classes', kvs) contents
|
|
|
|
pRawHtmlBlock :: TagParser Blocks
|
|
pRawHtmlBlock = do
|
|
raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag
|
|
parseRaw <- getOption readerParseRaw
|
|
if parseRaw && not (null raw)
|
|
then return $ B.rawBlock "html" raw
|
|
else return mempty
|
|
|
|
pHtmlBlock :: String -> TagParser String
|
|
pHtmlBlock t = try $ do
|
|
open <- pSatisfy (~== TagOpen t [])
|
|
contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
|
|
return $ renderTags' $ [open] ++ contents ++ [TagClose t]
|
|
|
|
-- Sets chapter context
|
|
eSection :: TagParser Blocks
|
|
eSection = try $ do
|
|
let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as)
|
|
let sectTag = tagOpen (`elem` sectioningContent) matchChapter
|
|
TagOpen tag _ <- lookAhead $ pSatisfy sectTag
|
|
setInChapter (pInTags tag block)
|
|
|
|
headerLevel :: String -> TagParser Int
|
|
headerLevel tagtype = do
|
|
let level = read (drop 1 tagtype)
|
|
(try $ do
|
|
guardEnabled Ext_epub_html_exts
|
|
asks inChapter >>= guard
|
|
return (level - 1))
|
|
<|>
|
|
return level
|
|
|
|
eTitlePage :: TagParser ()
|
|
eTitlePage = try $ do
|
|
let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as)
|
|
let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section")
|
|
isTitlePage
|
|
TagOpen tag _ <- lookAhead $ pSatisfy groupTag
|
|
() <$ pInTags tag block
|
|
|
|
pHeader :: TagParser Blocks
|
|
pHeader = try $ do
|
|
TagOpen tagtype attr <- pSatisfy $
|
|
tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
|
|
(const True)
|
|
let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
|
|
level <- headerLevel tagtype
|
|
contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
|
|
let ident = fromMaybe "" $ lookup "id" attr
|
|
let classes = maybe [] words $ lookup "class" attr
|
|
let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
|
|
attr' <- registerHeader (ident, classes, keyvals) contents
|
|
return $ if bodyTitle
|
|
then mempty -- skip a representation of the title in the body
|
|
else B.headerWith attr' level contents
|
|
|
|
pHrule :: TagParser Blocks
|
|
pHrule = do
|
|
pSelfClosing (=="hr") (const True)
|
|
return B.horizontalRule
|
|
|
|
pTable :: TagParser Blocks
|
|
pTable = try $ do
|
|
TagOpen _ _ <- pSatisfy (~== TagOpen "table" [])
|
|
skipMany pBlank
|
|
caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
|
|
-- TODO actually read these and take width information from them
|
|
widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
|
|
let pTh = option [] $ pInTags "tr" (pCell "th")
|
|
pTr = try $ skipMany pBlank >> pInTags "tr" (pCell "td" <|> pCell "th")
|
|
pTBody = do pOptInTag "tbody" $ many1 pTr
|
|
head'' <- pOptInTag "thead" pTh
|
|
head' <- pOptInTag "tbody" $ do
|
|
if null head''
|
|
then pTh
|
|
else return head''
|
|
rowsLs <- many pTBody
|
|
rows' <- pOptInTag "tfoot" $ many pTr
|
|
TagClose _ <- pSatisfy (~== TagClose "table")
|
|
let rows = (concat rowsLs) ++ rows'
|
|
-- fail on empty table
|
|
guard $ not $ null head' && null rows
|
|
let isSinglePlain x = case B.toList x of
|
|
[Plain _] -> True
|
|
_ -> False
|
|
let isSimple = all isSinglePlain $ concat (head':rows)
|
|
let cols = length $ if null head' then head rows else head'
|
|
-- fail if there are colspans or rowspans
|
|
guard $ all (\r -> length r == cols) rows
|
|
let aligns = replicate cols AlignDefault
|
|
let widths = if null widths'
|
|
then if isSimple
|
|
then replicate cols 0
|
|
else replicate cols (1.0 / fromIntegral cols)
|
|
else widths'
|
|
return $ B.table caption (zip aligns widths) head' rows
|
|
|
|
pCol :: TagParser Double
|
|
pCol = try $ do
|
|
TagOpen _ attribs <- pSatisfy (~== TagOpen "col" [])
|
|
skipMany pBlank
|
|
optional $ pSatisfy (~== TagClose "col")
|
|
skipMany pBlank
|
|
return $ case lookup "width" attribs of
|
|
Just x | not (null x) && last x == '%' ->
|
|
fromMaybe 0.0 $ safeRead ('0':'.':init x)
|
|
_ -> 0.0
|
|
|
|
pColgroup :: TagParser [Double]
|
|
pColgroup = try $ do
|
|
pSatisfy (~== TagOpen "colgroup" [])
|
|
skipMany pBlank
|
|
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
|
|
|
|
pCell :: String -> TagParser [Blocks]
|
|
pCell celltype = try $ do
|
|
skipMany pBlank
|
|
res <- pInTags celltype block
|
|
skipMany pBlank
|
|
return [res]
|
|
|
|
pBlockQuote :: TagParser Blocks
|
|
pBlockQuote = do
|
|
contents <- pInTags "blockquote" block
|
|
return $ B.blockQuote $ fixPlains False contents
|
|
|
|
pPlain :: TagParser Blocks
|
|
pPlain = do
|
|
contents <- setInPlain $ trimInlines . mconcat <$> many1 inline
|
|
if B.isNull contents
|
|
then return mempty
|
|
else return $ B.plain contents
|
|
|
|
pPara :: TagParser Blocks
|
|
pPara = do
|
|
contents <- trimInlines <$> pInTags "p" inline
|
|
return $ B.para contents
|
|
|
|
pCodeBlock :: TagParser Blocks
|
|
pCodeBlock = try $ do
|
|
TagOpen _ attr <- pSatisfy (~== TagOpen "pre" [])
|
|
contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
|
|
let rawText = concatMap tagToString contents
|
|
-- drop leading newline if any
|
|
let result' = case rawText of
|
|
'\n':xs -> xs
|
|
_ -> rawText
|
|
-- drop trailing newline if any
|
|
let result = case reverse result' of
|
|
'\n':_ -> init result'
|
|
_ -> result'
|
|
return $ B.codeBlockWith (mkAttr attr) result
|
|
|
|
tagToString :: Tag String -> String
|
|
tagToString (TagText s) = s
|
|
tagToString (TagOpen "br" _) = "\n"
|
|
tagToString _ = ""
|
|
|
|
inline :: TagParser Inlines
|
|
inline = choice
|
|
[ eNoteref
|
|
, eSwitch id inline
|
|
, pTagText
|
|
, pQ
|
|
, pEmph
|
|
, pStrong
|
|
, pSuperscript
|
|
, pSubscript
|
|
, pStrikeout
|
|
, pLineBreak
|
|
, pLink
|
|
, pImage
|
|
, pCode
|
|
, pSpan
|
|
, pMath False
|
|
, pRawHtmlInline
|
|
]
|
|
|
|
pLocation :: TagParser ()
|
|
pLocation = do
|
|
(TagPosition r c) <- pSat isTagPosition
|
|
setPosition $ newPos "input" r c
|
|
|
|
pSat :: (Tag String -> Bool) -> TagParser (Tag String)
|
|
pSat f = do
|
|
pos <- getPosition
|
|
token show (const pos) (\x -> if f x then Just x else Nothing)
|
|
|
|
pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String)
|
|
pSatisfy f = try $ optional pLocation >> pSat f
|
|
|
|
pAnyTag :: TagParser (Tag String)
|
|
pAnyTag = pSatisfy (const True)
|
|
|
|
pSelfClosing :: (String -> Bool) -> ([Attribute String] -> Bool)
|
|
-> TagParser (Tag String)
|
|
pSelfClosing f g = do
|
|
open <- pSatisfy (tagOpen f g)
|
|
optional $ pSatisfy (tagClose f)
|
|
return open
|
|
|
|
pQ :: TagParser Inlines
|
|
pQ = do
|
|
context <- asks quoteContext
|
|
let quoteType = case context of
|
|
InDoubleQuote -> SingleQuote
|
|
_ -> DoubleQuote
|
|
let innerQuoteContext = if quoteType == SingleQuote
|
|
then InSingleQuote
|
|
else InDoubleQuote
|
|
let constructor = case quoteType of
|
|
SingleQuote -> B.singleQuoted
|
|
DoubleQuote -> B.doubleQuoted
|
|
withQuoteContext innerQuoteContext $
|
|
pInlinesInTags "q" constructor
|
|
|
|
pEmph :: TagParser Inlines
|
|
pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph
|
|
|
|
pStrong :: TagParser Inlines
|
|
pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong
|
|
|
|
pSuperscript :: TagParser Inlines
|
|
pSuperscript = pInlinesInTags "sup" B.superscript
|
|
|
|
pSubscript :: TagParser Inlines
|
|
pSubscript = pInlinesInTags "sub" B.subscript
|
|
|
|
pStrikeout :: TagParser Inlines
|
|
pStrikeout = do
|
|
pInlinesInTags "s" B.strikeout <|>
|
|
pInlinesInTags "strike" B.strikeout <|>
|
|
pInlinesInTags "del" B.strikeout <|>
|
|
try (do pSatisfy (~== TagOpen "span" [("class","strikeout")])
|
|
contents <- mconcat <$> manyTill inline (pCloses "span")
|
|
return $ B.strikeout contents)
|
|
|
|
pLineBreak :: TagParser Inlines
|
|
pLineBreak = do
|
|
pSelfClosing (=="br") (const True)
|
|
return B.linebreak
|
|
|
|
pLink :: TagParser Inlines
|
|
pLink = pRelLink <|> pAnchor
|
|
|
|
pAnchor :: TagParser Inlines
|
|
pAnchor = try $ do
|
|
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "id"))
|
|
return $ B.spanWith (fromAttrib "id" tag , [], []) mempty
|
|
|
|
pRelLink :: TagParser Inlines
|
|
pRelLink = try $ do
|
|
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
|
|
mbBaseHref <- baseHref <$> getState
|
|
let url' = fromAttrib "href" tag
|
|
let url = case (isURI url', mbBaseHref) of
|
|
(False, Just h) -> h ++ url'
|
|
_ -> url'
|
|
let title = fromAttrib "title" tag
|
|
let uid = fromAttrib "id" tag
|
|
let spanC = case uid of
|
|
[] -> id
|
|
s -> B.spanWith (s, [], [])
|
|
lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")
|
|
return $ spanC $ B.link (escapeURI url) title lab
|
|
|
|
pImage :: TagParser Inlines
|
|
pImage = do
|
|
tag <- pSelfClosing (=="img") (isJust . lookup "src")
|
|
mbBaseHref <- baseHref <$> getState
|
|
let url' = fromAttrib "src" tag
|
|
let url = case (isURI url', mbBaseHref) of
|
|
(False, Just h) -> h ++ url'
|
|
_ -> url'
|
|
let title = fromAttrib "title" tag
|
|
let alt = fromAttrib "alt" tag
|
|
return $ B.image (escapeURI url) title (B.text alt)
|
|
|
|
pCode :: TagParser Inlines
|
|
pCode = try $ do
|
|
(TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
|
|
result <- manyTill pAnyTag (pCloses open)
|
|
return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result
|
|
|
|
pSpan :: TagParser Inlines
|
|
pSpan = try $ do
|
|
guardEnabled Ext_native_spans
|
|
TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
|
|
contents <- pInTags "span" inline
|
|
let isSmallCaps = fontVariant == "small-caps"
|
|
where styleAttr = fromMaybe "" $ lookup "style" attr
|
|
fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr
|
|
let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr)
|
|
return $ tag contents
|
|
|
|
pRawHtmlInline :: TagParser Inlines
|
|
pRawHtmlInline = do
|
|
inplain <- asks inPlain
|
|
result <- pSatisfy (tagComment (const True))
|
|
<|> if inplain
|
|
then pSatisfy (not . isBlockTag)
|
|
else pSatisfy isInlineTag
|
|
parseRaw <- getOption readerParseRaw
|
|
if parseRaw
|
|
then return $ B.rawInline "html" $ renderTags' [result]
|
|
else return mempty
|
|
|
|
mathMLToTeXMath :: String -> Either String String
|
|
mathMLToTeXMath s = writeTeX <$> readMathML s
|
|
|
|
pMath :: Bool -> TagParser Inlines
|
|
pMath inCase = try $ do
|
|
open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True)
|
|
unless (inCase) (guard (maybe False (== mathMLNamespace) (lookup "xmlns" attr)))
|
|
contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math"))
|
|
let math = mathMLToTeXMath $
|
|
(renderTags $ [open] ++ contents ++ [TagClose "math"])
|
|
let constructor =
|
|
maybe B.math (\x -> if (x == "inline") then B.math else B.displayMath)
|
|
(lookup "display" attr)
|
|
return $ either (const mempty)
|
|
(\x -> if null x then mempty else constructor x) math
|
|
|
|
pInlinesInTags :: String -> (Inlines -> Inlines)
|
|
-> TagParser Inlines
|
|
pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
|
|
|
|
pInTags :: (Monoid a) => String -> TagParser a
|
|
-> TagParser a
|
|
pInTags tagtype parser = try $ do
|
|
pSatisfy (~== TagOpen tagtype [])
|
|
mconcat <$> manyTill parser (pCloses tagtype <|> eof)
|
|
|
|
-- parses p, preceeded by an optional opening tag
|
|
-- and followed by an optional closing tags
|
|
pOptInTag :: String -> TagParser a -> TagParser a
|
|
pOptInTag tagtype p = try $ do
|
|
skipMany pBlank
|
|
optional $ pSatisfy (~== TagOpen tagtype [])
|
|
skipMany pBlank
|
|
x <- p
|
|
skipMany pBlank
|
|
optional $ pSatisfy (~== TagClose tagtype)
|
|
skipMany pBlank
|
|
return x
|
|
|
|
pCloses :: String -> TagParser ()
|
|
pCloses tagtype = try $ do
|
|
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
|
|
case t of
|
|
(TagClose t') | t' == tagtype -> pAnyTag >> return ()
|
|
(TagOpen t' _) | t' `closes` tagtype -> return ()
|
|
(TagClose "ul") | tagtype == "li" -> return ()
|
|
(TagClose "ol") | tagtype == "li" -> return ()
|
|
(TagClose "dl") | tagtype == "li" -> return ()
|
|
(TagClose "table") | tagtype == "td" -> return ()
|
|
(TagClose "table") | tagtype == "tr" -> return ()
|
|
_ -> mzero
|
|
|
|
pTagText :: TagParser Inlines
|
|
pTagText = try $ do
|
|
(TagText str) <- pSatisfy isTagText
|
|
st <- getState
|
|
qu <- ask
|
|
case flip runReader qu $ runParserT (many pTagContents) st "text" str of
|
|
Left _ -> fail $ "Could not parse `" ++ str ++ "'"
|
|
Right result -> return $ mconcat result
|
|
|
|
pBlank :: TagParser ()
|
|
pBlank = try $ do
|
|
(TagText str) <- pSatisfy isTagText
|
|
guard $ all isSpace str
|
|
|
|
type InlinesParser = HTMLParser String
|
|
|
|
pTagContents :: InlinesParser Inlines
|
|
pTagContents =
|
|
B.displayMath <$> mathDisplay
|
|
<|> B.math <$> mathInline
|
|
<|> pStr
|
|
<|> pSpace
|
|
<|> smartPunctuation pTagContents
|
|
<|> pSymbol
|
|
<|> pBad
|
|
|
|
pStr :: InlinesParser Inlines
|
|
pStr = do
|
|
result <- many1 $ satisfy $ \c ->
|
|
not (isSpace c) && not (isSpecial c) && not (isBad c)
|
|
updateLastStrPos
|
|
return $ B.str result
|
|
|
|
isSpecial :: Char -> Bool
|
|
isSpecial '"' = True
|
|
isSpecial '\'' = True
|
|
isSpecial '.' = True
|
|
isSpecial '-' = True
|
|
isSpecial '$' = True
|
|
isSpecial '\8216' = True
|
|
isSpecial '\8217' = True
|
|
isSpecial '\8220' = True
|
|
isSpecial '\8221' = True
|
|
isSpecial _ = False
|
|
|
|
pSymbol :: InlinesParser Inlines
|
|
pSymbol = satisfy isSpecial >>= return . B.str . (:[])
|
|
|
|
isBad :: Char -> Bool
|
|
isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
|
|
|
|
pBad :: InlinesParser Inlines
|
|
pBad = do
|
|
c <- satisfy isBad
|
|
let c' = case c of
|
|
'\128' -> '\8364'
|
|
'\130' -> '\8218'
|
|
'\131' -> '\402'
|
|
'\132' -> '\8222'
|
|
'\133' -> '\8230'
|
|
'\134' -> '\8224'
|
|
'\135' -> '\8225'
|
|
'\136' -> '\710'
|
|
'\137' -> '\8240'
|
|
'\138' -> '\352'
|
|
'\139' -> '\8249'
|
|
'\140' -> '\338'
|
|
'\142' -> '\381'
|
|
'\145' -> '\8216'
|
|
'\146' -> '\8217'
|
|
'\147' -> '\8220'
|
|
'\148' -> '\8221'
|
|
'\149' -> '\8226'
|
|
'\150' -> '\8211'
|
|
'\151' -> '\8212'
|
|
'\152' -> '\732'
|
|
'\153' -> '\8482'
|
|
'\154' -> '\353'
|
|
'\155' -> '\8250'
|
|
'\156' -> '\339'
|
|
'\158' -> '\382'
|
|
'\159' -> '\376'
|
|
_ -> '?'
|
|
return $ B.str [c']
|
|
|
|
pSpace :: InlinesParser Inlines
|
|
pSpace = many1 (satisfy isSpace) >> return B.space
|
|
|
|
--
|
|
-- Constants
|
|
--
|
|
|
|
eitherBlockOrInline :: [String]
|
|
eitherBlockOrInline = ["audio", "applet", "button", "iframe", "embed",
|
|
"del", "ins",
|
|
"progress", "map", "area", "noscript", "script",
|
|
"object", "svg", "video", "source"]
|
|
|
|
{-
|
|
inlineHtmlTags :: [[Char]]
|
|
inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
|
|
"br", "cite", "code", "dfn", "em", "font", "i", "img",
|
|
"input", "kbd", "label", "q", "s", "samp", "select",
|
|
"small", "span", "strike", "strong", "sub", "sup",
|
|
"textarea", "tt", "u", "var"]
|
|
-}
|
|
|
|
blockHtmlTags :: [String]
|
|
blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside",
|
|
"blockquote", "body", "button", "canvas",
|
|
"caption", "center", "col", "colgroup", "dd", "dir", "div",
|
|
"dl", "dt", "fieldset", "figcaption", "figure",
|
|
"footer", "form", "h1", "h2", "h3", "h4",
|
|
"h5", "h6", "head", "header", "hgroup", "hr", "html",
|
|
"isindex", "menu", "noframes", "ol", "output", "p", "pre",
|
|
"section", "table", "tbody", "textarea",
|
|
"thead", "tfoot", "ul", "dd",
|
|
"dt", "frameset", "li", "tbody", "td", "tfoot",
|
|
"th", "thead", "tr", "script", "style"]
|
|
|
|
-- We want to allow raw docbook in markdown documents, so we
|
|
-- include docbook block tags here too.
|
|
blockDocBookTags :: [String]
|
|
blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist",
|
|
"orderedlist", "segmentedlist", "simplelist",
|
|
"variablelist", "caution", "important", "note", "tip",
|
|
"warning", "address", "literallayout", "programlisting",
|
|
"programlistingco", "screen", "screenco", "screenshot",
|
|
"synopsis", "example", "informalexample", "figure",
|
|
"informalfigure", "table", "informaltable", "para",
|
|
"simpara", "formalpara", "equation", "informalequation",
|
|
"figure", "screenshot", "mediaobject", "qandaset",
|
|
"procedure", "task", "cmdsynopsis", "funcsynopsis",
|
|
"classsynopsis", "blockquote", "epigraph", "msgset",
|
|
"sidebar", "title"]
|
|
|
|
epubTags :: [String]
|
|
epubTags = ["case", "switch", "default"]
|
|
|
|
blockTags :: [String]
|
|
blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags
|
|
|
|
isInlineTag :: Tag String -> Bool
|
|
isInlineTag t = tagOpen isInlineTagName (const True) t ||
|
|
tagClose isInlineTagName t ||
|
|
tagComment (const True) t
|
|
where isInlineTagName x = x `notElem` blockTags
|
|
|
|
isBlockTag :: Tag String -> Bool
|
|
isBlockTag t = tagOpen isBlockTagName (const True) t ||
|
|
tagClose isBlockTagName t ||
|
|
tagComment (const True) t
|
|
where isBlockTagName ('?':_) = True
|
|
isBlockTagName ('!':_) = True
|
|
isBlockTagName x = x `elem` blockTags
|
|
|| x `elem` eitherBlockOrInline
|
|
|
|
isTextTag :: Tag String -> Bool
|
|
isTextTag = tagText (const True)
|
|
|
|
isCommentTag :: Tag String -> Bool
|
|
isCommentTag = tagComment (const True)
|
|
|
|
-- taken from HXT and extended
|
|
-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
|
|
closes :: String -> String -> Bool
|
|
_ `closes` "body" = False
|
|
_ `closes` "html" = False
|
|
"body" `closes` "head" = True
|
|
"a" `closes` "a" = True
|
|
"li" `closes` "li" = True
|
|
"th" `closes` t | t `elem` ["th","td"] = True
|
|
"tr" `closes` t | t `elem` ["th","td","tr"] = True
|
|
"dd" `closes` t | t `elem` ["dt", "dd"] = True
|
|
"dt" `closes` t | t `elem` ["dt","dd"] = True
|
|
"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
|
|
"optgroup" `closes` "optgroup" = True
|
|
"optgroup" `closes` "option" = True
|
|
"option" `closes` "option" = True
|
|
-- http://www.w3.org/TR/html-markup/p.html
|
|
x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
|
|
"dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
|
|
"h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section",
|
|
"table", "ul"] = True
|
|
"meta" `closes` "meta" = True
|
|
"form" `closes` "form" = True
|
|
"label" `closes` "label" = True
|
|
"map" `closes` "map" = True
|
|
"object" `closes` "object" = True
|
|
_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
|
|
t `closes` "select" | t /= "option" = True
|
|
"thead" `closes` t | t `elem` ["colgroup"] = True
|
|
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
|
|
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
|
|
t `closes` t2 |
|
|
t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] &&
|
|
t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div"
|
|
t1 `closes` t2 |
|
|
t1 `elem` blockTags &&
|
|
t2 `notElem` (blockTags ++ eitherBlockOrInline) = True
|
|
_ `closes` _ = False
|
|
|
|
--- parsers for use in markdown, textile readers
|
|
|
|
-- | Matches a stretch of HTML in balanced tags.
|
|
htmlInBalanced :: (Monad m)
|
|
=> (Tag String -> Bool)
|
|
-> ParserT String st m String
|
|
htmlInBalanced f = try $ do
|
|
(TagOpen t _, tag) <- htmlTag f
|
|
guard $ not $ "/>" `isSuffixOf` tag -- not a self-closing tag
|
|
let stopper = htmlTag (~== TagClose t)
|
|
let anytag = snd <$> htmlTag (const True)
|
|
contents <- many $ notFollowedBy' stopper >>
|
|
(htmlInBalanced f <|> anytag <|> count 1 anyChar)
|
|
endtag <- liftM snd stopper
|
|
return $ tag ++ concat contents ++ endtag
|
|
|
|
-- | Matches a tag meeting a certain condition.
|
|
htmlTag :: Monad m
|
|
=> (Tag String -> Bool)
|
|
-> ParserT [Char] st m (Tag String, String)
|
|
htmlTag f = try $ do
|
|
lookAhead (char '<')
|
|
inp <- getInput
|
|
let hasTagWarning (TagWarning _:_) = True
|
|
hasTagWarning _ = False
|
|
let (next : rest) = canonicalizeTags $ parseTagsOptions
|
|
parseOptions{ optTagWarning = True } inp
|
|
guard $ f next
|
|
case next of
|
|
TagComment s
|
|
| "<!--" `isPrefixOf` inp -> do
|
|
count (length s + 4) anyChar
|
|
skipMany (satisfy (/='>'))
|
|
char '>'
|
|
return (next, "<!--" ++ s ++ "-->")
|
|
| otherwise -> fail "bogus comment mode, HTML5 parse error"
|
|
_ -> do
|
|
-- we get a TagWarning on things like
|
|
-- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
|
|
-- which should NOT be parsed as an HTML tag, see #2277
|
|
guard $ not $ hasTagWarning rest
|
|
rendered <- manyTill anyChar (char '>')
|
|
return (next, rendered ++ ">")
|
|
|
|
mkAttr :: [(String, String)] -> Attr
|
|
mkAttr attr = (attribsId, attribsClasses, attribsKV)
|
|
where attribsId = fromMaybe "" $ lookup "id" attr
|
|
attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes
|
|
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
|
|
epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
|
|
|
|
-- Strip namespace prefixes
|
|
stripPrefixes :: [Tag String] -> [Tag String]
|
|
stripPrefixes = map stripPrefix
|
|
|
|
stripPrefix :: Tag String -> Tag String
|
|
stripPrefix (TagOpen s as) =
|
|
TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as)
|
|
stripPrefix (TagClose s) = TagClose (stripPrefix' s)
|
|
stripPrefix x = x
|
|
|
|
stripPrefix' :: String -> String
|
|
stripPrefix' s =
|
|
case span (/= ':') s of
|
|
(_, "") -> s
|
|
(_, (_:ts)) -> ts
|
|
|
|
isSpace :: Char -> Bool
|
|
isSpace ' ' = True
|
|
isSpace '\t' = True
|
|
isSpace '\n' = True
|
|
isSpace '\r' = True
|
|
isSpace _ = False
|
|
|
|
-- Instances
|
|
|
|
instance HasIdentifierList HTMLState where
|
|
extractIdentifierList = identifiers
|
|
updateIdentifierList f s = s{ identifiers = f (identifiers s) }
|
|
|
|
instance HasHeaderMap HTMLState where
|
|
extractHeaderMap = headerMap
|
|
updateHeaderMap f s = s{ headerMap = f (headerMap s) }
|
|
|
|
-- This signature should be more general
|
|
-- MonadReader HTMLLocal m => HasQuoteContext st m
|
|
instance HasQuoteContext st (Reader HTMLLocal) where
|
|
getQuoteContext = asks quoteContext
|
|
withQuoteContext q = local (\s -> s{quoteContext = q})
|
|
|
|
instance HasReaderOptions HTMLState where
|
|
extractReaderOptions = extractReaderOptions . parserState
|
|
|
|
instance HasMeta HTMLState where
|
|
setMeta s b st = st {parserState = setMeta s b $ parserState st}
|
|
deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
|
|
|
|
instance Default HTMLLocal where
|
|
def = HTMLLocal NoQuote False False
|
|
|
|
instance HasLastStrPosition HTMLState where
|
|
setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
|
|
getLastStrPos = getLastStrPos . parserState
|
|
|
|
|
|
-- EPUB Specific
|
|
--
|
|
--
|
|
sectioningContent :: [String]
|
|
sectioningContent = ["article", "aside", "nav", "section"]
|
|
|
|
|
|
groupingContent :: [String]
|
|
groupingContent = ["p", "hr", "pre", "blockquote", "ol"
|
|
, "ul", "li", "dl", "dt", "dt", "dd"
|
|
, "figure", "figcaption", "div", "main"]
|
|
|
|
|
|
{-
|
|
|
|
types :: [(String, ([String], Int))]
|
|
types = -- Document divisions
|
|
map (\s -> (s, (["section", "body"], 0)))
|
|
["volume", "part", "chapter", "division"]
|
|
++ -- Document section and components
|
|
[
|
|
("abstract", ([], 0))]
|
|
-}
|