New HTML reader using tagsoup as a lexer.

* The new reader is faster and more accurate.

* API changes for Text.Pandoc.Readers.HTML:
   - removed rawHtmlBlock, anyHtmlBlockTag, anyHtmlInlineTag,
     anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType,
     htmlBlockElement, htmlComment
   - added htmlTag, htmlInBalanced, isInlineTag, isBlockTag, isTextTag

* tagsoup is a new dependency.

* Text.Pandoc.Parsing: Generalized type on readWith.

* Benchmark.hs: Added length calculation to force full evaluation.

* Updated HTML reader tests.

* Updated markdown and textile readers to use the functions from
  the HTML reader.

* Note: The markdown reader now correctly handles some cases it did not
  before. For example:

    <hr/>

  is reproduced without adding a space.

    <script>
      a = '<b>';
    </script>

  is parsed correctly.
This commit is contained in:
John MacFarlane 2010-12-22 20:25:15 -08:00
parent 220fe5fab8
commit 904050fa36
8 changed files with 441 additions and 640 deletions

View file

@ -13,8 +13,11 @@ readerBench doc (name, reader) =
inp = writer defaultWriterOptions{ writerWrapText = True
, writerLiterateHaskell =
"+lhs" `isSuffixOf` name } doc
in bench (name ++ " reader") $ whnf
(reader defaultParserState{stateSmart = True
-- we compute the length to force full evaluation
getLength (Pandoc (Meta a b c) d) =
length a + length b + length c + length d
in bench (name ++ " reader") $ whnf (getLength .
reader defaultParserState{ stateSmart = True
, stateStandalone = True
, stateLiterateHaskell =
"+lhs" `isSuffixOf` name }) inp

View file

@ -176,7 +176,8 @@ Library
citeproc-hs >= 0.3 && < 0.4,
pandoc-types == 1.7.*,
json >= 0.4 && < 0.5,
dlist >= 0.4 && < 0.6
dlist >= 0.4 && < 0.6,
tagsoup >= 0.12 && < 0.13
if impl(ghc >= 6.10)
Build-depends: base >= 4 && < 5, syb
else
@ -249,7 +250,8 @@ Executable pandoc
citeproc-hs >= 0.3 && < 0.4,
pandoc-types == 1.7.*,
json >= 0.4 && < 0.5,
dlist >= 0.4 && < 0.6
dlist >= 0.4 && < 0.6,
tagsoup >= 0.12 && < 0.13
if impl(ghc >= 6.10)
Build-depends: base >= 4 && < 5, syb
else

View file

@ -287,7 +287,7 @@ nullBlock :: GenParser Char st Block
nullBlock = anyChar >> return Null
-- | Fail if reader is in strict markdown syntax mode.
failIfStrict :: GenParser Char ParserState ()
failIfStrict :: GenParser a ParserState ()
failIfStrict = do
state <- getState
if stateStrict state then fail "strict mode" else return ()
@ -567,9 +567,9 @@ gridTableFooter = blanklines
---
-- | Parse a string with a given parser and state.
readWith :: GenParser Char ParserState a -- ^ parser
readWith :: GenParser t ParserState a -- ^ parser
-> ParserState -- ^ initial state
-> String -- ^ input string
-> [t] -- ^ input
-> a
readWith parser state input =
case runParser parser state "source" input of

View file

@ -27,36 +27,355 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Conversion of HTML to 'Pandoc' document.
-}
module Text.Pandoc.Readers.HTML (
readHtml,
rawHtmlInline,
rawHtmlBlock,
htmlTag,
anyHtmlBlockTag,
anyHtmlInlineTag,
anyHtmlTag,
anyHtmlEndTag,
htmlEndTag,
extractTagType,
htmlBlockElement,
htmlComment,
module Text.Pandoc.Readers.HTML ( readHtml
, htmlTag
, htmlInBalanced
, isInlineTag
, isBlockTag
, isTextTag
, isCommentTag
) where
import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Parsec.Pos
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
import Text.Pandoc.Builder (text, toList)
import Text.Pandoc.Shared
import Text.Pandoc.Parsing
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
import Data.Maybe ( fromMaybe )
import Data.List ( isPrefixOf, isSuffixOf, intercalate )
import Data.Char ( toLower, isAlphaNum )
import Control.Monad ( liftM, when )
import Data.Maybe ( fromMaybe, isJust )
import Data.List ( intercalate )
import Data.Char ( isSpace, isDigit )
import Control.Monad ( liftM, guard )
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ParserState -- ^ Parser state
-> String -- ^ String to parse (assumes @'\n'@ line endings)
-> Pandoc
readHtml = readWith parseHtml
readHtml st inp = Pandoc meta blocks
where blocks = readWith parseBody st body
tags = canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
hasHeader = any (~== TagOpen "head" []) tags
(meta, rest) = if hasHeader
then parseHeader tags
else (Meta [] [] [], tags)
body = filter (\t -> not $
tagOpen (`elem` ["html","head","body"]) (const True) t ||
tagClose (`elem` ["html","head","body"]) t) rest
type TagParser = GenParser (Tag String) ParserState
parseHeader :: [Tag String] -> (Meta, [Tag String])
parseHeader tags = (Meta{docTitle = tit'', docAuthors = [], docDate = []}, rest)
where (tit,r) = break (~== TagClose "title") $ drop 1 $
dropWhile (\t -> not $ t ~== TagOpen "title" []) tags
tit' = concatMap fromTagText $ filter isTagText tit
tit'' = normalizeSpaces $ toList $ text tit'
rest = drop 1 $ dropWhile (\t -> not $ t ~== TagClose "head") r
parseBody :: TagParser [Block]
parseBody = liftM concat $ manyTill block eof
block :: TagParser [Block]
block = optional pLocation >>
choice [
pPara
, pHeader
, pBlockQuote
, pCodeBlock
, pList
, pHrule
, pPlain
, pRawHtmlBlock
]
renderTags' :: [Tag String] -> String
renderTags' = renderTagsOptions
renderOptions{ optMinimize = (`elem` ["hr","br","img"]) }
pList :: TagParser [Block]
pList = pBulletList <|> pOrderedList <|> pDefinitionList
pBulletList :: TagParser [Block]
pBulletList = try $ do
pSatisfy (~== TagOpen "ul" [])
let nonItem = pSatisfy (\t ->
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
not (t ~== TagClose "ul"))
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul")
return [BulletList items]
pOrderedList :: TagParser [Block]
pOrderedList = try $ do
TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
st <- getState
let (start, style) = if stateStrict st
then (1, DefaultStyle)
else (sta', sty')
where sta = fromMaybe "1" $
lookup "start" attribs
sta' = if all isDigit sta
then read sta
else 1
sty = fromMaybe (fromMaybe "" $
lookup "style" attribs) $
lookup "class" attribs
sty' = case sty of
"lower-roman" -> LowerRoman
"upper-roman" -> UpperRoman
"lower-alpha" -> LowerAlpha
"upper-alpha" -> UpperAlpha
"decimal" -> Decimal
_ -> DefaultStyle
let nonItem = pSatisfy (\t ->
not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
not (t ~== TagClose "ol"))
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol")
return [OrderedList (start, style, DefaultDelim) items]
pDefinitionList :: TagParser [Block]
pDefinitionList = try $ do
pSatisfy (~== TagOpen "dl" [])
items <- manyTill pDefListItem (pCloses "dl")
return [DefinitionList items]
pDefListItem :: TagParser ([Inline],[[Block]])
pDefListItem = try $ do
let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) &&
not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl"))
terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
skipMany nonItem
let term = intercalate [LineBreak] terms
return (term, defs)
pRawHtmlBlock :: TagParser [Block]
pRawHtmlBlock = do
raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|>
liftM (renderTags' . (:[])) pAnyTag
state <- getState
if stateParseRaw state
then return [RawHtml raw]
else return []
pHtmlBlock :: String -> TagParser String
pHtmlBlock t = try $ do
open <- pSatisfy (~== TagOpen t [])
contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
return $ renderTags' $ [open] ++ contents ++ [TagClose t]
pHeader :: TagParser [Block]
pHeader = try $ do
TagOpen tagtype attr <- pSatisfy $
tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
(const True)
let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
let level = read (drop 1 tagtype)
contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof)
return $ if bodyTitle
then [] -- skip a representation of the title in the body
else [Header level $ normalizeSpaces contents]
pHrule :: TagParser [Block]
pHrule = do
pSelfClosing (=="hr") (const True)
return [HorizontalRule]
pBlockQuote :: TagParser [Block]
pBlockQuote = do
contents <- pInTags "blockquote" block
return [BlockQuote contents]
pPlain :: TagParser [Block]
pPlain = do
contents <- liftM (normalizeSpaces . concat) $ many1 inline
if null contents
then return []
else return [Plain contents]
pPara :: TagParser [Block]
pPara = do
contents <- pInTags "p" inline
return [Para $ normalizeSpaces contents]
pCodeBlock :: TagParser [Block]
pCodeBlock = try $ do
TagOpen _ attr <- pSatisfy (~== TagOpen "pre" [])
contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
let rawText = concatMap fromTagText $ filter isTagText contents
-- drop leading newline if any
let result' = case rawText of
'\n':xs -> xs
_ -> rawText
-- drop trailing newline if any
let result = case reverse result' of
'\n':_ -> init result'
_ -> result'
let attribsId = fromMaybe "" $ lookup "id" attr
let attribsClasses = words $ fromMaybe "" $ lookup "class" attr
let attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
st <- getState
let attribs = if stateStrict st
then ("",[],[])
else (attribsId, attribsClasses, attribsKV)
return [CodeBlock attribs result]
inline :: TagParser [Inline]
inline = choice [
pLocation
, pTagText
, pEmph
, pStrong
, pSuperscript
, pSubscript
, pStrikeout
, pLineBreak
, pLink
, pImage
, pCode
, pRawHtmlInline
]
pLocation :: TagParser [a]
pLocation = do
(TagPosition r c) <- pSatisfy isTagPosition
setPosition $ newPos "input" r c
return []
pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String)
pSatisfy f = do
pos <- getPosition
token show (const pos) (\x -> if f x then Just x else Nothing)
pAnyTag :: TagParser (Tag String)
pAnyTag = pSatisfy (const True)
pSelfClosing :: (String -> Bool) -> ([Attribute String] -> Bool)
-> TagParser (Tag String)
pSelfClosing f g = do
open <- pSatisfy (tagOpen f g)
optional $ try $ pLocation >> pSatisfy (tagClose f)
return open
pEmph :: TagParser [Inline]
pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph
pStrong :: TagParser [Inline]
pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong
pSuperscript :: TagParser [Inline]
pSuperscript = failIfStrict >> pInlinesInTags "sup" Superscript
pSubscript :: TagParser [Inline]
pSubscript = failIfStrict >> pInlinesInTags "sub" Subscript
pStrikeout :: TagParser [Inline]
pStrikeout = do
failIfStrict
pInlinesInTags "s" Strikeout <|>
pInlinesInTags "strike" Strikeout <|>
pInlinesInTags "del" Strikeout <|>
do pSatisfy (~== TagOpen "span" [("class","strikeout")])
contents <- liftM concat $ manyTill inline (pCloses "span")
return [Strikeout contents]
pLineBreak :: TagParser [Inline]
pLineBreak = do
pSelfClosing (=="br") (const True)
return [LineBreak]
pLink :: TagParser [Inline]
pLink = do
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
let url = fromAttrib "href" tag
let title = fromAttrib "title" tag
lab <- liftM concat $ manyTill inline (pCloses "a")
return [Link (normalizeSpaces lab) (escapeURI url, title)]
pImage :: TagParser [Inline]
pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
let url = fromAttrib "src" tag
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
return [Image (toList $ text alt) (escapeURI url, title)]
pCode :: TagParser [Inline]
pCode = do
(TagOpen open _) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
result <- manyTill pAnyTag (pCloses open)
return [Code $ intercalate " " $ lines $ innerText result]
pRawHtmlInline :: TagParser [Inline]
pRawHtmlInline = do
result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag
state <- getState
if stateParseRaw state
then return [HtmlInline $ renderTags' [result]]
else return []
pInlinesInTags :: String -> ([Inline] -> Inline)
-> TagParser [Inline]
pInlinesInTags tagtype f = do
contents <- pInTags tagtype inline
return [f contents]
pInTags :: String -> TagParser [a]
-> TagParser [a]
pInTags tagtype parser = try $ do
pSatisfy (~== TagOpen tagtype [])
liftM concat $ manyTill parser (pCloses tagtype <|> eof)
pCloses :: String -> TagParser ()
pCloses tagtype = try $ do
optional pLocation
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
case t of
(TagClose t') | t' == tagtype -> pAnyTag >> return ()
(TagOpen t' _) | t' `closes` tagtype -> return ()
(TagClose "ul") | tagtype == "li" -> return ()
(TagClose "ol") | tagtype == "li" -> return ()
(TagClose "dl") | tagtype == "li" -> return ()
_ -> pzero
pTagText :: TagParser [Inline]
pTagText = do
(TagText str) <- pSatisfy isTagText
st <- getState
case runParser (many pTagContents) st "text" str of
Left _ -> fail $ "Could not parse `" ++ str ++ "'"
Right result -> return result
pTagContents :: GenParser Char ParserState Inline
pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol
pStr :: GenParser Char ParserState Inline
pStr = many1 (satisfy (\c -> not (isSpace c) && not (isSpecial c))) >>= return . Str
isSpecial :: Char -> Bool
isSpecial '"' = True
isSpecial '\'' = True
isSpecial '.' = True
isSpecial '-' = True
isSpecial '\8216' = True
isSpecial '\8217' = True
isSpecial '\8220' = True
isSpecial '\8221' = True
isSpecial _ = False
pSymbol :: GenParser Char ParserState Inline
pSymbol = satisfy isSpecial >>= return . Str . (:[])
pSpace :: GenParser Char ParserState Inline
pSpace = many1 (satisfy isSpace) >> return Space
--
-- Constants
@ -83,10 +402,26 @@ blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr", "script", "style"]
isInlineTag :: Tag String -> Bool
isInlineTag t = tagOpen (`notElem` blockHtmlTags) (const True) t ||
tagClose (`notElem` blockHtmlTags) t ||
tagComment (const True) t
isBlockTag :: Tag String -> Bool
isBlockTag t = tagOpen (`elem` blocktags) (const True) t ||
tagClose (`elem` blocktags) t ||
tagComment (const True) t
where blocktags = blockHtmlTags ++ eitherBlockOrInline
isTextTag :: Tag String -> Bool
isTextTag = tagText (const True)
isCommentTag :: Tag String -> Bool
isCommentTag = tagComment (const True)
-- taken from HXT and extended
closes :: String -> String -> Bool
"EOF" `closes` _ = True
_ `closes` "body" = False
_ `closes` "html" = False
"a" `closes` "a" = True
@ -117,565 +452,27 @@ t1 `closes` t2 |
t2 `notElem` (blockHtmlTags ++ eitherBlockOrInline) = True
_ `closes` _ = False
--
-- HTML utility functions
--
-- | Read blocks until end tag.
blocksTilEnd :: String -> GenParser Char ParserState [Block]
blocksTilEnd tag = do
blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
return $ filter (/= Null) blocks
-- | Read inlines until end tag.
inlinesTilEnd :: String -> GenParser Char ParserState [Inline]
inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
-- | Parse blocks between open and close tag.
blocksIn :: String -> GenParser Char ParserState [Block]
blocksIn tag = try $ htmlOpenTag tag >> spaces >> blocksTilEnd tag
-- | Parse inlines between open and close tag.
inlinesIn :: String -> GenParser Char ParserState [Inline]
inlinesIn tag = try $ htmlOpenTag tag >> spaces >> inlinesTilEnd tag
-- | Extract type from a tag: e.g. @br@ from @\<br\>@
extractTagType :: String -> String
extractTagType ('<':rest) =
let isSpaceOrSlash c = c `elem` "/ \n\t" in
map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
extractTagType _ = ""
-- Parse any HTML tag (opening or self-closing) and return tag type
anyOpener :: GenParser Char ParserState [Char]
anyOpener = try $ do
char '<'
spaces
tag <- many1 alphaNum
skipMany htmlAttribute
spaces
option "" (string "/")
spaces
char '>'
return $ map toLower tag
-- | Parse any HTML tag (opening or self-closing) and return text of tag
anyHtmlTag :: GenParser Char ParserState [Char]
anyHtmlTag = try $ do
char '<'
spaces
first <- letter
rest <- many (alphaNum <|> char ':')
let tag = first : rest
attribs <- many htmlAttribute
spaces
ender <- option "" (string "/")
let ender' = if null ender then "" else " /"
spaces
char '>'
let result = "<" ++ tag ++
concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
return result
anyHtmlEndTag :: GenParser Char ParserState [Char]
anyHtmlEndTag = try $ do
char '<'
spaces
char '/'
spaces
first <- letter
rest <- many (alphaNum <|> char ':')
let tag = first : rest
spaces
char '>'
let result = "</" ++ tag ++ ">"
return result
htmlTag :: Bool
-> String
-> GenParser Char ParserState (String, [(String, String)])
htmlTag selfClosing tag = try $ do
char '<'
spaces
stringAnyCase tag
attribs <- many htmlAttribute
spaces
-- note: we want to handle both HTML and XHTML,
-- so we don't require the /
when selfClosing $ optional $ char '/' >> spaces
char '>'
return (tag, (map (\(name, content, _) -> (name, content)) attribs))
htmlOpenTag :: String
-> GenParser Char ParserState (String, [(String, String)])
htmlOpenTag = htmlTag False
htmlCloseTag :: String
-> GenParser Char ParserState (String, [(String, String)])
htmlCloseTag = htmlTag False . ('/':)
htmlSelfClosingTag :: String
-> GenParser Char ParserState (String, [(String, String)])
htmlSelfClosingTag = htmlTag True
-- parses a quoted html attribute value
quoted :: Char -> GenParser Char st (String, String)
quoted quoteChar = do
result <- between (char quoteChar) (char quoteChar)
(many (noneOf [quoteChar]))
return (result, [quoteChar])
htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char])
htmlAttribute = do
attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
return attr
-- minimized boolean attribute
htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char])
htmlMinimizedAttribute = try $ do
many1 space
name <- many1 (choice [letter, oneOf ".-_:"])
return (name, name, name)
htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char])
htmlRegularAttribute = try $ do
many1 space
name <- many1 (choice [letter, oneOf ".-_:"])
spaces
char '='
spaces
(content, quoteStr) <- choice [ (quoted '\''),
(quoted '"'),
(do
a <- many (noneOf " \t\n\r\"'<>")
return (a,"")) ]
return (name, content,
(name ++ "=" ++ quoteStr ++ content ++ quoteStr))
-- | Parse an end tag of type 'tag'
htmlEndTag :: [Char] -> GenParser Char ParserState [Char]
htmlEndTag tag = try $ do
closedByNext <- lookAhead $ option False $ liftM (`closes` tag) $
anyOpener <|> (eof >> return "EOF")
if closedByNext
then return ""
else do char '<'
spaces
char '/'
spaces
stringAnyCase tag
spaces
char '>'
return $ "</" ++ tag ++ ">"
-- | Returns @True@ if the tag is (or can be) a block tag.
isBlock :: String -> Bool
isBlock tag = (extractTagType tag) `elem` (blockHtmlTags ++ eitherBlockOrInline)
anyHtmlBlockTag :: GenParser Char ParserState [Char]
anyHtmlBlockTag = try $ do
tag <- anyHtmlTag <|> anyHtmlEndTag
if isBlock tag then return tag else fail "not a block tag"
anyHtmlInlineTag :: GenParser Char ParserState [Char]
anyHtmlInlineTag = try $ do
tag <- anyHtmlTag <|> anyHtmlEndTag
if not (isBlock tag) then return tag else fail "not an inline tag"
-- | Parses material between script tags.
-- Scripts must be treated differently, because they can contain '<>' etc.
htmlScript :: GenParser Char ParserState [Char]
htmlScript = try $ do
lookAhead $ htmlOpenTag "script"
open <- anyHtmlTag
rest <- manyTill anyChar (htmlEndTag "script")
return $ open ++ rest ++ "</script>"
-- | Parses material between style tags.
-- Style tags must be treated differently, because they can contain CSS
htmlStyle :: GenParser Char ParserState [Char]
htmlStyle = try $ do
lookAhead $ htmlOpenTag "style"
open <- anyHtmlTag
rest <- manyTill anyChar (htmlEndTag "style")
return $ open ++ rest ++ "</style>"
htmlBlockElement :: GenParser Char ParserState [Char]
htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
rawHtmlBlock :: GenParser Char ParserState Block
rawHtmlBlock = try $ do
body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag
state <- getState
if stateParseRaw state then return (RawHtml body) else return Null
-- This is a block whose contents should be passed through verbatim, not interpreted.
rawVerbatimBlock :: GenParser Char ParserState [Char]
rawVerbatimBlock = try $ do
start <- anyHtmlBlockTag
let tagtype = extractTagType start
if tagtype `elem` ["pre"]
then do
contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar)
end <- htmlEndTag tagtype
return $ start ++ contents ++ end
else fail "Not a verbatim block"
-- We don't want to parse </body> or </html> as raw HTML, since these
-- are handled in parseHtml.
rawHtmlBlock' :: GenParser Char ParserState Block
rawHtmlBlock' = do notFollowedBy' (htmlCloseTag "body" <|>
htmlCloseTag "html")
rawHtmlBlock
-- | Parses an HTML comment.
htmlComment :: GenParser Char st [Char]
htmlComment = try $ do
string "<!--"
comment <- many $ noneOf "-"
<|> try (char '-' >>~ notFollowedBy (try (char '-' >> char '>')))
string "-->"
return $ "<!--" ++ comment ++ "-->"
--
-- parsing documents
--
xmlDec :: GenParser Char st [Char]
xmlDec = try $ do
string "<?"
rest <- manyTill anyChar (char '>')
return $ "<?" ++ rest ++ ">"
definition :: GenParser Char st [Char]
definition = try $ do
string "<!"
rest <- manyTill anyChar (char '>')
return $ "<!" ++ rest ++ ">"
nonTitleNonHead :: GenParser Char ParserState Char
nonTitleNonHead = try $ do
notFollowedBy $ (htmlOpenTag "title" >> return ' ') <|>
(htmlEndTag "head" >> return ' ')
(rawHtmlBlock >> return ' ') <|> anyChar
parseTitle :: GenParser Char ParserState [Inline]
parseTitle = try $ do
(tag, _) <- htmlOpenTag "title"
contents <- inlinesTilEnd tag
spaces
return contents
-- parse header and return meta-information (for now, just title)
parseHead :: GenParser Char ParserState Meta
parseHead = try $ do
htmlOpenTag "head"
spaces
skipMany nonTitleNonHead
contents <- option [] parseTitle
skipMany nonTitleNonHead
htmlEndTag "head"
return $ Meta contents [] []
-- h1 class="title" representation of title in body
bodyTitle :: GenParser Char ParserState [Inline]
bodyTitle = try $ do
(_, attribs) <- htmlOpenTag "h1"
case (extractAttribute "class" attribs) of
Just "title" -> return ""
_ -> fail "not title"
inlinesTilEnd "h1"
endOfDoc :: GenParser Char ParserState ()
endOfDoc = try $ do
spaces
optional (htmlEndTag "body")
spaces
optional (htmlEndTag "html" >> many anyChar) -- ignore stuff after </html>
eof
parseHtml :: GenParser Char ParserState Pandoc
parseHtml = do
sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
spaces
optional $ htmlOpenTag "html"
spaces
meta <- option (Meta [] [] []) parseHead
spaces
optional $ htmlOpenTag "body"
spaces
optional bodyTitle -- skip title in body, because it's represented in meta
blocks <- parseBlocks
endOfDoc
return $ Pandoc meta blocks
--
-- parsing blocks
--
parseBlocks :: GenParser Char ParserState [Block]
parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
block :: GenParser Char ParserState Block
block = choice [ codeBlock
, header
, hrule
, list
, blockQuote
, para
, plain
, rawHtmlBlock'
, notFollowedBy' endOfDoc >> char '<' >> return Null
] <?> "block"
--
-- header blocks
--
header :: GenParser Char ParserState Block
header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
headerLevel :: Int -> GenParser Char ParserState Block
headerLevel n = try $ do
let level = "h" ++ show n
htmlOpenTag level
contents <- inlinesTilEnd level
return $ Header n (normalizeSpaces contents)
--
-- hrule block
--
hrule :: GenParser Char ParserState Block
hrule = try $ do
(_, attribs) <- htmlSelfClosingTag "hr"
state <- getState
if not (null attribs) && stateParseRaw state
then unexpected "attributes in hr" -- parse as raw in this case
else return HorizontalRule
--
-- code blocks
--
-- Note: HTML tags in code blocks (e.g. for syntax highlighting) are
-- skipped, because they are not portable to output formats other than HTML.
codeBlock :: GenParser Char ParserState Block
codeBlock = try $ do
htmlOpenTag "pre"
result <- manyTill
(many1 (satisfy (/= '<')) <|>
((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
(htmlEndTag "pre")
let result' = concat result
-- drop leading newline if any
let result'' = if "\n" `isPrefixOf` result'
then drop 1 result'
else result'
-- drop trailing newline if any
let result''' = if "\n" `isSuffixOf` result''
then init result''
else result''
return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result'''
--
-- block quotes
--
blockQuote :: GenParser Char ParserState Block
blockQuote = try $ htmlOpenTag "blockquote" >> spaces >>
blocksTilEnd "blockquote" >>= (return . BlockQuote)
--
-- list blocks
--
list :: GenParser Char ParserState Block
list = choice [ bulletList, orderedList, definitionList ] <?> "list"
orderedList :: GenParser Char ParserState Block
orderedList = try $ do
(_, attribs) <- htmlOpenTag "ol"
(start, style) <- option (1, DefaultStyle) $
do failIfStrict
let sta = fromMaybe "1" $
lookup "start" attribs
let sty = fromMaybe (fromMaybe "" $
lookup "style" attribs) $
lookup "class" attribs
let sty' = case sty of
"lower-roman" -> LowerRoman
"upper-roman" -> UpperRoman
"lower-alpha" -> LowerAlpha
"upper-alpha" -> UpperAlpha
"decimal" -> Decimal
_ -> DefaultStyle
return (read sta, sty')
spaces
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
items <- sepEndBy1 (blocksIn "li" <|> liftM (:[]) list) spaces
htmlEndTag "ol"
return $ OrderedList (start, style, DefaultDelim) items
bulletList :: GenParser Char ParserState Block
bulletList = try $ do
htmlOpenTag "ul"
spaces
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
items <- sepEndBy1 (blocksIn "li" <|> liftM (:[]) list) spaces
htmlEndTag "ul"
return $ BulletList items
definitionList :: GenParser Char ParserState Block
definitionList = try $ do
failIfStrict -- def lists not part of standard markdown
htmlOpenTag "dl"
spaces
items <- sepEndBy1 definitionListItem spaces
htmlEndTag "dl"
return $ DefinitionList items
definitionListItem :: GenParser Char ParserState ([Inline], [[Block]])
definitionListItem = try $ do
terms <- sepEndBy1 (inlinesIn "dt") spaces
defs <- sepEndBy1 (blocksIn "dd") spaces
let term = intercalate [LineBreak] terms
return (term, defs)
--
-- paragraph block
--
para :: GenParser Char ParserState Block
para = try $ htmlOpenTag "p" >> inlinesTilEnd "p" >>=
return . Para . normalizeSpaces
--
-- plain block
--
plain :: GenParser Char ParserState Block
plain = many1 inline >>= return . Plain . normalizeSpaces
--
-- inline
--
inline :: GenParser Char ParserState Inline
inline = choice [ str
, strong
, emph
, superscript
, subscript
, strikeout
, spanStrikeout
, code
, linebreak
, whitespace
, link
, image
, smartPunctuation inline
, charRef
, rawHtmlInline
, symbol
] <?> "inline"
code :: GenParser Char ParserState Inline
code = try $ do
result <- (htmlOpenTag "code" >> manyTill (noneOf "<>") (htmlEndTag "code"))
<|> (htmlOpenTag "tt" >> manyTill (noneOf "<>") (htmlEndTag "tt"))
-- remove internal line breaks, leading and trailing space,
-- and decode character references
return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $
intercalate " " $ lines result
rawHtmlInline :: GenParser Char ParserState Inline
rawHtmlInline = do
result <- anyHtmlInlineTag <|> htmlComment
state <- getState
if stateParseRaw state then return (HtmlInline result) else return (Str "")
symbol :: GenParser Char ParserState Inline
symbol = do
notFollowedBy (char '<')
c <- oneOf specialChars
return $ Str [c]
betweenTags :: [Char] -> GenParser Char ParserState [Inline]
betweenTags tag = try $ htmlOpenTag tag >> inlinesTilEnd tag >>=
return . normalizeSpaces
emph :: GenParser Char ParserState Inline
emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph
strong :: GenParser Char ParserState Inline
strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
superscript :: GenParser Char ParserState Inline
superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
subscript :: GenParser Char ParserState Inline
subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
strikeout :: GenParser Char ParserState Inline
strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
return . Strikeout
spanStrikeout :: GenParser Char ParserState Inline
spanStrikeout = try $ do
failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
(_, attributes) <- htmlOpenTag "span"
result <- case (extractAttribute "class" attributes) of
Just "strikeout" -> inlinesTilEnd "span"
_ -> fail "not a strikeout"
return $ Strikeout result
whitespace :: GenParser Char st Inline
whitespace = many1 space >> return Space
-- hard line break
linebreak :: GenParser Char ParserState Inline
linebreak = htmlSelfClosingTag "br" >> optional newline >> return LineBreak
str :: GenParser Char st Inline
str = many1 (noneOf $ specialChars ++ " \t\n") >>= return . Str
specialChars :: [Char]
specialChars = "<&-\"'.\8216\8217\8220\8221"
--
-- links and images
--
-- extract contents of attribute (attribute names are case-insensitive)
extractAttribute :: [Char] -> [([Char], String)] -> Maybe String
extractAttribute _ [] = Nothing
extractAttribute name ((attrName, contents):rest) =
let name' = map toLower name
attrName' = map toLower attrName
in if attrName' == name'
then Just (decodeCharacterReferences contents)
else extractAttribute name rest
link :: GenParser Char ParserState Inline
link = try $ do
(_, attributes) <- htmlOpenTag "a"
url <- case (extractAttribute "href" attributes) of
Just url -> return url
Nothing -> fail "no href"
let title = fromMaybe "" $ extractAttribute "title" attributes
lab <- inlinesTilEnd "a"
return $ Link (normalizeSpaces lab) (escapeURI url, title)
image :: GenParser Char ParserState Inline
image = try $ do
(_, attributes) <- htmlSelfClosingTag "img"
url <- case (extractAttribute "src" attributes) of
Just url -> return url
Nothing -> fail "no src"
let title = fromMaybe "" $ extractAttribute "title" attributes
let alt = fromMaybe "" (extractAttribute "alt" attributes)
return $ Image [Str alt] (escapeURI url, title)
--- parsers for use in markdown, textile readers
-- | Matches a stretch of HTML in balanced tags.
htmlInBalanced :: (Tag String -> Bool) -> GenParser Char ParserState String
htmlInBalanced f = try $ do
(TagOpen t _, tag) <- htmlTag f
guard $ '/' `notElem` tag -- not a self-closing tag
let nonTagChunk = many1 $ satisfy (/= '<')
let stopper = htmlTag (~== TagClose t)
let anytag = liftM snd $ htmlTag (const True)
contents <- many $ notFollowedBy' stopper >>
(nonTagChunk <|> htmlInBalanced (const True) <|> anytag)
endtag <- liftM snd stopper
return $ tag ++ concat contents ++ endtag
-- | Matches a tag meeting a certain condition.
htmlTag :: (Tag String -> Bool) -> GenParser Char ParserState (Tag String, String)
htmlTag f = try $ do
lookAhead (char '<')
(next : _) <- getInput >>= return . canonicalizeTags . parseTags
guard $ f next
-- advance the parser
rendered <- manyTill anyChar (char '>')
return (next, rendered ++ ">")

View file

@ -29,7 +29,7 @@ Conversion of markdown-formatted plain text to 'Pandoc' document.
-}
module Text.Pandoc.Readers.Markdown ( readMarkdown ) where
import Data.List ( transpose, isSuffixOf, sortBy, findIndex, intercalate )
import Data.List ( transpose, sortBy, findIndex, intercalate )
import qualified Data.Map as M
import Data.Ord ( comparing )
import Data.Char ( isAlphaNum )
@ -39,14 +39,14 @@ import Text.Pandoc.Generic
import Text.Pandoc.Shared
import Text.Pandoc.Parsing
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' )
import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
anyHtmlInlineTag, anyHtmlTag,
anyHtmlEndTag, htmlEndTag, extractTagType,
htmlBlockElement, htmlComment )
import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag,
isTextTag, isCommentTag )
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
import Text.ParserCombinators.Parsec
import Control.Monad (when, liftM, guard)
import Text.TeXMath.Macros (applyMacros, Macro, pMacroDefinition)
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match (tagOpen)
-- | Read markdown from an input string and return a Pandoc document.
readMarkdown :: ParserState -- ^ Parser state, including options for parser
@ -532,7 +532,7 @@ listLine = try $ do
notFollowedBy' (do indentSpaces
many (spaceChar)
listStart)
chunks <- manyTill (htmlComment <|> count 1 anyChar) newline
chunks <- manyTill (liftM snd (htmlTag isCommentTag) <|> count 1 anyChar) newline
return $ concat chunks ++ "\n"
-- parse raw text for one list item, excluding start marker and continuations
@ -676,7 +676,7 @@ plain = many1 inline >>~ spaces >>= return . Plain . normalizeSpaces
--
htmlElement :: GenParser Char ParserState [Char]
htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
htmlElement = strictHtmlBlock <|> liftM snd (htmlTag isBlockTag)
htmlBlock :: GenParser Char ParserState Block
htmlBlock = try $ do
@ -686,25 +686,23 @@ htmlBlock = try $ do
finalNewlines <- many newline
return $ RawHtml $ first ++ finalSpace ++ finalNewlines
-- True if tag is self-closing
isSelfClosing :: [Char] -> Bool
isSelfClosing tag =
isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag
strictHtmlBlock :: GenParser Char ParserState [Char]
strictHtmlBlock = try $ do
tag <- anyHtmlBlockTag
let tag' = extractTagType tag
if isSelfClosing tag || tag' == "hr"
then return tag
else do contents <- many (notFollowedBy' (htmlEndTag tag') >>
(htmlElement <|> (count 1 anyChar)))
end <- htmlEndTag tag'
return $ tag ++ concat contents ++ end
strictHtmlBlock = do
failUnlessBeginningOfLine
htmlInBalanced (not . isInlineTag)
rawVerbatimBlock :: GenParser Char ParserState String
rawVerbatimBlock = try $ do
(TagOpen tag _, open) <- htmlTag (tagOpen (\t ->
t == "pre" || t == "style" || t == "script")
(const True))
contents <- manyTill anyChar (htmlTag (~== TagClose tag))
return $ open ++ contents ++ renderTags [TagClose tag]
rawHtmlBlocks :: GenParser Char ParserState Block
rawHtmlBlocks = do
htmlBlocks <- many1 $ do (RawHtml blk) <- rawHtmlBlock
htmlBlocks <- many1 $ do blk <- rawVerbatimBlock <|>
liftM snd (htmlTag isBlockTag)
sps <- do sp1 <- many spaceChar
sp2 <- option "" (blankline >> return "\n")
sp3 <- many spaceChar
@ -921,7 +919,7 @@ inlineParsers = [ str
, subscript
, inlineNote -- after superscript because of ^[link](/foo)^
, autoLink
, rawHtmlInline'
, rawHtmlInline
, rawLaTeXInline'
, escapedChar
, exampleRef
@ -1221,12 +1219,12 @@ inBrackets parser = do
char ']'
return $ "[" ++ contents ++ "]"
rawHtmlInline' :: GenParser Char ParserState Inline
rawHtmlInline' = do
rawHtmlInline :: GenParser Char ParserState Inline
rawHtmlInline = do
st <- getState
result <- if stateStrict st
then choice [htmlBlockElement, anyHtmlTag, anyHtmlEndTag]
else choice [htmlComment, anyHtmlInlineTag]
(_,result) <- if stateStrict st
then htmlTag (not . isTextTag)
else htmlTag isInlineTag
return $ HtmlInline result
-- Citations
@ -1315,3 +1313,4 @@ citation = try $ do
, citationNoteNum = 0
, citationHash = 0
}

View file

@ -58,10 +58,9 @@ module Text.Pandoc.Readers.Textile ( readTextile) where
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.Parsing
import Text.Pandoc.Readers.HTML ( htmlTag, htmlEndTag, -- find code blocks
rawHtmlBlock, rawHtmlInline )
-- import Text.Pandoc.Readers.Markdown (smartPunctuation)
import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
import Text.ParserCombinators.Parsec
import Text.HTML.TagSoup.Match
import Data.Char ( digitToInt, isLetter )
import Control.Monad ( guard, liftM )
@ -127,7 +126,7 @@ blockParsers = [ codeBlock
, blockQuote
, hrule
, anyList
, rawHtmlBlock'
, rawHtmlBlock
, maybeExplicitBlock "table" table
, maybeExplicitBlock "p" para
, nullBlock ]
@ -139,8 +138,8 @@ block = choice blockParsers <?> "block"
-- | Code Blocks in Textile are between <pre> and </pre>
codeBlock :: GenParser Char ParserState Block
codeBlock = try $ do
htmlTag False "pre"
result' <- manyTill anyChar (try $ htmlEndTag "pre" >> blockBreak)
htmlTag (tagOpen (=="pre") null)
result' <- manyTill anyChar (try $ htmlTag (tagClose (=="pre")) >> blockBreak)
-- drop leading newline if any
let result'' = case result' of
'\n':xs -> xs
@ -261,21 +260,19 @@ definitionListItem = try $ do
-- this ++ "\n\n" does not look very good
ds <- parseFromString parseBlocks (s ++ "\n\n")
return [ds]
-- | This terminates a block such as a paragraph. Because of raw html
-- blocks support, we have to lookAhead for a rawHtmlBlock.
blockBreak :: GenParser Char ParserState ()
blockBreak = try $ choice
[newline >> blanklines >> return (),
lookAhead rawHtmlBlock' >> return ()]
blockBreak = try (newline >> blanklines >> return ()) <|>
(lookAhead rawHtmlBlock >> return ())
-- | A raw Html Block, optionally followed by blanklines
rawHtmlBlock' :: GenParser Char ParserState Block
rawHtmlBlock' = try $ do
b <- rawHtmlBlock
rawHtmlBlock :: GenParser Char ParserState Block
rawHtmlBlock = try $ do
(_,b) <- htmlTag isBlockTag
optional blanklines
return b
return $ RawHtml b
-- | In textile, paragraphs are separated by blank lines.
para :: GenParser Char ParserState Block
@ -450,6 +447,9 @@ endline = try $ do
newline >> notFollowedBy blankline
return LineBreak
rawHtmlInline :: GenParser Char ParserState Inline
rawHtmlInline = liftM (HtmlInline . snd) $ htmlTag isInlineTag
-- | Textile standard link syntax is label:"target"
link :: GenParser Char ParserState Inline
link = try $ do

View file

@ -269,8 +269,8 @@ Pandoc (Meta {docTitle = [Str "Pandoc",Space,Str "Test",Space,Str "Suite"], docA
, [ Plain [Str "section:",Space,Str "\167"] ]
, [ Plain [Str "set",Space,Str "membership:",Space,Str "\8712"] ]
, [ Plain [Str "copyright:",Space,Str "\169"] ] ]
, Para [Str "AT",Str "&",Str "T",Space,Str "has",Space,Str "an",Space,Str "ampersand",Space,Str "in",Space,Str "their",Space,Str "name",Str "."]
, Para [Str "AT",Str "&",Str "T",Space,Str "is",Space,Str "another",Space,Str "way",Space,Str "to",Space,Str "write",Space,Str "it",Str "."]
, Para [Str "AT&T",Space,Str "has",Space,Str "an",Space,Str "ampersand",Space,Str "in",Space,Str "their",Space,Str "name",Str "."]
, Para [Str "AT&T",Space,Str "is",Space,Str "another",Space,Str "way",Space,Str "to",Space,Str "write",Space,Str "it",Str "."]
, Para [Str "This",Space,Str "&",Space,Str "that",Str "."]
, Para [Str "4",Space,Str "<",Space,Str "5",Str "."]
, Para [Str "6",Space,Str ">",Space,Str "5",Str "."]
@ -316,11 +316,11 @@ Pandoc (Meta {docTitle = [Str "Pandoc",Space,Str "Test",Space,Str "Suite"], docA
, Para [Str "Foo",Space,Link [Str "biz"] ("/url/","Title with \"quote\" inside"),Str "."]
, Header 2 [Str "With",Space,Str "ampersands"]
, Para [Str "Here",Str "'",Str "s",Space,Str "a",Space,Link [Str "link",Space,Str "with",Space,Str "an",Space,Str "ampersand",Space,Str "in",Space,Str "the",Space,Str "URL"] ("http://example.com/?foo=1&bar=2",""),Str "."]
, Para [Str "Here",Str "'",Str "s",Space,Str "a",Space,Str "link",Space,Str "with",Space,Str "an",Space,Str "amersand",Space,Str "in",Space,Str "the",Space,Str "link",Space,Str "text:",Space,Link [Str "AT",Str "&",Str "T"] ("http://att.com/","AT&T"),Str "."]
, Para [Str "Here",Str "'",Str "s",Space,Str "a",Space,Str "link",Space,Str "with",Space,Str "an",Space,Str "amersand",Space,Str "in",Space,Str "the",Space,Str "link",Space,Str "text:",Space,Link [Str "AT&T"] ("http://att.com/","AT&T"),Str "."]
, Para [Str "Here",Str "'",Str "s",Space,Str "an",Space,Link [Str "inline",Space,Str "link"] ("/script?foo=1&bar=2",""),Str "."]
, Para [Str "Here",Str "'",Str "s",Space,Str "an",Space,Link [Str "inline",Space,Str "link",Space,Str "in",Space,Str "pointy",Space,Str "braces"] ("/script?foo=1&bar=2",""),Str "."]
, Header 2 [Str "Autolinks"]
, Para [Str "With",Space,Str "an",Space,Str "ampersand:",Space,Link [Str "http://example",Str ".",Str "com/?foo=1",Str "&",Str "bar=2"] ("http://example.com/?foo=1&bar=2","")]
, Para [Str "With",Space,Str "an",Space,Str "ampersand:",Space,Link [Str "http://example",Str ".",Str "com/?foo=1&bar=2"] ("http://example.com/?foo=1&bar=2","")]
, BulletList
[ [ Plain [Str "In",Space,Str "a",Space,Str "list?"] ]
, [ Plain [Link [Str "http://example",Str ".",Str "com/"] ("http://example.com/","")] ]

View file

@ -431,21 +431,21 @@ Hr's:
<hr>
<hr/>
<hr />
<hr />
<hr>
<hr/>
<hr />
<hr />
<hr class="foo" id="bar" />
<hr class="foo" id="bar"/>
<hr class="foo" id="bar" />
<hr class="foo" id="bar" >
<hr class="foo" id="bar">
-----