Bug fixes in readers:
+ LaTeX reader: skip anything after \end{document}.
+ HTML reader: fixed a bug in skipping material after </html> --
  previously, everything at the end of the input was skipped even if no
  </html> was present, which meant only part of the file would be parsed
  and no error would be issued.
+ HTML reader: added a new constant, eitherBlockOrInline, listing elements
  that may count as either block-level or inline.
+ Modified isInline and isBlock to take this into account.
+ Modified rawHtmlBlock to accept any tag (even an inline tag); this is
  innocuous, because rawHtmlBlock is tried only if a regular inline
  element can't be parsed.

git-svn-id: https://pandoc.googlecode.com/svn/trunk@862 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
parent
e48f046aa0
commit
4399db4fd2
2 changed files with 20 additions and 7 deletions
|
@ -59,11 +59,21 @@ readHtml = readWith parseHtml
|
|||
-- Constants
|
||||
--
|
||||
|
||||
eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
|
||||
"map", "area", "object", "script"]
|
||||
|
||||
inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
|
||||
"br", "cite", "code", "dfn", "em", "font", "i", "img",
|
||||
"input", "kbd", "label", "q", "s", "samp", "select",
|
||||
"small", "span", "strike", "strong", "sub", "sup",
|
||||
"textarea", "tt", "u", "var"]
|
||||
"textarea", "tt", "u", "var"] ++ eitherBlockOrInline
|
||||
|
||||
blockHtmlTags = ["address", "blockquote", "center", "dir", "div",
|
||||
"dl", "fieldset", "form", "h1", "h2", "h3", "h4",
|
||||
"h5", "h6", "hr", "isindex", "menu", "noframes",
|
||||
"noscript", "ol", "p", "pre", "table", "ul", "dd",
|
||||
"dt", "frameset", "li", "tbody", "td", "tfoot",
|
||||
"th", "thead", "tr"] ++ eitherBlockOrInline
|
||||
|
||||
--
|
||||
-- HTML utility functions
|
||||
|
@ -171,12 +181,15 @@ htmlEndTag tag = try $ do
|
|||
char '>'
|
||||
return $ "</" ++ tag ++ ">"
|
||||
|
||||
-- | Returns @True@ if the tag is an inline tag.
|
||||
-- | Returns @True@ if the tag is (or can be) an inline tag.
|
||||
isInline tag = (extractTagType tag) `elem` inlineHtmlTags
|
||||
|
||||
-- | Returns @True@ if the tag is (or can be) a block tag.
|
||||
isBlock tag = (extractTagType tag) `elem` blockHtmlTags
|
||||
|
||||
anyHtmlBlockTag = try $ do
|
||||
tag <- anyHtmlTag <|> anyHtmlEndTag
|
||||
if isInline tag then fail "inline tag" else return tag
|
||||
if isBlock tag then return tag else fail "inline tag"
|
||||
|
||||
anyHtmlInlineTag = try $ do
|
||||
tag <- anyHtmlTag <|> anyHtmlEndTag
|
||||
|
@ -193,7 +206,7 @@ htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
|
|||
|
||||
rawHtmlBlock = try $ do
|
||||
notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
|
||||
body <- htmlBlockElement <|> anyHtmlBlockTag
|
||||
body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag
|
||||
sp <- many space
|
||||
state <- getState
|
||||
if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null
|
||||
|
@ -260,8 +273,7 @@ parseHtml = do
|
|||
spaces
|
||||
optional (htmlEndTag "body")
|
||||
spaces
|
||||
optional (htmlEndTag "html")
|
||||
many anyChar -- ignore anything after </html>
|
||||
optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
|
||||
eof
|
||||
return $ Pandoc (Meta title authors date) blocks
|
||||
|
||||
|
|
|
@ -117,7 +117,8 @@ parseLaTeX = do
|
|||
spaces
|
||||
blocks <- parseBlocks
|
||||
spaces
|
||||
optional $ try (string "\\end{document}") -- might not be present (fragment)
|
||||
optional $ try (string "\\end{document}" >> many anyChar)
|
||||
-- might not be present (fragment)
|
||||
spaces
|
||||
eof
|
||||
state <- getState
|
||||
|
|
Loading…
Add table
Reference in a new issue