Bug fixes in readers:

+ LaTeX reader: skip anything after \end{document}
+ HTML reader: fixed a bug in skipping material after </html> -- previously,
  everything at the end was skipped even if no </html> was present, which
  meant that only part of the file would be parsed and no error was issued
+ HTML reader: added new constant eitherBlockOrInline with elements that
  may count either as block-level or inline
+ Modified isInline and isBlock to take eitherBlockOrInline into account
+ Modified rawHtmlBlock to accept any tag (even an inline tag);
  this is innocuous, because rawHtmlBlock is tried only if a regular
  inline element can't be parsed.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@862 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
fiddlosopher 2007-08-18 23:44:26 +00:00
parent e48f046aa0
commit 4399db4fd2
2 changed files with 20 additions and 7 deletions

View file

@ -59,11 +59,21 @@ readHtml = readWith parseHtml
-- Constants
--
eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
"map", "area", "object", "script"]
inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
"br", "cite", "code", "dfn", "em", "font", "i", "img",
"input", "kbd", "label", "q", "s", "samp", "select",
"small", "span", "strike", "strong", "sub", "sup",
"textarea", "tt", "u", "var"]
"textarea", "tt", "u", "var"] ++ eitherBlockOrInline
blockHtmlTags = ["address", "blockquote", "center", "dir", "div",
"dl", "fieldset", "form", "h1", "h2", "h3", "h4",
"h5", "h6", "hr", "isindex", "menu", "noframes",
"noscript", "ol", "p", "pre", "table", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr"] ++ eitherBlockOrInline
--
-- HTML utility functions
@ -171,12 +181,15 @@ htmlEndTag tag = try $ do
char '>'
return $ "</" ++ tag ++ ">"
-- | Returns @True@ if the tag is an inline tag.
-- | Returns @True@ if the tag is (or can be) an inline tag.
isInline tag = (extractTagType tag) `elem` inlineHtmlTags
-- | Returns @True@ if the tag is (or can be) a block tag.
isBlock tag = (extractTagType tag) `elem` blockHtmlTags
anyHtmlBlockTag = try $ do
tag <- anyHtmlTag <|> anyHtmlEndTag
if isInline tag then fail "inline tag" else return tag
if isBlock tag then return tag else fail "inline tag"
anyHtmlInlineTag = try $ do
tag <- anyHtmlTag <|> anyHtmlEndTag
@ -193,7 +206,7 @@ htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
rawHtmlBlock = try $ do
notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
body <- htmlBlockElement <|> anyHtmlBlockTag
body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag
sp <- many space
state <- getState
if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null
@ -260,8 +273,7 @@ parseHtml = do
spaces
optional (htmlEndTag "body")
spaces
optional (htmlEndTag "html")
many anyChar -- ignore anything after </html>
optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
eof
return $ Pandoc (Meta title authors date) blocks

View file

@ -117,7 +117,8 @@ parseLaTeX = do
spaces
blocks <- parseBlocks
spaces
optional $ try (string "\\end{document}") -- might not be present (fragment)
optional $ try (string "\\end{document}" >> many anyChar)
-- might not be present (fragment)
spaces
eof
state <- getState