New HTML reader using tagsoup as a lexer.

* The new reader is faster and more accurate. * API changes for Text.Pandoc.Readers.HTML: - removed rawHtmlBlock, anyHtmlBlockTag, anyHtmlInlineTag, anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType, htmlBlockElement, htmlComment - added htmlTag, htmlInBalanced, isInlineTag, isBlockTag, isTextTag * tagsoup is a new dependency. * Text.Pandoc.Parsing: Generalized type on readWith. * Benchmark.hs: Added length calculation to force full evaluation. * Updated HTML reader tests. * Updated markdown and textile readers to use the functions from the HTML reader. * Note: The markdown reader now correctly handles some cases it did not before. For example: <hr/> is reproduced without adding a space. <script> a = '<b>'; </script> is parsed correctly.
2010-12-22 20:25:15 -08:00 · 2010-12-22 20:25:15 -08:00 · 904050fa36
commit 904050fa36
parent 220fe5fab8
8 changed files with 441 additions and 640 deletions
--- a/Benchmark.hs
+++ b/Benchmark.hs
@ -13,8 +13,11 @@ readerBench doc (name, reader) =
      inp = writer defaultWriterOptions{ writerWrapText = True
                                       , writerLiterateHaskell =
                                          "+lhs" `isSuffixOf` name } doc
-  in  bench (name ++ " reader") $ whnf
-        (reader defaultParserState{stateSmart = True
+      -- we compute the length to force full evaluation
+      getLength (Pandoc (Meta a b c) d) =
+            length a + length b + length c + length d
+  in  bench (name ++ " reader") $ whnf (getLength .
+         reader defaultParserState{ stateSmart = True
                                  , stateStandalone = True
                                  , stateLiterateHaskell =
                                      "+lhs" `isSuffixOf` name }) inp
--- a/pandoc.cabal
+++ b/pandoc.cabal
@ -176,7 +176,8 @@ Library
                 citeproc-hs >= 0.3 && < 0.4,
                 pandoc-types == 1.7.*,
                 json >= 0.4 && < 0.5,
-                 dlist >= 0.4 && < 0.6
+                 dlist >= 0.4 && < 0.6,
+                 tagsoup >= 0.12 && < 0.13
  if impl(ghc >= 6.10)
    Build-depends: base >= 4 && < 5, syb
  else
@ -249,7 +250,8 @@ Executable pandoc
                 citeproc-hs >= 0.3 && < 0.4,
                 pandoc-types == 1.7.*,
                 json >= 0.4 && < 0.5,
-                 dlist >= 0.4 && < 0.6
+                 dlist >= 0.4 && < 0.6,
+                 tagsoup >= 0.12 && < 0.13
  if impl(ghc >= 6.10)
    Build-depends: base >= 4 && < 5, syb
  else
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@ -287,7 +287,7 @@ nullBlock :: GenParser Char st Block
 nullBlock = anyChar >> return Null

 -- | Fail if reader is in strict markdown syntax mode.
-failIfStrict :: GenParser Char ParserState ()
+failIfStrict :: GenParser a ParserState ()
 failIfStrict = do
  state <- getState
  if stateStrict state then fail "strict mode" else return ()
@ -567,9 +567,9 @@ gridTableFooter = blanklines
 ---

 -- | Parse a string with a given parser and state.
-readWith :: GenParser Char ParserState a      -- ^ parser
+readWith :: GenParser t ParserState a      -- ^ parser
         -> ParserState                    -- ^ initial state
-         -> String                         -- ^ input string
+         -> [t]                            -- ^ input
         -> a
 readWith parser state input = 
    case runParser parser state "source" input of
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@ -27,36 +27,355 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 Conversion of HTML to 'Pandoc' document.
 -}
-module Text.Pandoc.Readers.HTML ( 
-                                 readHtml, 
-                                 rawHtmlInline, 
-                                 rawHtmlBlock,
-                                 htmlTag,
-                                 anyHtmlBlockTag, 
-                                 anyHtmlInlineTag,  
-                                 anyHtmlTag,
-                                 anyHtmlEndTag,
-                                 htmlEndTag,
-                                 extractTagType,
-                                 htmlBlockElement,
-                                 htmlComment,
+module Text.Pandoc.Readers.HTML ( readHtml
+                                , htmlTag
+                                , htmlInBalanced
+                                , isInlineTag
+                                , isBlockTag
+                                , isTextTag
+                                , isCommentTag
                                ) where

 import Text.ParserCombinators.Parsec
+import Text.ParserCombinators.Parsec.Pos
+import Text.HTML.TagSoup
+import Text.HTML.TagSoup.Match
 import Text.Pandoc.Definition
+import Text.Pandoc.Builder (text, toList)
 import Text.Pandoc.Shared
 import Text.Pandoc.Parsing
-import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
-import Data.Maybe ( fromMaybe )
-import Data.List ( isPrefixOf, isSuffixOf, intercalate )
-import Data.Char ( toLower, isAlphaNum )
-import Control.Monad ( liftM, when )
+import Data.Maybe ( fromMaybe, isJust )
+import Data.List ( intercalate )
+import Data.Char ( isSpace, isDigit )
+import Control.Monad ( liftM, guard )

 -- | Convert HTML-formatted string to 'Pandoc' document.
 readHtml :: ParserState   -- ^ Parser state
         -> String        -- ^ String to parse (assumes @'\n'@ line endings)
         -> Pandoc
-readHtml = readWith parseHtml
+readHtml st inp = Pandoc meta blocks
+  where blocks = readWith parseBody st body
+        tags   = canonicalizeTags $
+                  parseTagsOptions parseOptions{ optTagPosition = True } inp
+        hasHeader = any (~== TagOpen "head" []) tags
+        (meta, rest) = if hasHeader
+                          then parseHeader tags
+                          else (Meta [] [] [], tags)
+        body   = filter (\t -> not $
+                     tagOpen (`elem` ["html","head","body"]) (const True) t ||
+                     tagClose (`elem` ["html","head","body"]) t) rest
+
+type TagParser = GenParser (Tag String) ParserState
+
+parseHeader :: [Tag String] -> (Meta, [Tag String])
+parseHeader tags = (Meta{docTitle = tit'', docAuthors = [], docDate = []}, rest)
+  where (tit,r) = break (~== TagClose "title") $ drop 1 $
+                   dropWhile (\t -> not $ t ~== TagOpen "title" []) tags
+        tit' = concatMap fromTagText $ filter isTagText tit
+        tit'' = normalizeSpaces $ toList $ text tit'
+        rest  = drop 1 $ dropWhile (\t -> not $ t ~== TagClose "head") r 
+
+parseBody :: TagParser [Block]
+parseBody = liftM concat $ manyTill block eof
+
+block :: TagParser [Block]
+block = optional pLocation >>
+        choice [
+              pPara
+            , pHeader
+            , pBlockQuote
+            , pCodeBlock
+            , pList
+            , pHrule
+            , pPlain
+            , pRawHtmlBlock
+            ]
+
+renderTags' :: [Tag String] -> String
+renderTags' = renderTagsOptions
+               renderOptions{ optMinimize = (`elem` ["hr","br","img"]) }
+
+pList :: TagParser [Block]
+pList = pBulletList <|> pOrderedList <|> pDefinitionList
+
+pBulletList :: TagParser [Block]
+pBulletList = try $ do
+  pSatisfy (~== TagOpen "ul" [])
+  let nonItem = pSatisfy (\t ->
+                  not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
+                  not (t ~== TagClose "ul"))
+  -- note: if they have an <ol> or <ul> not in scope of a <li>,
+  -- treat it as a list item, though it's not valid xhtml...
+  skipMany nonItem
+  items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul")
+  return [BulletList items]
+
+pOrderedList :: TagParser [Block]
+pOrderedList = try $ do
+  TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
+  st <- getState
+  let (start, style) =  if stateStrict st
+                           then (1, DefaultStyle) 
+                           else (sta', sty')
+                              where sta = fromMaybe "1" $
+                                          lookup "start" attribs
+                                    sta' = if all isDigit sta
+                                              then read sta
+                                              else 1
+                                    sty = fromMaybe (fromMaybe "" $
+                                          lookup "style" attribs) $
+                                          lookup "class" attribs
+                                    sty' = case sty of
+                                            "lower-roman"  -> LowerRoman
+                                            "upper-roman"  -> UpperRoman
+                                            "lower-alpha"  -> LowerAlpha
+                                            "upper-alpha"  -> UpperAlpha
+                                            "decimal"      -> Decimal
+                                            _              -> DefaultStyle
+  let nonItem = pSatisfy (\t ->
+                  not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
+                  not (t ~== TagClose "ol"))
+  -- note: if they have an <ol> or <ul> not in scope of a <li>,
+  -- treat it as a list item, though it's not valid xhtml...
+  skipMany nonItem
+  items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol")
+  return [OrderedList (start, style, DefaultDelim) items]
+
+pDefinitionList :: TagParser [Block]
+pDefinitionList = try $ do
+  pSatisfy (~== TagOpen "dl" [])
+  items <- manyTill pDefListItem (pCloses "dl")
+  return [DefinitionList items]
+
+pDefListItem :: TagParser ([Inline],[[Block]])
+pDefListItem = try $ do
+  let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) &&
+                  not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl"))
+  terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
+  defs  <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
+  skipMany nonItem
+  let term = intercalate [LineBreak] terms
+  return (term, defs)
+
+pRawHtmlBlock :: TagParser [Block]
+pRawHtmlBlock = do
+  raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|>
+          liftM (renderTags' . (:[])) pAnyTag
+  state <- getState
+  if stateParseRaw state
+     then return [RawHtml raw]
+     else return []
+
+pHtmlBlock :: String -> TagParser String
+pHtmlBlock t = try $ do
+  open <- pSatisfy (~== TagOpen t [])
+  contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
+  return $ renderTags' $ [open] ++ contents ++ [TagClose t]
+
+pHeader :: TagParser [Block]
+pHeader = try $ do
+  TagOpen tagtype attr <- pSatisfy $
+                           tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
+                           (const True)
+  let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
+  let level = read (drop 1 tagtype)
+  contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof)
+  return $ if bodyTitle
+              then []  -- skip a representation of the title in the body
+              else [Header level $ normalizeSpaces contents]
+
+pHrule :: TagParser [Block]
+pHrule = do
+  pSelfClosing (=="hr") (const True)
+  return [HorizontalRule]
+
+pBlockQuote :: TagParser [Block]
+pBlockQuote = do
+  contents <- pInTags "blockquote" block
+  return [BlockQuote contents]
+
+pPlain :: TagParser [Block]
+pPlain = do
+  contents <- liftM (normalizeSpaces . concat) $ many1 inline
+  if null contents
+     then return []
+     else return [Plain contents]
+
+pPara :: TagParser [Block]
+pPara = do
+  contents <- pInTags "p" inline
+  return [Para $ normalizeSpaces contents]
+
+pCodeBlock :: TagParser [Block]
+pCodeBlock = try $ do
+  TagOpen _ attr <- pSatisfy (~== TagOpen "pre" [])
+  contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
+  let rawText = concatMap fromTagText $ filter isTagText contents
+  -- drop leading newline if any
+  let result' = case rawText of
+                     '\n':xs  -> xs
+                     _        -> rawText
+  -- drop trailing newline if any
+  let result = case reverse result' of
+                    '\n':_   -> init result'
+                    _        -> result'
+  let attribsId = fromMaybe "" $ lookup "id" attr
+  let attribsClasses = words $ fromMaybe "" $ lookup "class" attr
+  let attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+  st <- getState
+  let attribs = if stateStrict st
+                   then ("",[],[])
+                   else (attribsId, attribsClasses, attribsKV)
+  return [CodeBlock attribs result]
+
+inline :: TagParser [Inline]
+inline = choice [
+             pLocation
+           , pTagText
+           , pEmph
+           , pStrong
+           , pSuperscript
+           , pSubscript
+           , pStrikeout
+           , pLineBreak
+           , pLink
+           , pImage
+           , pCode
+           , pRawHtmlInline
+           ]
+
+pLocation :: TagParser [a]
+pLocation = do
+  (TagPosition r c) <- pSatisfy isTagPosition
+  setPosition $ newPos "input" r c
+  return []
+
+pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String)
+pSatisfy f = do
+  pos <- getPosition
+  token show (const pos) (\x -> if f x then Just x else Nothing) 
+
+pAnyTag :: TagParser (Tag String)
+pAnyTag = pSatisfy (const True)
+
+pSelfClosing :: (String -> Bool) -> ([Attribute String] -> Bool)
+             -> TagParser (Tag String)
+pSelfClosing f g = do
+  open <- pSatisfy (tagOpen f g)
+  optional $ try $ pLocation >> pSatisfy (tagClose f)
+  return open
+
+pEmph :: TagParser [Inline]
+pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph
+
+pStrong :: TagParser [Inline]
+pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong
+
+pSuperscript :: TagParser [Inline]
+pSuperscript = failIfStrict >> pInlinesInTags "sup" Superscript
+
+pSubscript :: TagParser [Inline]
+pSubscript = failIfStrict >> pInlinesInTags "sub" Subscript
+
+pStrikeout :: TagParser [Inline]
+pStrikeout = do
+  failIfStrict
+  pInlinesInTags "s" Strikeout <|>
+    pInlinesInTags "strike" Strikeout <|>
+    pInlinesInTags "del" Strikeout <|>
+    do pSatisfy (~== TagOpen "span" [("class","strikeout")])
+       contents <- liftM concat $ manyTill inline (pCloses "span")
+       return [Strikeout contents]
+
+pLineBreak :: TagParser [Inline]
+pLineBreak = do
+  pSelfClosing (=="br") (const True)
+  return [LineBreak]
+
+pLink :: TagParser [Inline]
+pLink = do
+  tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
+  let url = fromAttrib "href" tag
+  let title = fromAttrib "title" tag
+  lab <- liftM concat $ manyTill inline (pCloses "a")
+  return [Link (normalizeSpaces lab) (escapeURI url, title)]
+
+pImage :: TagParser [Inline]
+pImage = do
+  tag <- pSelfClosing (=="img") (isJust . lookup "src")
+  let url = fromAttrib "src" tag
+  let title = fromAttrib "title" tag
+  let alt = fromAttrib "alt" tag
+  return [Image (toList $ text alt) (escapeURI url, title)]
+
+pCode :: TagParser [Inline]
+pCode = do
+  (TagOpen open _) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
+  result <- manyTill pAnyTag (pCloses open)
+  return [Code $ intercalate " " $ lines $ innerText result]
+
+pRawHtmlInline :: TagParser [Inline]
+pRawHtmlInline = do
+  result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag
+  state <- getState
+  if stateParseRaw state
+     then return [HtmlInline $ renderTags' [result]]
+     else return []
+
+pInlinesInTags :: String -> ([Inline] -> Inline)
+               -> TagParser [Inline]
+pInlinesInTags tagtype f = do
+  contents <- pInTags tagtype inline
+  return [f contents]
+
+pInTags :: String -> TagParser [a]
+        -> TagParser [a]
+pInTags tagtype parser = try $ do
+  pSatisfy (~== TagOpen tagtype [])
+  liftM concat $ manyTill parser (pCloses tagtype <|> eof)
+
+pCloses :: String -> TagParser ()
+pCloses tagtype = try $ do
+  optional pLocation
+  t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
+  case t of
+       (TagClose t')  | t' == tagtype -> pAnyTag >> return ()
+       (TagOpen t' _) | t' `closes` tagtype -> return ()
+       (TagClose "ul") | tagtype == "li" -> return ()
+       (TagClose "ol") | tagtype == "li" -> return ()
+       (TagClose "dl") | tagtype == "li" -> return ()
+       _ -> pzero
+
+pTagText :: TagParser [Inline]
+pTagText = do
+  (TagText str) <- pSatisfy isTagText
+  st <- getState
+  case runParser (many pTagContents) st "text" str of
+       Left _        -> fail $ "Could not parse `" ++ str ++ "'"
+       Right result  -> return result
+
+pTagContents :: GenParser Char ParserState Inline
+pTagContents =  pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol
+
+pStr :: GenParser Char ParserState Inline
+pStr = many1 (satisfy (\c -> not (isSpace c) && not (isSpecial c))) >>= return . Str
+
+isSpecial :: Char -> Bool
+isSpecial '"' = True
+isSpecial '\'' = True
+isSpecial '.' = True
+isSpecial '-' = True
+isSpecial '\8216' = True
+isSpecial '\8217' = True
+isSpecial '\8220' = True
+isSpecial '\8221' = True
+isSpecial _ = False
+
+pSymbol :: GenParser Char ParserState Inline
+pSymbol = satisfy isSpecial >>= return . Str . (:[])
+
+pSpace :: GenParser Char ParserState Inline
+pSpace = many1 (satisfy isSpace) >> return Space

 --
 -- Constants
@ -83,10 +402,26 @@ blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
                 "dt", "frameset", "li", "tbody", "td", "tfoot",
                 "th", "thead", "tr", "script", "style"]

+isInlineTag :: Tag String -> Bool
+isInlineTag t = tagOpen (`notElem` blockHtmlTags) (const True) t ||
+                tagClose (`notElem` blockHtmlTags) t ||
+                tagComment (const True) t
+
+isBlockTag :: Tag String -> Bool
+isBlockTag t = tagOpen (`elem` blocktags) (const True) t ||
+               tagClose (`elem` blocktags) t ||
+               tagComment (const True) t
+                 where blocktags = blockHtmlTags ++ eitherBlockOrInline
+
+isTextTag :: Tag String -> Bool
+isTextTag = tagText (const True)
+
+isCommentTag :: Tag String -> Bool
+isCommentTag = tagComment (const True)
+
 -- taken from HXT and extended

 closes :: String -> String -> Bool
-"EOF" `closes` _ = True
 _ `closes` "body" = False
 _ `closes` "html" = False
 "a" `closes` "a" = True
@ -117,565 +452,27 @@ t1 `closes` t2 |
   t2 `notElem` (blockHtmlTags ++ eitherBlockOrInline) = True
 _ `closes` _ = False

--
-- HTML utility functions
--
-
-- | Read blocks until end tag.
-blocksTilEnd :: String -> GenParser Char ParserState [Block]
-blocksTilEnd tag = do
-  blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
-  return $ filter (/= Null) blocks
-
-- | Read inlines until end tag.
-inlinesTilEnd :: String -> GenParser Char ParserState [Inline]
-inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
-
-- | Parse blocks between open and close tag.
-blocksIn :: String -> GenParser Char ParserState [Block]
-blocksIn tag = try $ htmlOpenTag tag >> spaces >> blocksTilEnd tag
-
-- | Parse inlines between open and close tag.
-inlinesIn :: String -> GenParser Char ParserState [Inline]
-inlinesIn tag = try $ htmlOpenTag tag >> spaces >> inlinesTilEnd tag
-
-- | Extract type from a tag:  e.g. @br@ from @\<br\>@
-extractTagType :: String -> String
-extractTagType ('<':rest) = 
-  let isSpaceOrSlash c = c `elem` "/ \n\t" in
-  map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
-extractTagType _ = ""
-
-- Parse any HTML tag (opening or self-closing) and return tag type
-anyOpener :: GenParser Char ParserState [Char]
-anyOpener = try $ do
-  char '<'
-  spaces
-  tag <- many1 alphaNum
-  skipMany htmlAttribute
-  spaces
-  option "" (string "/")
-  spaces
-  char '>'
-  return $ map toLower tag 
-
-- | Parse any HTML tag (opening or self-closing) and return text of tag
-anyHtmlTag :: GenParser Char ParserState [Char]
-anyHtmlTag = try $ do
-  char '<'
-  spaces
-  first <- letter
-  rest <- many (alphaNum <|> char ':')
-  let tag = first : rest
-  attribs <- many htmlAttribute
-  spaces
-  ender <- option "" (string "/")
-  let ender' = if null ender then "" else " /"
-  spaces
-  char '>'
-  let result = "<" ++ tag ++ 
-               concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
-  return result
-
-anyHtmlEndTag :: GenParser Char ParserState [Char]
-anyHtmlEndTag = try $ do
-  char '<'   
-  spaces
-  char '/'
-  spaces
-  first <- letter
-  rest <- many (alphaNum <|> char ':')
-  let tag = first : rest
-  spaces
-  char '>'
-  let result = "</" ++ tag ++ ">"
-  return result 
-
-htmlTag :: Bool
-        -> String
-        -> GenParser Char ParserState (String, [(String, String)])
-htmlTag selfClosing tag = try $ do
-  char '<'
-  spaces
-  stringAnyCase tag
-  attribs <- many htmlAttribute
-  spaces
-  -- note: we want to handle both HTML and XHTML,
-  -- so we don't require the /
-  when selfClosing $ optional $ char '/' >> spaces
-  char '>'
-  return (tag, (map (\(name, content, _) -> (name, content)) attribs))
-
-htmlOpenTag :: String
-             -> GenParser Char ParserState (String, [(String, String)])
-htmlOpenTag = htmlTag False
-
-htmlCloseTag :: String
-             -> GenParser Char ParserState (String, [(String, String)])
-htmlCloseTag = htmlTag False . ('/':)
-
-htmlSelfClosingTag :: String
-                   -> GenParser Char ParserState (String, [(String, String)])
-htmlSelfClosingTag = htmlTag True
-
-- parses a quoted html attribute value
-quoted :: Char -> GenParser Char st (String, String)
-quoted quoteChar = do
-  result <- between (char quoteChar) (char quoteChar) 
-                    (many (noneOf [quoteChar]))
-  return (result, [quoteChar])
-
-htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char])
-htmlAttribute = do
-  attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
-  return attr
-
-- minimized boolean attribute
-htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char])
-htmlMinimizedAttribute = try $ do
-  many1 space
-  name <- many1 (choice [letter, oneOf ".-_:"])
-  return (name, name, name)
-
-htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char])
-htmlRegularAttribute = try $ do
-  many1 space
-  name <- many1 (choice [letter, oneOf ".-_:"])
-  spaces
-  char '='
-  spaces
-  (content, quoteStr) <- choice [ (quoted '\''), 
-                                  (quoted '"'), 
-                                  (do
-                                     a <- many (noneOf " \t\n\r\"'<>")
-                                     return (a,"")) ]
-  return (name, content,
-          (name ++ "=" ++ quoteStr ++ content ++ quoteStr))
-
-- | Parse an end tag of type 'tag'
-htmlEndTag :: [Char] -> GenParser Char ParserState [Char]
-htmlEndTag tag = try $ do
-  closedByNext <- lookAhead $ option False $ liftM (`closes` tag) $
-                   anyOpener <|> (eof >> return "EOF")
-  if closedByNext
-     then return ""
-     else do char '<'   
-             spaces
-             char '/'
-             spaces
-             stringAnyCase tag
-             spaces
-             char '>'
-             return $ "</" ++ tag ++ ">"
-
-- | Returns @True@ if the tag is (or can be) a block tag.
-isBlock :: String -> Bool
-isBlock tag = (extractTagType tag) `elem` (blockHtmlTags ++ eitherBlockOrInline)
-
-anyHtmlBlockTag :: GenParser Char ParserState [Char]
-anyHtmlBlockTag = try $ do
-  tag <- anyHtmlTag <|> anyHtmlEndTag
-  if isBlock tag then return tag else fail "not a block tag"
-
-anyHtmlInlineTag :: GenParser Char ParserState [Char]
-anyHtmlInlineTag = try $ do
-  tag <- anyHtmlTag <|> anyHtmlEndTag
-  if not (isBlock tag) then return tag else fail "not an inline tag"
-
-- | Parses material between script tags.
-- Scripts must be treated differently, because they can contain '<>' etc.
-htmlScript :: GenParser Char ParserState [Char]
-htmlScript = try $ do
-  lookAhead $ htmlOpenTag "script"
-  open <- anyHtmlTag
-  rest <- manyTill anyChar (htmlEndTag "script")
-  return $ open ++ rest ++ "</script>"
-
-- | Parses material between style tags.
-- Style tags must be treated differently, because they can contain CSS
-htmlStyle :: GenParser Char ParserState [Char]
-htmlStyle = try $ do
-  lookAhead $ htmlOpenTag "style"
-  open <- anyHtmlTag
-  rest <- manyTill anyChar (htmlEndTag "style")
-  return $ open ++ rest ++ "</style>"
-
-htmlBlockElement :: GenParser Char ParserState [Char]
-htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
-
-rawHtmlBlock :: GenParser Char ParserState Block
-rawHtmlBlock = try $ do
-  body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag
-  state <- getState
-  if stateParseRaw state then return (RawHtml body) else return Null
-
-- This is a block whose contents should be passed through verbatim, not interpreted.
-rawVerbatimBlock :: GenParser Char ParserState [Char]
-rawVerbatimBlock = try $ do
-  start <- anyHtmlBlockTag
-  let tagtype = extractTagType start
-  if tagtype `elem` ["pre"]
-     then do
-       contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar)
-       end <- htmlEndTag tagtype
-       return $ start ++ contents ++ end
-     else fail "Not a verbatim block"
-
-- We don't want to parse </body> or </html> as raw HTML, since these
-- are handled in parseHtml.
-rawHtmlBlock' :: GenParser Char ParserState Block
-rawHtmlBlock' = do notFollowedBy' (htmlCloseTag "body" <|>
-                                   htmlCloseTag "html")
-                   rawHtmlBlock
-
-- | Parses an HTML comment.
-htmlComment :: GenParser Char st [Char]
-htmlComment = try $ do
-  string "<!--"
-  comment <- many $ noneOf "-"
-               <|> try (char '-' >>~ notFollowedBy (try (char '-' >> char '>')))
-  string "-->"
-  return $ "<!--" ++ comment ++ "-->"
-
--
-- parsing documents
--
-
-xmlDec :: GenParser Char st [Char]
-xmlDec = try $ do
-  string "<?"
-  rest <- manyTill anyChar (char '>')
-  return $ "<?" ++ rest ++ ">"
-
-definition :: GenParser Char st [Char]
-definition = try $ do
-  string "<!"
-  rest <- manyTill anyChar (char '>')
-  return $ "<!" ++ rest ++ ">"
-
-nonTitleNonHead :: GenParser Char ParserState Char
-nonTitleNonHead = try $ do
-  notFollowedBy $ (htmlOpenTag "title" >> return ' ') <|> 
-                  (htmlEndTag "head" >> return ' ')
-  (rawHtmlBlock >> return ' ') <|> anyChar
-
-parseTitle :: GenParser Char ParserState [Inline]
-parseTitle = try $ do
-  (tag, _) <- htmlOpenTag "title"
-  contents <- inlinesTilEnd tag
-  spaces
-  return contents
-
-- parse header and return meta-information (for now, just title)
-parseHead :: GenParser Char ParserState Meta
-parseHead = try $ do
-  htmlOpenTag "head"
-  spaces
-  skipMany nonTitleNonHead
-  contents <- option [] parseTitle
-  skipMany nonTitleNonHead
-  htmlEndTag "head"
-  return $ Meta contents [] []
-
-- h1 class="title" representation of title in body
-bodyTitle :: GenParser Char ParserState [Inline]
-bodyTitle = try $ do
-  (_, attribs) <- htmlOpenTag "h1"  
-  case (extractAttribute "class" attribs) of
-       Just "title" -> return ""
-       _            -> fail "not title"
-  inlinesTilEnd "h1"
-
-endOfDoc :: GenParser Char ParserState ()
-endOfDoc = try $ do
-  spaces
-  optional (htmlEndTag "body")
-  spaces
-  optional (htmlEndTag "html" >> many anyChar) -- ignore stuff after </html>
-  eof 
-
-parseHtml :: GenParser Char ParserState Pandoc
-parseHtml = do
-  sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
-  spaces
-  optional $ htmlOpenTag "html"
-  spaces
-  meta <- option (Meta [] [] []) parseHead
-  spaces
-  optional $ htmlOpenTag "body"
-  spaces
-  optional bodyTitle  -- skip title in body, because it's represented in meta
-  blocks <- parseBlocks
-  endOfDoc
-  return $ Pandoc meta blocks
-
--
-- parsing blocks
--
-
-parseBlocks :: GenParser Char ParserState [Block]
-parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
-
-block :: GenParser Char ParserState Block
-block = choice [ codeBlock
-               , header
-               , hrule
-               , list
-               , blockQuote
-               , para
-               , plain
-               , rawHtmlBlock'
-               , notFollowedBy' endOfDoc >> char '<' >> return Null
-               ] <?> "block"
-
--
-- header blocks
--
-
-header :: GenParser Char ParserState Block
-header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
-
-headerLevel :: Int -> GenParser Char ParserState Block
-headerLevel n = try $ do
-    let level = "h" ++ show n
-    htmlOpenTag level
-    contents <- inlinesTilEnd level
-    return $ Header n (normalizeSpaces contents)
-
--
-- hrule block
--
-
-hrule :: GenParser Char ParserState Block
-hrule = try  $ do
-  (_, attribs) <- htmlSelfClosingTag "hr"
-  state <- getState
-  if not (null attribs) && stateParseRaw state
-     then unexpected "attributes in hr" -- parse as raw in this case
-     else return HorizontalRule
-
--
-- code blocks
--
-
-- Note:  HTML tags in code blocks (e.g. for syntax highlighting) are 
-- skipped, because they are not portable to output formats other than HTML.
-codeBlock :: GenParser Char ParserState Block
-codeBlock = try $ do
-    htmlOpenTag "pre" 
-    result <- manyTill 
-              (many1 (satisfy (/= '<')) <|> 
-               ((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
-              (htmlEndTag "pre")
-    let result' = concat result
-    -- drop leading newline if any
-    let result'' = if "\n" `isPrefixOf` result'
-                      then drop 1 result'
-                      else result'
-    -- drop trailing newline if any
-    let result''' = if "\n" `isSuffixOf` result''
-                       then init result''
-                       else result''
-    return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result'''
-
--
-- block quotes
--
-
-blockQuote :: GenParser Char ParserState Block
-blockQuote = try $ htmlOpenTag "blockquote" >> spaces >> 
-                   blocksTilEnd "blockquote" >>= (return . BlockQuote)
-
--
-- list blocks
--
-
-list :: GenParser Char ParserState Block
-list = choice [ bulletList, orderedList, definitionList ] <?> "list"
-
-orderedList :: GenParser Char ParserState Block
-orderedList = try $ do
-  (_, attribs) <- htmlOpenTag "ol"
-  (start, style) <- option (1, DefaultStyle) $
-                           do failIfStrict
-                              let sta = fromMaybe "1" $ 
-                                        lookup "start" attribs
-                              let sty = fromMaybe (fromMaybe "" $
-                                        lookup "style" attribs) $
-                                        lookup "class" attribs
-                              let sty' = case sty of
-                                          "lower-roman"  -> LowerRoman
-                                          "upper-roman"  -> UpperRoman
-                                          "lower-alpha"  -> LowerAlpha
-                                          "upper-alpha"  -> UpperAlpha
-                                          "decimal"      -> Decimal
-                                          _              -> DefaultStyle
-                              return (read sta, sty')
-  spaces
-  -- note: if they have an <ol> or <ul> not in scope of a <li>,
-  -- treat it as a list item, though it's not valid xhtml...
-  items <- sepEndBy1 (blocksIn "li" <|> liftM (:[]) list) spaces
-  htmlEndTag "ol"
-  return $ OrderedList (start, style, DefaultDelim) items
-
-bulletList :: GenParser Char ParserState Block
-bulletList = try $ do
-  htmlOpenTag "ul"
-  spaces
-  -- note: if they have an <ol> or <ul> not in scope of a <li>,
-  -- treat it as a list item, though it's not valid xhtml...
-  items <- sepEndBy1 (blocksIn "li" <|> liftM (:[]) list) spaces
-  htmlEndTag "ul"
-  return $ BulletList items
-
-definitionList :: GenParser Char ParserState Block
-definitionList = try $ do
-  failIfStrict  -- def lists not part of standard markdown
-  htmlOpenTag "dl"
-  spaces
-  items <- sepEndBy1 definitionListItem spaces
-  htmlEndTag "dl"
-  return $ DefinitionList items
-
-definitionListItem :: GenParser Char ParserState ([Inline], [[Block]])
-definitionListItem = try $ do
-  terms <- sepEndBy1 (inlinesIn "dt") spaces
-  defs <- sepEndBy1 (blocksIn "dd") spaces
-  let term = intercalate [LineBreak] terms
-  return (term, defs)
-
--
-- paragraph block
--
-
-para :: GenParser Char ParserState Block
-para = try $ htmlOpenTag "p" >> inlinesTilEnd "p" >>= 
-             return . Para . normalizeSpaces
-
-- 
-- plain block
--
-
-plain :: GenParser Char ParserState Block
-plain = many1 inline >>= return . Plain . normalizeSpaces
-
-- 
-- inline
--
-
-inline :: GenParser Char ParserState Inline
-inline = choice [ str
-                , strong
-                , emph
-                , superscript
-                , subscript
-                , strikeout
-                , spanStrikeout
-                , code
-                , linebreak
-                , whitespace
-                , link
-                , image
-                , smartPunctuation inline
-                , charRef
-                , rawHtmlInline
-                , symbol
-                ] <?> "inline"
-
-code :: GenParser Char ParserState Inline
-code = try $ do 
-  result <- (htmlOpenTag "code" >> manyTill (noneOf "<>") (htmlEndTag "code"))
-        <|> (htmlOpenTag "tt"   >> manyTill (noneOf "<>") (htmlEndTag "tt"))
-  -- remove internal line breaks, leading and trailing space,
-  -- and decode character references
-  return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ 
-                  intercalate " " $ lines result
-
-rawHtmlInline :: GenParser Char ParserState Inline
-rawHtmlInline = do
-  result <- anyHtmlInlineTag <|> htmlComment
-  state <- getState
-  if stateParseRaw state then return (HtmlInline result) else return (Str "")
-
-symbol :: GenParser Char ParserState Inline
-symbol = do
-  notFollowedBy (char '<')
-  c <- oneOf specialChars
-  return $ Str [c]
-
-betweenTags :: [Char] -> GenParser Char ParserState [Inline]
-betweenTags tag = try $ htmlOpenTag tag >> inlinesTilEnd tag >>= 
-                        return . normalizeSpaces
-
-emph :: GenParser Char ParserState Inline
-emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph
-
-strong :: GenParser Char ParserState Inline
-strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
-
-superscript :: GenParser Char ParserState Inline
-superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
-
-subscript :: GenParser Char ParserState Inline
-subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
-
-strikeout :: GenParser Char ParserState Inline
-strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
-            return . Strikeout
-
-spanStrikeout :: GenParser Char ParserState Inline
-spanStrikeout = try $ do
-  failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
-  (_, attributes) <- htmlOpenTag "span" 
-  result <- case (extractAttribute "class" attributes) of
-              Just "strikeout" -> inlinesTilEnd "span"
-              _                -> fail "not a strikeout"
-  return $ Strikeout result
-
-whitespace :: GenParser Char st Inline
-whitespace = many1 space >> return Space
-
-- hard line break
-linebreak :: GenParser Char ParserState Inline
-linebreak = htmlSelfClosingTag "br" >> optional newline >> return LineBreak
-
-str :: GenParser Char st Inline
-str = many1 (noneOf $ specialChars ++ " \t\n") >>= return . Str
-
-specialChars :: [Char]
-specialChars = "<&-\"'.\8216\8217\8220\8221"
-
--
-- links and images
--
-
-- extract contents of attribute (attribute names are case-insensitive)
-extractAttribute :: [Char] -> [([Char], String)] -> Maybe String
-extractAttribute _    [] = Nothing
-extractAttribute name ((attrName, contents):rest) = 
-  let name'     = map toLower name 
-      attrName' = map toLower attrName
-  in  if attrName' == name'
-         then Just (decodeCharacterReferences contents)
-         else extractAttribute name rest
-
-link :: GenParser Char ParserState Inline
-link = try $ do
-  (_, attributes) <- htmlOpenTag "a"  
-  url <- case (extractAttribute "href" attributes) of
-           Just url -> return url
-           Nothing  -> fail "no href"
-  let title = fromMaybe "" $ extractAttribute "title" attributes
-  lab <- inlinesTilEnd "a"
-  return $ Link (normalizeSpaces lab) (escapeURI url, title)
-
-image :: GenParser Char ParserState Inline
-image = try $ do
-  (_, attributes) <- htmlSelfClosingTag "img"
-  url <- case (extractAttribute "src" attributes) of
-           Just url -> return url
-           Nothing  -> fail "no src"
-  let title = fromMaybe "" $ extractAttribute "title" attributes
-  let alt = fromMaybe "" (extractAttribute "alt" attributes)
-  return $ Image [Str alt] (escapeURI url, title)
-
+--- parsers for use in markdown, textile readers
+
+-- | Matches a stretch of HTML in balanced tags.
+htmlInBalanced :: (Tag String -> Bool) -> GenParser Char ParserState String
+htmlInBalanced f = try $ do
+  (TagOpen t _, tag) <- htmlTag f
+  guard $ '/' `notElem` tag      -- not a self-closing tag
+  let nonTagChunk = many1 $ satisfy (/= '<')
+  let stopper = htmlTag (~== TagClose t)
+  let anytag = liftM snd $ htmlTag (const True)
+  contents <- many $ notFollowedBy' stopper >>
+                     (nonTagChunk <|> htmlInBalanced (const True) <|> anytag)
+  endtag <- liftM snd stopper
+  return $ tag ++ concat contents ++ endtag
+
+-- | Matches a tag meeting a certain condition.
+htmlTag :: (Tag String -> Bool) -> GenParser Char ParserState (Tag String, String)
+htmlTag f = try $ do
+  lookAhead (char '<')
+  (next : _) <- getInput >>= return . canonicalizeTags . parseTags
+  guard $ f next
+  -- advance the parser
+  rendered <- manyTill anyChar (char '>')
+  return (next, rendered ++ ">")
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@ -29,7 +29,7 @@ Conversion of markdown-formatted plain text to 'Pandoc' document.
 -}
 module Text.Pandoc.Readers.Markdown ( readMarkdown ) where

-import Data.List ( transpose, isSuffixOf, sortBy, findIndex, intercalate )
+import Data.List ( transpose, sortBy, findIndex, intercalate )
 import qualified Data.Map as M
 import Data.Ord ( comparing )
 import Data.Char ( isAlphaNum )
@ -39,14 +39,14 @@ import Text.Pandoc.Generic
 import Text.Pandoc.Shared
 import Text.Pandoc.Parsing
 import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' )
-import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, 
-                                  anyHtmlInlineTag, anyHtmlTag,
-                                  anyHtmlEndTag, htmlEndTag, extractTagType,
-                                  htmlBlockElement, htmlComment )
+import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag,
+                                  isTextTag, isCommentTag )
 import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
 import Text.ParserCombinators.Parsec
 import Control.Monad (when, liftM, guard)
 import Text.TeXMath.Macros (applyMacros, Macro, pMacroDefinition)
+import Text.HTML.TagSoup
+import Text.HTML.TagSoup.Match (tagOpen)

 -- | Read markdown from an input string and return a Pandoc document.
 readMarkdown :: ParserState -- ^ Parser state, including options for parser
@ -532,7 +532,7 @@ listLine = try $ do
  notFollowedBy' (do indentSpaces
                     many (spaceChar)
                     listStart)
-  chunks <- manyTill (htmlComment <|> count 1 anyChar) newline
+  chunks <- manyTill (liftM snd (htmlTag isCommentTag) <|> count 1 anyChar) newline
  return $ concat chunks ++ "\n"

 -- parse raw text for one list item, excluding start marker and continuations
@ -676,7 +676,7 @@ plain = many1 inline >>~ spaces >>= return . Plain . normalizeSpaces
 --

 htmlElement :: GenParser Char ParserState [Char]
-htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
+htmlElement = strictHtmlBlock <|> liftM snd (htmlTag isBlockTag)

 htmlBlock :: GenParser Char ParserState Block
 htmlBlock = try $ do
@ -686,25 +686,23 @@ htmlBlock = try $ do
    finalNewlines <- many newline
    return $ RawHtml $ first ++ finalSpace ++ finalNewlines

-- True if tag is self-closing
-isSelfClosing :: [Char] -> Bool
-isSelfClosing tag = 
-  isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag
-
 strictHtmlBlock :: GenParser Char ParserState [Char]
-strictHtmlBlock = try $ do
-  tag <- anyHtmlBlockTag 
-  let tag' = extractTagType tag
-  if isSelfClosing tag || tag' == "hr" 
-     then return tag
-     else do contents <- many (notFollowedBy' (htmlEndTag tag') >> 
-                               (htmlElement <|> (count 1 anyChar)))
-             end <- htmlEndTag tag'
-             return $ tag ++ concat contents ++ end
+strictHtmlBlock = do
+  failUnlessBeginningOfLine
+  htmlInBalanced (not . isInlineTag)
+
+rawVerbatimBlock :: GenParser Char ParserState String
+rawVerbatimBlock = try $ do
+  (TagOpen tag _, open) <- htmlTag (tagOpen (\t ->
+                                      t == "pre" || t == "style" || t == "script")
+                                     (const True))
+  contents <- manyTill anyChar (htmlTag (~== TagClose tag))
+  return $ open ++ contents ++ renderTags [TagClose tag]

 rawHtmlBlocks :: GenParser Char ParserState Block
 rawHtmlBlocks = do
-  htmlBlocks <- many1 $ do (RawHtml blk) <- rawHtmlBlock
+  htmlBlocks <- many1 $ do blk <- rawVerbatimBlock <|>
+                                   liftM snd (htmlTag isBlockTag)
                           sps <- do sp1 <- many spaceChar
                                     sp2 <- option "" (blankline >> return "\n")
                                     sp3 <- many spaceChar
@ -921,7 +919,7 @@ inlineParsers = [ str
                , subscript
                , inlineNote  -- after superscript because of ^[link](/foo)^
                , autoLink
-                , rawHtmlInline'
+                , rawHtmlInline
                , rawLaTeXInline'
                , escapedChar
                , exampleRef
@ -1221,12 +1219,12 @@ inBrackets parser = do
  char ']'
  return $ "[" ++ contents ++ "]"

-rawHtmlInline' :: GenParser Char ParserState Inline
-rawHtmlInline' = do
+rawHtmlInline :: GenParser Char ParserState Inline
+rawHtmlInline = do
  st <- getState
-  result <- if stateStrict st
-               then choice [htmlBlockElement, anyHtmlTag, anyHtmlEndTag] 
-               else choice [htmlComment, anyHtmlInlineTag]
+  (_,result) <- if stateStrict st
+                   then htmlTag (not . isTextTag)
+                   else htmlTag isInlineTag
  return $ HtmlInline result

 -- Citations
@ -1315,3 +1313,4 @@ citation = try $ do
                     , citationNoteNum = 0
                     , citationHash    = 0
                     }
+
--- a/src/Text/Pandoc/Readers/Textile.hs
+++ b/src/Text/Pandoc/Readers/Textile.hs
@ -58,10 +58,9 @@ module Text.Pandoc.Readers.Textile ( readTextile) where
 import Text.Pandoc.Definition
 import Text.Pandoc.Shared 
 import Text.Pandoc.Parsing
-import Text.Pandoc.Readers.HTML ( htmlTag, htmlEndTag, -- find code blocks
-                                  rawHtmlBlock, rawHtmlInline )
-- import Text.Pandoc.Readers.Markdown (smartPunctuation)
+import Text.Pandoc.Readers.HTML ( htmlTag, isInlineTag, isBlockTag )
 import Text.ParserCombinators.Parsec
+import Text.HTML.TagSoup.Match
 import Data.Char ( digitToInt, isLetter )
 import Control.Monad ( guard, liftM )

@ -127,7 +126,7 @@ blockParsers = [ codeBlock
               , blockQuote
               , hrule
               , anyList
-               , rawHtmlBlock'
+               , rawHtmlBlock
               , maybeExplicitBlock "table" table
               , maybeExplicitBlock "p" para
               , nullBlock ]
@ -139,8 +138,8 @@ block = choice blockParsers <?> "block"
 -- | Code Blocks in Textile are between <pre> and </pre>
 codeBlock :: GenParser Char ParserState Block
 codeBlock = try $ do
-  htmlTag False "pre"
-  result' <- manyTill anyChar (try $ htmlEndTag "pre" >> blockBreak)
+  htmlTag (tagOpen (=="pre") null)
+  result' <- manyTill anyChar (try $ htmlTag (tagClose (=="pre")) >> blockBreak)
  -- drop leading newline if any
  let result'' = case result' of
                      '\n':xs -> xs
@ -261,21 +260,19 @@ definitionListItem = try $ do
          -- this ++ "\n\n" does not look very good
          ds <- parseFromString parseBlocks (s ++ "\n\n")
          return [ds]
-  
-  
+
 -- | This terminates a block such as a paragraph. Because of raw html
 -- blocks support, we have to lookAhead for a rawHtmlBlock.
 blockBreak :: GenParser Char ParserState ()
-blockBreak = try $ choice 
-             [newline >> blanklines >> return (),
-              lookAhead rawHtmlBlock' >> return ()]
+blockBreak = try (newline >> blanklines >> return ()) <|>
+              (lookAhead rawHtmlBlock >> return ())

 -- | A raw Html Block, optionally followed by blanklines
-rawHtmlBlock' :: GenParser Char ParserState Block
-rawHtmlBlock' = try $ do
-  b <- rawHtmlBlock
+rawHtmlBlock :: GenParser Char ParserState Block
+rawHtmlBlock = try $ do
+  (_,b) <- htmlTag isBlockTag
  optional blanklines
-  return b
+  return $ RawHtml b

 -- | In textile, paragraphs are separated by blank lines.
 para :: GenParser Char ParserState Block
@ -450,6 +447,9 @@ endline = try $ do
  newline >> notFollowedBy blankline
  return LineBreak

+rawHtmlInline :: GenParser Char ParserState Inline
+rawHtmlInline = liftM (HtmlInline . snd) $ htmlTag isInlineTag
+
 -- | Textile standard link syntax is label:"target"
 link :: GenParser Char ParserState Inline
 link = try $ do
--- a/tests/html-reader.native
+++ b/tests/html-reader.native
@ -269,8 +269,8 @@ Pandoc (Meta {docTitle = [Str "Pandoc",Space,Str "Test",Space,Str "Suite"], docA
  , [ Plain [Str "section:",Space,Str "\167"] ]
  , [ Plain [Str "set",Space,Str "membership:",Space,Str "\8712"] ]
  , [ Plain [Str "copyright:",Space,Str "\169"] ] ]
-, Para [Str "AT",Str "&",Str "T",Space,Str "has",Space,Str "an",Space,Str "ampersand",Space,Str "in",Space,Str "their",Space,Str "name",Str "."]
-, Para [Str "AT",Str "&",Str "T",Space,Str "is",Space,Str "another",Space,Str "way",Space,Str "to",Space,Str "write",Space,Str "it",Str "."]
+, Para [Str "AT&T",Space,Str "has",Space,Str "an",Space,Str "ampersand",Space,Str "in",Space,Str "their",Space,Str "name",Str "."]
+, Para [Str "AT&T",Space,Str "is",Space,Str "another",Space,Str "way",Space,Str "to",Space,Str "write",Space,Str "it",Str "."]
 , Para [Str "This",Space,Str "&",Space,Str "that",Str "."]
 , Para [Str "4",Space,Str "<",Space,Str "5",Str "."]
 , Para [Str "6",Space,Str ">",Space,Str "5",Str "."]
@ -316,11 +316,11 @@ Pandoc (Meta {docTitle = [Str "Pandoc",Space,Str "Test",Space,Str "Suite"], docA
 , Para [Str "Foo",Space,Link [Str "biz"] ("/url/","Title with \"quote\" inside"),Str "."]
 , Header 2 [Str "With",Space,Str "ampersands"]
 , Para [Str "Here",Str "'",Str "s",Space,Str "a",Space,Link [Str "link",Space,Str "with",Space,Str "an",Space,Str "ampersand",Space,Str "in",Space,Str "the",Space,Str "URL"] ("http://example.com/?foo=1&bar=2",""),Str "."]
-, Para [Str "Here",Str "'",Str "s",Space,Str "a",Space,Str "link",Space,Str "with",Space,Str "an",Space,Str "amersand",Space,Str "in",Space,Str "the",Space,Str "link",Space,Str "text:",Space,Link [Str "AT",Str "&",Str "T"] ("http://att.com/","AT&T"),Str "."]
+, Para [Str "Here",Str "'",Str "s",Space,Str "a",Space,Str "link",Space,Str "with",Space,Str "an",Space,Str "amersand",Space,Str "in",Space,Str "the",Space,Str "link",Space,Str "text:",Space,Link [Str "AT&T"] ("http://att.com/","AT&T"),Str "."]
 , Para [Str "Here",Str "'",Str "s",Space,Str "an",Space,Link [Str "inline",Space,Str "link"] ("/script?foo=1&bar=2",""),Str "."]
 , Para [Str "Here",Str "'",Str "s",Space,Str "an",Space,Link [Str "inline",Space,Str "link",Space,Str "in",Space,Str "pointy",Space,Str "braces"] ("/script?foo=1&bar=2",""),Str "."]
 , Header 2 [Str "Autolinks"]
-, Para [Str "With",Space,Str "an",Space,Str "ampersand:",Space,Link [Str "http://example",Str ".",Str "com/?foo=1",Str "&",Str "bar=2"] ("http://example.com/?foo=1&bar=2","")]
+, Para [Str "With",Space,Str "an",Space,Str "ampersand:",Space,Link [Str "http://example",Str ".",Str "com/?foo=1&bar=2"] ("http://example.com/?foo=1&bar=2","")]
 , BulletList
  [ [ Plain [Str "In",Space,Str "a",Space,Str "list?"] ]
  , [ Plain [Link [Str "http://example",Str ".",Str "com/"] ("http://example.com/","")] ]
--- a/tests/testsuite.txt
+++ b/tests/testsuite.txt
@ -431,21 +431,21 @@ Hr's:

 <hr>

-<hr/>
+<hr />

 <hr />

 <hr>   

-<hr/>  
+<hr />  

 <hr /> 

 <hr class="foo" id="bar" />

-<hr class="foo" id="bar"/>
+<hr class="foo" id="bar" />

-<hr class="foo" id="bar" >
+<hr class="foo" id="bar">

 -----