pandoc/src/Text/Pandoc/Readers/HTML.hs

-- | Converts HTML to 'Pandoc' document.
module Text.Pandoc.Readers.HTML (
                                 readHtml,
                                 rawHtmlInline,
                                 rawHtmlBlock,
                                 anyHtmlBlockTag,
                                 anyHtmlInlineTag
                                ) where

import Text.Regex ( matchRegex, mkRegex )
import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Pandoc
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.HtmlEntities ( decodeEntities, htmlEntityToChar )
import Maybe ( fromMaybe )
import Char ( toUpper, toLower )

-- | Parse a string of HTML into a 'Pandoc' document by running
-- 'parseHtml' over it, starting from the supplied parser state.
readHtml :: ParserState   -- ^ Initial parser state
         -> String        -- ^ HTML source text
         -> Pandoc
readHtml state input = readWith parseHtml state input

-- Debugging aid: hands 'parseHtml' and the given string to
-- 'testStringWith'.
testString :: String -> IO ()
testString input = testStringWith parseHtml input

--
-- Constants
--

-- | Lowercase names of the elements that 'isInline' treats as inline;
-- every other tag is handled as a block-level tag.
inlineHtmlTags :: [String]
inlineHtmlTags =
  [ "a", "abbr", "acronym", "b", "basefont", "bdo", "big", "br"
  , "cite", "code", "dfn", "em", "font", "i", "img", "input"
  , "kbd", "label", "q", "s", "samp", "select", "small", "span"
  , "strike", "strong", "sub", "sup", "textarea", "tt", "u", "var"
  ]

--
-- HTML utility functions
--

-- | Parse blocks, each followed by optional whitespace, up to and
-- including the end tag for @tag@.  Consumes nothing on failure.
blocksTilEnd tag = try (manyTill spacedBlock (htmlEndTag tag))
  where spacedBlock = do
          blk <- block
          spaces
          return blk

-- | Parse inline elements up to and including the end tag for @tag@.
-- Consumes nothing on failure.
inlinesTilEnd tag = try $ manyTill inline $ htmlEndTag tag

-- | Extract the lowercased element name from raw tag text:
-- e.g. @"br"@ from @<br>@, @< br >@, @</BR>@, etc.
-- Returns @""@ when the regular expression finds no match.
extractTagType tag =
  case matchRegex (mkRegex "<[[:space:]]*/?([A-Za-z0-9]+)") tag of
    -- The pattern has a single capture group; matching on the head keeps
    -- the case expression total whatever the length of the group list.
    Just (match:_) -> map toLower match
    _              -> ""

-- | Parse any opening tag (optionally self-closing) and return it
-- re-serialized as @<name attrs>@ or @<name attrs />@, where the
-- attribute text comes from 'htmlAttributes'.
anyHtmlTag = try $ do
  char '<'
  spaces
  name <- many1 alphaNum
  attrText <- htmlAttributes
  spaces
  -- a trailing slash is rendered as " /" in the output
  selfClose <- option "" (string "/" >> return " /")
  spaces
  char '>'
  return $ concat ["<", name, attrText, selfClose, ">"]

-- | Parse any closing tag, tolerating whitespace around the slash and
-- the name, and return it in the compact form @</name>@.
anyHtmlEndTag = try $ do
  char '<' >> spaces
  char '/' >> spaces
  name <- many1 alphaNum
  spaces >> char '>'
  return $ "</" ++ name ++ ">"

-- | Parse an opening (or self-closing) tag with the given name, matched
-- case-insensitively.  Returns the name exactly as passed in, together
-- with the tag's attributes as (name, value) pairs.
-- Consumes nothing on failure.
htmlTag :: String -> GenParser Char st (String, [(String, String)])
htmlTag tag = try (do
  char '<'
  spaces
  stringAnyCase tag
  -- The tag name has to end here.  Attribute parsers skip *optional*
  -- whitespace, so without this check "p" would accept <pre> as a <p>
  -- with a minimized attribute "re", "b" would accept <br>, and so on.
  notFollowedBy (alphaNum <|> oneOf "-_:.")
  attribs <- many htmlAttribute
  spaces
  option "" (string "/")
  spaces
  char '>'
  return (tag, [(name, content) | (name, content, _) <- attribs]))

-- Parse an attribute value delimited on both sides by @quoteChar@.
-- Returns the text between the delimiters paired with the delimiter
-- as a one-character string.
quoted quoteChar = do
  value <- between delim delim (many (satisfy (/= quoteChar)))
  return (value, [quoteChar])
  where delim = char quoteChar

-- Parse zero or more attributes and return their raw renderings
-- concatenated into a single string.
htmlAttributes = do
  attrs <- many htmlAttribute
  return (concat [raw | (_, _, raw) <- attrs])

-- A single attribute: the @name=value@ form is tried first, then the
-- minimized (name-only) form.
htmlAttribute = choice [htmlRegularAttribute, htmlMinimizedAttribute]

-- Minimized boolean attribute (a bare name with no @=@ and no value).
-- The name doubles as the value; the raw rendering is the name
-- preceded by one space.
htmlMinimizedAttribute = try $ do
  spaces
  attrName <- many1 (letter <|> oneOf ".-_:")
  spaces
  notFollowedBy (char '=')
  return (attrName, attrName, ' ' : attrName)

-- Attribute of the form @name=value@.  The value may be single-quoted,
-- double-quoted, or an unquoted (possibly empty) run of alphanumerics
-- and @-._:@.  Returns (name, value, raw rendering); the rendering
-- reuses whichever quote character was found in the input.
htmlRegularAttribute = try $ do
  spaces
  attrName <- many1 (letter <|> oneOf ".-_:")
  spaces
  char '='
  spaces
  (value, q) <- quoted '\'' <|> quoted '"' <|> bare
  return (attrName, value, concat [" ", attrName, "=", q, value, q])
  where bare = do
          v <- many (alphaNum <|> oneOf "-._:")
          return (v, "")

-- | Parse the closing tag for @tag@ (name matched case-insensitively,
-- whitespace tolerated around the slash and the name) and return
-- @</tag>@ built from the name as given.
htmlEndTag tag = try $ do
  char '<' >> spaces
  char '/' >> spaces
  stringAnyCase tag
  spaces >> char '>'
  return $ concat ["</", tag, ">"]

-- | @True@ when the element name extracted from the raw tag text is
-- listed in 'inlineHtmlTags'.
isInline :: String -> Bool
isInline tag = elem (extractTagType tag) inlineHtmlTags

-- | Parse any opening or closing tag that is not an inline tag and
-- return its normalized text; fails (consuming nothing) on inline tags.
anyHtmlBlockTag = try $ do
  tag <- anyHtmlTag <|> anyHtmlEndTag
  case isInline tag of
    True  -> fail "inline tag"
    False -> return tag

-- | Parse any opening or closing inline tag and return its normalized
-- text; fails (consuming nothing) on non-inline tags.
anyHtmlInlineTag = try $ do
  tag <- anyHtmlTag <|> anyHtmlEndTag
  if not (isInline tag)
     then fail "not an inline tag"
     else return tag

-- Scripts need their own parser because their bodies can contain
-- characters such as < and >.  Everything from @<script@ up to the
-- matching end tag is returned verbatim, closed with @</script>@.
htmlScript = try $ do
  opener <- string "<script"
  body <- manyTill anyChar (htmlEndTag "script")
  return $ concat [opener, body, "</script>"]

-- | Parse a chunk of block-level raw HTML: a script, a non-inline tag,
-- a comment, a @<?...>@ declaration or a @<!...>@ definition, plus any
-- whitespace that follows.  Refuses to start at @</body>@ or @</html>@.
-- Produces 'RawHtml' when the state asks for raw HTML, 'Null' otherwise.
rawHtmlBlock = do
  notFollowedBy (choice [htmlTag "/body", htmlTag "/html"] >> return ' ')
  raw <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec, definition]
  trailing <- many space
  st <- getState
  return $ if stateParseRaw st
              then RawHtml (raw ++ trailing)
              else Null

-- | Parse an HTML comment and return it verbatim, delimiters included.
htmlComment = try $ do
  opener <- string "<!--"
  body <- manyTill anyChar (try (string "-->"))
  return $ opener ++ body ++ "-->"

--
-- parsing documents
--

-- | Parse a @<? ... >@ construct (e.g. an XML declaration) up to the
-- first @>@ and return it verbatim.
xmlDec = try $ do
  opener <- string "<?"
  body <- manyTill anyChar (char '>')
  return $ opener ++ body ++ ">"

-- | Parse a @<! ... >@ construct (e.g. a doctype) up to the first @>@
-- and return it verbatim.
definition = try $ do
  opener <- string "<!"
  body <- manyTill anyChar (char '>')
  return $ opener ++ body ++ ">"

-- Consume one unit of head content that is neither the start of
-- @<title>@ nor @</head>@: either a raw HTML block (reported as a
-- space) or a single character.
nonTitleNonHead = try $ do
  notFollowedBy' (htmlTag "title")
  notFollowedBy' (htmlTag "/head")
  (rawHtmlBlock >> return ' ') <|> anyChar

-- Parse a @<title>@ element and return its contents as inlines,
-- skipping any whitespace after the end tag.
parseTitle = try $ do
  (tag, _) <- htmlTag "title"
  titleInlines <- inlinesTilEnd tag
  spaces
  return titleInlines

-- Parse the @<head>@ element and return meta-information as a
-- (title, authors, date) triple.  For now only the title is extracted;
-- authors and date are always empty.
parseHead = try $ do
  htmlTag "head" >> spaces
  skipMany nonTitleNonHead
  titleInlines <- option [] parseTitle
  skipMany nonTitleNonHead
  htmlTag "/head"
  return (titleInlines, [], "")

-- Consume the given opening tag if it is present; when it is absent,
-- succeed without consuming input and yield an empty name and
-- attribute list.
skipHtmlTag tag = htmlTag tag <|> return ("", [])

-- Representation of the title inside the body: an @<h1>@ whose class
-- attribute is exactly "title".  The element and its contents are
-- consumed and an empty string is returned; any other @<h1>@ makes the
-- parser fail without consuming input.
bodyTitle = try $ do
  (_, attribs) <- htmlTag "h1"
  case extractAttribute "class" attribs of
    Just "title" -> return ""
    _            -> fail "not title"
  inlinesTilEnd "h1"
  return ""

-- | Top-level parser for an HTML document.  Skips leading declarations
-- and comments and the optional @<html>@ / @<body>@ wrappers, reads the
-- head (if any) for the title, parses the body into blocks, and appends
-- the key blocks accumulated in the parser state (in reverse order of
-- the stored list).
parseHtml = do
  sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
  skipHtmlTag "html" >> spaces
  (docTitle, docAuthors, docDate) <- option ([], [], "") parseHead
  spaces
  skipHtmlTag "body" >> spaces
  option "" bodyTitle  -- skip title in body, because it's represented in meta
  body <- parseBlocks
  spaces
  option "" (htmlEndTag "body") >> spaces
  option "" (htmlEndTag "html")
  many anyChar -- ignore anything after </html>
  eof
  finalState <- getState
  let meta      = Meta docTitle docAuthors docDate
      refBlocks = reverse (stateKeyBlocks finalState)
  return (Pandoc meta (body ++ refBlocks))

--
-- parsing blocks
--

-- Skip leading whitespace, then parse a sequence of blocks separated
-- (and optionally terminated) by whitespace.
parseBlocks = spaces >> sepEndBy block spaces

-- A single block-level element; alternatives are tried in the order
-- listed, with raw HTML as the last resort.
block = (codeBlock <|> header <|> hrule <|> list <|> blockQuote <|> para
         <|> plain <|> rawHtmlBlock) <?> "block"

--
-- header blocks
--

-- Any heading element.  HTML defines six heading levels (h1..h6), so
-- all six are tried; previously h6 was left out.
header = choice (map headerLevel [1..6]) <?> "header"

-- Parse the heading element of level @n@ (tag @h<n>@) into a 'Header'
-- of that level, with its inline contents space-normalized.
headerLevel n = try $ do
    let tagName = 'h' : show n
    htmlTag tagName
    contents <- inlinesTilEnd tagName
    return $ Header n (normalizeSpaces contents)

--
-- hrule block
--

-- Parse an @<hr>@ tag as a 'HorizontalRule'.  When raw HTML is being
-- preserved and the tag carries attributes, the parser deliberately
-- fails so that the tag is picked up as raw HTML instead.
hrule = try $ do
  (_, attribs) <- htmlTag "hr"
  st <- getState
  let keepRaw = stateParseRaw st && not (null attribs)
  if keepRaw
     then unexpected "attributes in hr"  -- in this case we want to parse it as raw html
     else return HorizontalRule

--
-- code blocks
--

-- A code block: @<pre><code>@ is tried first, then a bare @<code>@.
codeBlock = (preCodeBlock <|> bareCodeBlock) <?> "code block"

-- Parse @<pre><code> ... </code></pre>@ (whitespace allowed between
-- the paired tags) into a 'CodeBlock' with entities decoded.
preCodeBlock = try $ do
    htmlTag "pre" >> spaces
    htmlTag "code"
    raw <- manyTill anyChar (htmlEndTag "code")
    spaces >> htmlEndTag "pre"
    return $ CodeBlock $ decodeEntities raw

-- Parse a @<code> ... </code>@ element in block position into a
-- 'CodeBlock' with entities decoded.
bareCodeBlock = try $
    htmlTag "code" >> manyTill anyChar (htmlEndTag "code") >>=
    return . CodeBlock . decodeEntities

--
-- block quotes
--

-- Parse a @<blockquote>@ element; its contents are parsed as blocks.
blockQuote = try $ do
    htmlTag "blockquote" >> spaces
    contents <- blocksTilEnd "blockquote"
    return $ BlockQuote contents

--
-- list blocks
--

-- A list: bullet lists are tried before ordered lists.
list = (bulletList <|> orderedList) <?> "list"

-- Parse an @<ol>@ element containing at least one list item.
orderedList = try $ do
    htmlTag "ol" >> spaces
    items <- sepEndBy1 listItem spaces
    htmlEndTag "ol"
    return $ OrderedList items

-- Parse a @<ul>@ element containing at least one list item.
bulletList = try $ do
    htmlTag "ul" >> spaces
    items <- sepEndBy1 listItem spaces
    htmlEndTag "ul"
    return $ BulletList items

-- Parse an @<li>@ element and return the blocks it contains.
listItem = try $ htmlTag "li" >> spaces >> blocksTilEnd "li"

--
-- paragraph block
--

-- Parse a @<p>@ element into a 'Para' of space-normalized inlines.
para = try $
  htmlTag "p" >> inlinesTilEnd "p" >>= return . Para . normalizeSpaces

--
-- plain block
--

-- One or more inlines that are not wrapped in a paragraph tag,
-- collected into a 'Plain' block with spaces normalized.
plain = many1 inline >>= return . Plain . normalizeSpaces

--
-- inline
--

-- A single inline element: textual forms first, then links, images
-- and raw inline HTML.
inline = (text <|> special) <?> "inline"

-- Textual inline elements, tried in the order listed.
text = (entity <|> strong <|> emph <|> code <|> str <|> linebreak <|> whitespace)
       <?> "text"

-- Inline elements that come from tags: links, images, and any other
-- inline HTML (kept raw).
special = (link <|> image <|> rawHtmlInline) <?> "link, inline html, or image"

-- Parse a character entity reference (@&name;@ or @&#digits;@) into a
-- one-character 'Str'.  An entity that 'htmlEntityToChar' does not
-- recognize becomes @?@.
entity = try (do
  char '&'
  -- Entity names may contain digits (e.g. sup2, frac12, there4), so the
  -- name is read with alphaNum rather than letter.  '#' is not
  -- alphanumeric, so numeric references still take the second branch.
  body <- choice [(many1 alphaNum),
                  (try (do{ char '#'; num <- many1 digit; return ("#" ++ num)}))]
  char ';'
  return (Str [fromMaybe '?' (htmlEntityToChar ("&" ++ body ++ ";"))]))

-- Parse an inline @<code>@ element into 'Code'.  Internal line breaks
-- are replaced by single spaces, leading and trailing space is removed,
-- and entities are decoded.
code = try $ do
  htmlTag "code"
  raw <- manyTill anyChar (htmlEndTag "code")
  let flattened = joinWithSep " " (lines raw)
  return $ Code $ decodeEntities $ removeLeadingTrailingSpace flattened

-- | Parse a script or any inline tag.  Produces 'HtmlInline' with the
-- tag text when the state asks for raw HTML, and an empty 'Str'
-- otherwise.
rawHtmlInline = do
  raw <- choice [htmlScript, anyHtmlInlineTag]
  st <- getState
  return $ if stateParseRaw st
              then HtmlInline raw
              else Str ""

-- Parse the element named @tag@ and return its inline contents with
-- spaces normalized.
betweenTags tag = try $
  htmlTag tag >> inlinesTilEnd tag >>= return . normalizeSpaces

-- Emphasized text: the contents of an @<em>@ or @<i>@ element.
-- (The italic element is @i@; the name "it" used previously is not an
-- HTML element, so @<i>@ was never recognized as emphasis.)
emph = try (do
  result <- choice [betweenTags "em", betweenTags "i"]
  return (Emph result))

-- Strongly emphasized text: the contents of a @<b>@ or @<strong>@
-- element.
strong = try $ (betweenTags "b" <|> betweenTags "strong") >>= return . Strong

-- Any run of one or more whitespace characters collapses to a single
-- 'Space'.
whitespace = many1 space >> return Space

-- Hard line break: a @<br>@ tag.
linebreak = htmlTag "br" >> return LineBreak

-- A run of ordinary characters (anything except @<@, @&@, space, tab
-- and newline), passed through 'decodeEntities'.
str = many1 (noneOf "<& \t\n") >>= return . Str . decodeEntities

--
-- links and images
--

-- Look up the value of the first attribute whose name equals @name@,
-- comparing names case-insensitively.  Returns 'Nothing' when no
-- attribute matches.
extractAttribute :: String -> [(String, a)] -> Maybe a
extractAttribute name attrs =
  lookup (lower name) [(lower key, value) | (key, value) <- attrs]
  where lower = map toLower

-- Parse an @<a>@ element that has an @href@ attribute into a 'Link'.
-- The label is the element's space-normalized contents; the target is
-- whatever 'generateReference' returns for the URL and the (possibly
-- empty) title.  Fails without consuming input when @href@ is missing.
link = try $ do
  (_, attrs) <- htmlTag "a"
  url <- maybe (fail "no href") return (extractAttribute "href" attrs)
  let title = fromMaybe "" (extractAttribute "title" attrs)
  label <- inlinesTilEnd "a"
  ref <- generateReference url title
  return $ Link (normalizeSpaces label) ref

-- Parse an @<img>@ tag that has a @src@ attribute into an 'Image'.
-- The @alt@ text (empty if absent) becomes the description; the target
-- is whatever 'generateReference' returns for the URL and the (possibly
-- empty) title.  Fails without consuming input when @src@ is missing.
image = try $ do
  (_, attrs) <- htmlTag "img"
  url <- maybe (fail "no src") return (extractAttribute "src" attrs)
  let attrOrEmpty key = fromMaybe "" (extractAttribute key attrs)
  ref <- generateReference url (attrOrEmpty "title")
  return $ Image [Str (attrOrEmpty "alt")] ref