a2194f23db
HTML reader. Thanks to Bradley Sif for the patch for Strikeout (Issue #18). git-svn-id: https://pandoc.googlecode.com/svn/trunk@753 788f1e2b-df1e-0410-8736-df70ead52e1b
492 lines
13 KiB
Haskell
492 lines
13 KiB
Haskell
{-
|
|
Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
-}
|
|
|
|
{- |
|
|
Module : Text.Pandoc.Readers.HTML
|
|
Copyright : Copyright (C) 2006-7 John MacFarlane
|
|
License : GNU GPL, version 2 or above
|
|
|
|
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
|
Stability : alpha
|
|
Portability : portable
|
|
|
|
Conversion of HTML to 'Pandoc' document.
|
|
-}
|
|
module Text.Pandoc.Readers.HTML (
|
|
readHtml,
|
|
rawHtmlInline,
|
|
rawHtmlBlock,
|
|
anyHtmlBlockTag,
|
|
anyHtmlInlineTag,
|
|
anyHtmlTag,
|
|
anyHtmlEndTag,
|
|
htmlEndTag,
|
|
extractTagType,
|
|
htmlBlockElement
|
|
) where
|
|
|
|
import Text.ParserCombinators.Parsec
|
|
import Text.Pandoc.ParserCombinators
|
|
import Text.Pandoc.Definition
|
|
import Text.Pandoc.Shared
|
|
import Text.Pandoc.Entities ( characterEntity, decodeEntities )
|
|
import Data.Maybe ( fromMaybe )
|
|
import Data.List ( intersect, takeWhile, dropWhile )
|
|
import Data.Char ( toUpper, toLower, isAlphaNum )
|
|
|
|
-- | Convert HTML-formatted string to 'Pandoc' document.
|
|
readHtml :: ParserState -- ^ Parser state
|
|
-> String -- ^ String to parse
|
|
-> Pandoc
|
|
readHtml = readWith parseHtml
|
|
|
|
-- for testing
|
|
testString :: String -> IO ()
|
|
testString = testStringWith parseHtml
|
|
|
|
--
|
|
-- Constants
|
|
--
|
|
|
|
inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
|
|
"br", "cite", "code", "dfn", "em", "font", "i", "img",
|
|
"input", "kbd", "label", "q", "s", "samp", "select",
|
|
"small", "span", "strike", "strong", "sub", "sup",
|
|
"textarea", "tt", "u", "var"]
|
|
|
|
--
|
|
-- HTML utility functions
|
|
--
|
|
|
|
-- | Read blocks until end tag.
|
|
blocksTilEnd tag = try (do
|
|
blocks <- manyTill (do {b <- block; spaces; return b}) (htmlEndTag tag)
|
|
return blocks)
|
|
|
|
-- | Read inlines until end tag.
|
|
inlinesTilEnd tag = try (do
|
|
inlines <- manyTill inline (htmlEndTag tag)
|
|
return inlines)
|
|
|
|
-- | Extract type from a tag: e.g. @br@ from @\<br\>@
|
|
extractTagType :: String -> String
|
|
extractTagType ('<':rest) =
|
|
let isSpaceOrSlash c = c `elem` "/ \n\t" in
|
|
map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
|
|
extractTagType _ = ""
|
|
|
|
-- | Parse any HTML tag (closing or opening) and return text of tag
|
|
anyHtmlTag = try (do
|
|
char '<'
|
|
spaces
|
|
tag <- many1 alphaNum
|
|
attribs <- htmlAttributes
|
|
spaces
|
|
ender <- option "" (string "/")
|
|
let ender' = if (null ender) then "" else " /"
|
|
spaces
|
|
char '>'
|
|
return ("<" ++ tag ++ attribs ++ ender' ++ ">"))
|
|
|
|
anyHtmlEndTag = try (do
|
|
char '<'
|
|
spaces
|
|
char '/'
|
|
spaces
|
|
tagType <- many1 alphaNum
|
|
spaces
|
|
char '>'
|
|
return ("</" ++ tagType ++ ">"))
|
|
|
|
htmlTag :: String -> GenParser Char st (String, [(String, String)])
|
|
htmlTag tag = try (do
|
|
char '<'
|
|
spaces
|
|
stringAnyCase tag
|
|
attribs <- many htmlAttribute
|
|
spaces
|
|
option "" (string "/")
|
|
spaces
|
|
char '>'
|
|
return (tag, (map (\(name, content, raw) -> (name, content)) attribs)))
|
|
|
|
-- parses a quoted html attribute value
|
|
quoted quoteChar = do
|
|
result <- between (char quoteChar) (char quoteChar)
|
|
(many (noneOf [quoteChar]))
|
|
return (result, [quoteChar])
|
|
|
|
htmlAttributes = do
|
|
attrList <- many htmlAttribute
|
|
return (concatMap (\(name, content, raw) -> raw) attrList)
|
|
|
|
htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
|
|
|
|
-- minimized boolean attribute (no = and value)
|
|
htmlMinimizedAttribute = try (do
|
|
many1 space
|
|
name <- many1 (choice [letter, oneOf ".-_:"])
|
|
spaces
|
|
notFollowedBy (char '=')
|
|
let content = name
|
|
return (name, content, (" " ++ name)))
|
|
|
|
htmlRegularAttribute = try (do
|
|
many1 space
|
|
name <- many1 (choice [letter, oneOf ".-_:"])
|
|
spaces
|
|
char '='
|
|
spaces
|
|
(content, quoteStr) <- choice [ (quoted '\''),
|
|
(quoted '"'),
|
|
(do
|
|
a <- many (alphaNum <|> (oneOf "-._:"))
|
|
return (a,"")) ]
|
|
return (name, content,
|
|
(" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr)))
|
|
|
|
-- | Parse an end tag of type 'tag'
|
|
htmlEndTag tag = try (do
|
|
char '<'
|
|
spaces
|
|
char '/'
|
|
spaces
|
|
stringAnyCase tag
|
|
spaces
|
|
char '>'
|
|
return ("</" ++ tag ++ ">"))
|
|
|
|
-- | Returns @True@ if the tag is an inline tag.
|
|
isInline tag = (extractTagType tag) `elem` inlineHtmlTags
|
|
|
|
anyHtmlBlockTag = try (do
|
|
tag <- choice [anyHtmlTag, anyHtmlEndTag]
|
|
if isInline tag then fail "inline tag" else return tag)
|
|
|
|
anyHtmlInlineTag = try (do
|
|
tag <- choice [ anyHtmlTag, anyHtmlEndTag ]
|
|
if isInline tag then return tag else fail "not an inline tag")
|
|
|
|
-- | Parses material between script tags.
|
|
-- Scripts must be treated differently, because they can contain '<>' etc.
|
|
htmlScript = try (do
|
|
open <- string "<script"
|
|
rest <- manyTill anyChar (htmlEndTag "script")
|
|
return (open ++ rest ++ "</script>"))
|
|
|
|
htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
|
|
|
|
rawHtmlBlock = try (do
|
|
notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"])
|
|
body <- htmlBlockElement <|> anyHtmlBlockTag
|
|
sp <- (many space)
|
|
state <- getState
|
|
if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null)
|
|
|
|
-- | Parses an HTML comment.
|
|
htmlComment = try (do
|
|
string "<!--"
|
|
comment <- manyTill anyChar (try (string "-->"))
|
|
return ("<!--" ++ comment ++ "-->"))
|
|
|
|
--
|
|
-- parsing documents
|
|
--
|
|
|
|
xmlDec = try (do
|
|
string "<?"
|
|
rest <- manyTill anyChar (char '>')
|
|
return ("<?" ++ rest ++ ">"))
|
|
|
|
definition = try (do
|
|
string "<!"
|
|
rest <- manyTill anyChar (char '>')
|
|
return ("<!" ++ rest ++ ">"))
|
|
|
|
nonTitleNonHead = try (do
|
|
notFollowedBy' (htmlTag "title")
|
|
notFollowedBy' (htmlTag "/head")
|
|
result <- choice [do {rawHtmlBlock; return ' '}, anyChar]
|
|
return result)
|
|
|
|
parseTitle = try (do
|
|
(tag, attribs) <- htmlTag "title"
|
|
contents <- inlinesTilEnd tag
|
|
spaces
|
|
return contents)
|
|
|
|
-- parse header and return meta-information (for now, just title)
|
|
parseHead = try (do
|
|
htmlTag "head"
|
|
spaces
|
|
skipMany nonTitleNonHead
|
|
contents <- option [] parseTitle
|
|
skipMany nonTitleNonHead
|
|
htmlTag "/head"
|
|
return (contents, [], ""))
|
|
|
|
skipHtmlTag tag = option ("",[]) (htmlTag tag)
|
|
|
|
-- h1 class="title" representation of title in body
|
|
bodyTitle = try (do
|
|
(tag, attribs) <- htmlTag "h1"
|
|
cl <- case (extractAttribute "class" attribs) of
|
|
Just "title" -> do {return ""}
|
|
otherwise -> fail "not title"
|
|
inlinesTilEnd "h1"
|
|
return "")
|
|
|
|
parseHtml = do
|
|
sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
|
|
skipHtmlTag "html"
|
|
spaces
|
|
(title, authors, date) <- option ([], [], "") parseHead
|
|
spaces
|
|
skipHtmlTag "body"
|
|
spaces
|
|
option "" bodyTitle -- skip title in body, because it's represented in meta
|
|
blocks <- parseBlocks
|
|
spaces
|
|
option "" (htmlEndTag "body")
|
|
spaces
|
|
option "" (htmlEndTag "html")
|
|
many anyChar -- ignore anything after </html>
|
|
eof
|
|
return (Pandoc (Meta title authors date) blocks)
|
|
|
|
--
|
|
-- parsing blocks
|
|
--
|
|
|
|
parseBlocks = do
|
|
spaces
|
|
result <- sepEndBy block spaces
|
|
return result
|
|
|
|
block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain,
|
|
rawHtmlBlock ] <?> "block"
|
|
|
|
--
|
|
-- header blocks
|
|
--
|
|
|
|
header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
|
|
|
|
headerLevel n = try (do
|
|
let level = "h" ++ show n
|
|
(tag, attribs) <- htmlTag level
|
|
contents <- inlinesTilEnd level
|
|
return (Header n (normalizeSpaces contents)))
|
|
|
|
--
|
|
-- hrule block
|
|
--
|
|
|
|
hrule = try (do
|
|
(tag, attribs) <- htmlTag "hr"
|
|
state <- getState
|
|
if (not (null attribs)) && (stateParseRaw state)
|
|
then -- in this case we want to parse it as raw html
|
|
unexpected "attributes in hr"
|
|
else return HorizontalRule)
|
|
|
|
--
|
|
-- code blocks
|
|
--
|
|
|
|
codeBlock = choice [ preCodeBlock, bareCodeBlock ] <?> "code block"
|
|
|
|
preCodeBlock = try (do
|
|
htmlTag "pre"
|
|
spaces
|
|
htmlTag "code"
|
|
result <- manyTill anyChar (htmlEndTag "code")
|
|
spaces
|
|
htmlEndTag "pre"
|
|
return (CodeBlock (stripTrailingNewlines (decodeEntities result))))
|
|
|
|
bareCodeBlock = try (do
|
|
htmlTag "code"
|
|
result <- manyTill anyChar (htmlEndTag "code")
|
|
return (CodeBlock (stripTrailingNewlines (decodeEntities result))))
|
|
|
|
--
|
|
-- block quotes
|
|
--
|
|
|
|
blockQuote = try (do
|
|
tag <- htmlTag "blockquote"
|
|
spaces
|
|
blocks <- blocksTilEnd "blockquote"
|
|
return (BlockQuote blocks))
|
|
|
|
--
|
|
-- list blocks
|
|
--
|
|
|
|
list = choice [ bulletList, orderedList ] <?> "list"
|
|
|
|
orderedList = try (do
|
|
tag <- htmlTag "ol"
|
|
spaces
|
|
items <- sepEndBy1 listItem spaces
|
|
htmlEndTag "ol"
|
|
return (OrderedList items))
|
|
|
|
bulletList = try (do
|
|
tag <- htmlTag "ul"
|
|
spaces
|
|
items <- sepEndBy1 listItem spaces
|
|
htmlEndTag "ul"
|
|
return (BulletList items))
|
|
|
|
listItem = try (do
|
|
tag <- htmlTag "li"
|
|
spaces
|
|
blocks <- blocksTilEnd "li"
|
|
return blocks)
|
|
|
|
--
|
|
-- paragraph block
|
|
--
|
|
|
|
para = try (do
|
|
tag <- htmlTag "p"
|
|
result <- inlinesTilEnd "p"
|
|
return (Para (normalizeSpaces result)))
|
|
|
|
--
|
|
-- plain block
|
|
--
|
|
|
|
plain = do
|
|
result <- many1 inline
|
|
return (Plain (normalizeSpaces result))
|
|
|
|
--
|
|
-- inline
|
|
--
|
|
|
|
inline = choice [ text, special ] <?> "inline"
|
|
|
|
text = choice [ entity, strong, emph, superscript, subscript,
|
|
strikeout, spanStrikeout, code, str,
|
|
linebreak, whitespace ] <?> "text"
|
|
|
|
special = choice [ link, image, rawHtmlInline ] <?>
|
|
"link, inline html, or image"
|
|
|
|
entity = do
|
|
ent <- characterEntity
|
|
return $ Str [ent]
|
|
|
|
code = try (do
|
|
htmlTag "code"
|
|
result <- manyTill anyChar (htmlEndTag "code")
|
|
-- remove internal line breaks, leading and trailing space,
|
|
-- and decode entities
|
|
let result' = decodeEntities $ removeLeadingTrailingSpace $
|
|
joinWithSep " " $ lines result
|
|
return (Code result'))
|
|
|
|
rawHtmlInline = do
|
|
result <- choice [htmlScript, anyHtmlInlineTag]
|
|
state <- getState
|
|
if stateParseRaw state then return (HtmlInline result) else return (Str "")
|
|
|
|
betweenTags tag = try (do
|
|
htmlTag tag
|
|
result <- inlinesTilEnd tag
|
|
return (normalizeSpaces result))
|
|
|
|
emph = try (do
|
|
result <- choice [betweenTags "em", betweenTags "it"]
|
|
return (Emph result))
|
|
|
|
superscript = try $ do
|
|
failIfStrict -- strict markdown has no superscript, so treat as raw HTML
|
|
result <- betweenTags "sup"
|
|
return (Superscript result)
|
|
|
|
subscript = try $ do
|
|
failIfStrict -- strict markdown has no subscript, so treat as raw HTML
|
|
result <- betweenTags "sub"
|
|
return (Subscript result)
|
|
|
|
strikeout = try $ do
|
|
failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
|
|
result <- choice [betweenTags "s", betweenTags "strike"]
|
|
return (Strikeout result)
|
|
|
|
spanStrikeout = try $ do
|
|
failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
|
|
(tag, attributes) <- htmlTag "span"
|
|
result <- case (extractAttribute "class" attributes) of
|
|
Just "strikeout" -> inlinesTilEnd "span"
|
|
Nothing -> fail "not a strikeout"
|
|
return (Strikeout result)
|
|
|
|
strong = try (do
|
|
result <- choice [betweenTags "b", betweenTags "strong"]
|
|
return (Strong result))
|
|
|
|
whitespace = do
|
|
many1 space
|
|
return Space
|
|
|
|
-- hard line break
|
|
linebreak = do
|
|
htmlTag "br"
|
|
option ' ' newline
|
|
return LineBreak
|
|
|
|
str = do
|
|
result <- many1 (noneOf "<& \t\n")
|
|
return (Str result)
|
|
|
|
--
|
|
-- links and images
|
|
--
|
|
|
|
-- extract contents of attribute (attribute names are case-insensitive)
|
|
extractAttribute name [] = Nothing
|
|
extractAttribute name ((attrName, contents):rest) =
|
|
let name' = map toLower name
|
|
attrName' = map toLower attrName in
|
|
if (attrName' == name')
|
|
then Just (decodeEntities contents)
|
|
else extractAttribute name rest
|
|
|
|
link = try $ do
|
|
(tag, attributes) <- htmlTag "a"
|
|
url <- case (extractAttribute "href" attributes) of
|
|
Just url -> do {return url}
|
|
Nothing -> fail "no href"
|
|
let title = fromMaybe "" (extractAttribute "title" attributes)
|
|
label <- inlinesTilEnd "a"
|
|
return $ Link (normalizeSpaces label) (url, title)
|
|
|
|
image = try $ do
|
|
(tag, attributes) <- htmlTag "img"
|
|
url <- case (extractAttribute "src" attributes) of
|
|
Just url -> do {return url}
|
|
Nothing -> fail "no src"
|
|
let title = fromMaybe "" (extractAttribute "title" attributes)
|
|
let alt = fromMaybe "" (extractAttribute "alt" attributes)
|
|
return $ Image [Str alt] (url, title)
|
|
|