pandoc/src/Text/Pandoc/Readers/HTML.hs

469 lines
12 KiB
Haskell
Raw Normal View History

{-
Copyright (C) 2006 John MacFarlane <jgm at berkeley dot edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.HTML
Copyright : Copyright (C) 2006 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm at berkeley dot edu>
Stability : alpha
Portability : portable
Conversion of HTML to 'Pandoc' document.
-}
module Text.Pandoc.Readers.HTML (
readHtml,
rawHtmlInline,
rawHtmlBlock,
anyHtmlBlockTag,
anyHtmlInlineTag,
anyHtmlTag,
anyHtmlEndTag,
htmlEndTag,
extractTagType,
htmlBlockElement
) where
import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Pandoc
import Text.Pandoc.Definition
import Text.Pandoc.Shared
Changes in entity handling: + Entities are parsed (and unicode characters returned) in both Markdown and HTML readers. + Parsers characterEntity, namedEntity, decimalEntity, hexEntity added to Entities.hs; these parse a string and return a unicode character. + Changed 'entity' parser in HTML reader to use the 'characterEntity' parser from Entities.hs. + Added new 'entity' parser to Markdown reader, and added '&' as a special character. Adjusted test suite accordingly since now we get 'Str "AT",Str "&",Str "T"' instead of 'Str "AT&T".. + stringToSGML moved to Entities.hs. escapeSGML removed as redundant, given encodeEntities. + stringToSGML, encodeEntities, and specialCharToEntity are given a boolean parameter that causes only numerical entities to be used. This is used in the docbook writer. The HTML writer uses named entities where possible, but not all docbook-consumers know about the named entities without special instructions, so it seems safer to use numerical entities there. + decodeEntities is rewritten in a way that avoids Text.Regex, using the new parsers. + charToEntity and charToNumericalEntity added to Entities.hs. + Moved specialCharToEntity from Shared.hs to Entities.hs. + Removed unneeded 'decodeEntities' from 'str' parser in HTML and Markdown readers. + Removed sgmlHexEntity, sgmlDecimalEntity, sgmlNamedEntity, and sgmlCharacterEntity from Shared.hs. + Modified Docbook writer so that it doesn't rely on Text.Regex for detecting "mailto" links. git-svn-id: https://pandoc.googlecode.com/svn/trunk@515 788f1e2b-df1e-0410-8736-df70ead52e1b
2007-01-27 03:04:40 +00:00
import Text.Pandoc.Entities ( characterEntity, decodeEntities )
import Maybe ( fromMaybe )
import Data.List ( intersect, takeWhile, dropWhile )
import Data.Char ( toUpper, toLower, isAlphaNum )
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ParserState -- ^ Parser state
-> String -- ^ String to parse
-> Pandoc
readHtml = readWith parseHtml
-- for testing
testString :: String -> IO ()
testString = testStringWith parseHtml
--
-- Constants
--
inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
"br", "cite", "code", "dfn", "em", "font", "i", "img",
"input", "kbd", "label", "q", "s", "samp", "select",
"small", "span", "strike", "strong", "sub", "sup",
"textarea", "tt", "u", "var"]
--
-- HTML utility functions
--
-- | Read blocks until end tag.
blocksTilEnd tag = try (do
blocks <- manyTill (do {b <- block; spaces; return b}) (htmlEndTag tag)
return blocks)
-- | Read inlines until end tag.
inlinesTilEnd tag = try (do
inlines <- manyTill inline (htmlEndTag tag)
return inlines)
-- | Extract type from a tag: e.g. 'br' from '<br>'
extractTagType :: String -> String
extractTagType ('<':rest) =
let isSpaceOrSlash c = c `elem` "/ \n\t" in
map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
extractTagType _ = ""
-- | Parse any HTML tag (closing or opening) and return text of tag
anyHtmlTag = try (do
char '<'
spaces
tag <- many1 alphaNum
attribs <- htmlAttributes
spaces
ender <- option "" (string "/")
let ender' = if (null ender) then "" else " /"
spaces
char '>'
return ("<" ++ tag ++ attribs ++ ender' ++ ">"))
anyHtmlEndTag = try (do
char '<'
spaces
char '/'
spaces
tagType <- many1 alphaNum
spaces
char '>'
return ("</" ++ tagType ++ ">"))
htmlTag :: String -> GenParser Char st (String, [(String, String)])
htmlTag tag = try (do
char '<'
spaces
stringAnyCase tag
attribs <- many htmlAttribute
spaces
option "" (string "/")
spaces
char '>'
return (tag, (map (\(name, content, raw) -> (name, content)) attribs)))
-- parses a quoted html attribute value
quoted quoteChar = do
result <- between (char quoteChar) (char quoteChar)
(many (noneOf [quoteChar]))
return (result, [quoteChar])
htmlAttributes = do
attrList <- many htmlAttribute
return (concatMap (\(name, content, raw) -> raw) attrList)
htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
-- minimized boolean attribute (no = and value)
htmlMinimizedAttribute = try (do
many1 space
name <- many1 (choice [letter, oneOf ".-_:"])
spaces
notFollowedBy (char '=')
let content = name
return (name, content, (" " ++ name)))
htmlRegularAttribute = try (do
many1 space
name <- many1 (choice [letter, oneOf ".-_:"])
spaces
char '='
spaces
(content, quoteStr) <- choice [ (quoted '\''),
(quoted '"'),
(do
a <- many (alphaNum <|> (oneOf "-._:"))
return (a,"")) ]
return (name, content,
(" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr)))
-- | Parse an end tag of type 'tag'
htmlEndTag tag = try (do
char '<'
spaces
char '/'
spaces
stringAnyCase tag
spaces
char '>'
return ("</" ++ tag ++ ">"))
-- | Returns @True@ if the tag is an inline tag.
isInline tag = (extractTagType tag) `elem` inlineHtmlTags
anyHtmlBlockTag = try (do
tag <- choice [anyHtmlTag, anyHtmlEndTag]
if isInline tag then fail "inline tag" else return tag)
anyHtmlInlineTag = try (do
tag <- choice [ anyHtmlTag, anyHtmlEndTag ]
if isInline tag then return tag else fail "not an inline tag")
-- | Parses material between script tags.
-- Scripts must be treated differently, because they can contain '<>' etc.
htmlScript = try (do
open <- string "<script"
rest <- manyTill anyChar (htmlEndTag "script")
return (open ++ rest ++ "</script>"))
htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
rawHtmlBlock = try (do
notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"])
body <- htmlBlockElement <|> anyHtmlBlockTag
sp <- (many space)
state <- getState
if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null)
-- | Parses an HTML comment.
htmlComment = try (do
string "<!--"
comment <- manyTill anyChar (try (string "-->"))
return ("<!--" ++ comment ++ "-->"))
--
-- parsing documents
--
xmlDec = try (do
string "<?"
rest <- manyTill anyChar (char '>')
return ("<?" ++ rest ++ ">"))
definition = try (do
string "<!"
rest <- manyTill anyChar (char '>')
return ("<!" ++ rest ++ ">"))
nonTitleNonHead = try (do
notFollowedBy' (htmlTag "title")
notFollowedBy' (htmlTag "/head")
result <- choice [do {rawHtmlBlock; return ' '}, anyChar]
return result)
parseTitle = try (do
(tag, attribs) <- htmlTag "title"
contents <- inlinesTilEnd tag
spaces
return contents)
-- parse header and return meta-information (for now, just title)
parseHead = try (do
htmlTag "head"
spaces
skipMany nonTitleNonHead
contents <- option [] parseTitle
skipMany nonTitleNonHead
htmlTag "/head"
return (contents, [], ""))
skipHtmlTag tag = option ("",[]) (htmlTag tag)
-- h1 class="title" representation of title in body
bodyTitle = try (do
(tag, attribs) <- htmlTag "h1"
cl <- case (extractAttribute "class" attribs) of
Just "title" -> do {return ""}
otherwise -> fail "not title"
inlinesTilEnd "h1"
return "")
parseHtml = do
sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
skipHtmlTag "html"
spaces
(title, authors, date) <- option ([], [], "") parseHead
spaces
skipHtmlTag "body"
spaces
option "" bodyTitle -- skip title in body, because it's represented in meta
blocks <- parseBlocks
spaces
option "" (htmlEndTag "body")
spaces
option "" (htmlEndTag "html")
many anyChar -- ignore anything after </html>
eof
state <- getState
let keyBlocks = stateKeyBlocks state
return (Pandoc (Meta title authors date) (blocks ++ (reverse keyBlocks)))
--
-- parsing blocks
--
parseBlocks = do
spaces
result <- sepEndBy block spaces
return result
block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain,
rawHtmlBlock ] <?> "block"
--
-- header blocks
--
header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
headerLevel n = try (do
let level = "h" ++ show n
(tag, attribs) <- htmlTag level
contents <- inlinesTilEnd level
return (Header n (normalizeSpaces contents)))
--
-- hrule block
--
hrule = try (do
(tag, attribs) <- htmlTag "hr"
state <- getState
if (not (null attribs)) && (stateParseRaw state)
then -- in this case we want to parse it as raw html
unexpected "attributes in hr"
else return HorizontalRule)
--
-- code blocks
--
codeBlock = choice [ preCodeBlock, bareCodeBlock ] <?> "code block"
preCodeBlock = try (do
htmlTag "pre"
spaces
htmlTag "code"
result <- manyTill anyChar (htmlEndTag "code")
spaces
htmlEndTag "pre"
return (CodeBlock (stripTrailingNewlines (decodeEntities result))))
bareCodeBlock = try (do
htmlTag "code"
result <- manyTill anyChar (htmlEndTag "code")
return (CodeBlock (stripTrailingNewlines (decodeEntities result))))
--
-- block quotes
--
blockQuote = try (do
tag <- htmlTag "blockquote"
spaces
blocks <- blocksTilEnd "blockquote"
return (BlockQuote blocks))
--
-- list blocks
--
list = choice [ bulletList, orderedList ] <?> "list"
orderedList = try (do
tag <- htmlTag "ol"
spaces
items <- sepEndBy1 listItem spaces
htmlEndTag "ol"
return (OrderedList items))
bulletList = try (do
tag <- htmlTag "ul"
spaces
items <- sepEndBy1 listItem spaces
htmlEndTag "ul"
return (BulletList items))
listItem = try (do
tag <- htmlTag "li"
spaces
blocks <- blocksTilEnd "li"
return blocks)
--
-- paragraph block
--
para = try (do
tag <- htmlTag "p"
result <- inlinesTilEnd "p"
return (Para (normalizeSpaces result)))
--
-- plain block
--
plain = do
result <- many1 inline
return (Plain (normalizeSpaces result))
--
-- inline
--
inline = choice [ text, special ] <?> "inline"
text = choice [ entity, strong, emph, code, str, linebreak, whitespace ] <?> "text"
special = choice [ link, image, rawHtmlInline ] <?>
"link, inline html, or image"
Changes in entity handling: + Entities are parsed (and unicode characters returned) in both Markdown and HTML readers. + Parsers characterEntity, namedEntity, decimalEntity, hexEntity added to Entities.hs; these parse a string and return a unicode character. + Changed 'entity' parser in HTML reader to use the 'characterEntity' parser from Entities.hs. + Added new 'entity' parser to Markdown reader, and added '&' as a special character. Adjusted test suite accordingly since now we get 'Str "AT",Str "&",Str "T"' instead of 'Str "AT&T".. + stringToSGML moved to Entities.hs. escapeSGML removed as redundant, given encodeEntities. + stringToSGML, encodeEntities, and specialCharToEntity are given a boolean parameter that causes only numerical entities to be used. This is used in the docbook writer. The HTML writer uses named entities where possible, but not all docbook-consumers know about the named entities without special instructions, so it seems safer to use numerical entities there. + decodeEntities is rewritten in a way that avoids Text.Regex, using the new parsers. + charToEntity and charToNumericalEntity added to Entities.hs. + Moved specialCharToEntity from Shared.hs to Entities.hs. + Removed unneeded 'decodeEntities' from 'str' parser in HTML and Markdown readers. + Removed sgmlHexEntity, sgmlDecimalEntity, sgmlNamedEntity, and sgmlCharacterEntity from Shared.hs. + Modified Docbook writer so that it doesn't rely on Text.Regex for detecting "mailto" links. git-svn-id: https://pandoc.googlecode.com/svn/trunk@515 788f1e2b-df1e-0410-8736-df70ead52e1b
2007-01-27 03:04:40 +00:00
entity = do
ent <- characterEntity
return $ Str [ent]
code = try (do
htmlTag "code"
result <- manyTill anyChar (htmlEndTag "code")
-- remove internal line breaks, leading and trailing space,
-- and decode entities
let result' = decodeEntities $ removeLeadingTrailingSpace $
joinWithSep " " $ lines result
return (Code result'))
rawHtmlInline = do
result <- choice [htmlScript, anyHtmlInlineTag]
state <- getState
if stateParseRaw state then return (HtmlInline result) else return (Str "")
betweenTags tag = try (do
htmlTag tag
result <- inlinesTilEnd tag
return (normalizeSpaces result))
emph = try (do
result <- choice [betweenTags "em", betweenTags "it"]
return (Emph result))
strong = try (do
result <- choice [betweenTags "b", betweenTags "strong"]
return (Strong result))
whitespace = do
many1 space
return Space
-- hard line break
linebreak = do
htmlTag "br"
option ' ' newline
return LineBreak
str = do
result <- many1 (noneOf "<& \t\n")
Changes in entity handling: + Entities are parsed (and unicode characters returned) in both Markdown and HTML readers. + Parsers characterEntity, namedEntity, decimalEntity, hexEntity added to Entities.hs; these parse a string and return a unicode character. + Changed 'entity' parser in HTML reader to use the 'characterEntity' parser from Entities.hs. + Added new 'entity' parser to Markdown reader, and added '&' as a special character. Adjusted test suite accordingly since now we get 'Str "AT",Str "&",Str "T"' instead of 'Str "AT&T".. + stringToSGML moved to Entities.hs. escapeSGML removed as redundant, given encodeEntities. + stringToSGML, encodeEntities, and specialCharToEntity are given a boolean parameter that causes only numerical entities to be used. This is used in the docbook writer. The HTML writer uses named entities where possible, but not all docbook-consumers know about the named entities without special instructions, so it seems safer to use numerical entities there. + decodeEntities is rewritten in a way that avoids Text.Regex, using the new parsers. + charToEntity and charToNumericalEntity added to Entities.hs. + Moved specialCharToEntity from Shared.hs to Entities.hs. + Removed unneeded 'decodeEntities' from 'str' parser in HTML and Markdown readers. + Removed sgmlHexEntity, sgmlDecimalEntity, sgmlNamedEntity, and sgmlCharacterEntity from Shared.hs. + Modified Docbook writer so that it doesn't rely on Text.Regex for detecting "mailto" links. git-svn-id: https://pandoc.googlecode.com/svn/trunk@515 788f1e2b-df1e-0410-8736-df70ead52e1b
2007-01-27 03:04:40 +00:00
return (Str result)
--
-- links and images
--
-- extract contents of attribute (attribute names are case-insensitive)
extractAttribute name [] = Nothing
extractAttribute name ((attrName, contents):rest) =
let name' = map toLower name
attrName' = map toLower attrName in
if (attrName' == name') then Just contents else extractAttribute name rest
link = try (do
(tag, attributes) <- htmlTag "a"
url <- case (extractAttribute "href" attributes) of
Just url -> do {return url}
Nothing -> fail "no href"
let title = fromMaybe "" (extractAttribute "title" attributes)
label <- inlinesTilEnd "a"
ref <- generateReference url title
return (Link (normalizeSpaces label) ref))
image = try (do
(tag, attributes) <- htmlTag "img"
url <- case (extractAttribute "src" attributes) of
Just url -> do {return url}
Nothing -> fail "no src"
let title = fromMaybe "" (extractAttribute "title" attributes)
let alt = fromMaybe "" (extractAttribute "alt" attributes)
ref <- generateReference url title
return (Image [Str alt] ref))