2007-11-03 23:27:58 +00:00
|
|
|
{-
|
2008-01-08 17:26:16 +00:00
|
|
|
Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu>
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program; if not, write to the Free Software
|
|
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
-}
|
|
|
|
|
|
|
|
{- |
|
|
|
|
Module : Text.Pandoc.Readers.HTML
|
2008-01-08 17:26:16 +00:00
|
|
|
Copyright : Copyright (C) 2006-8 John MacFarlane
|
2007-11-03 23:27:58 +00:00
|
|
|
License : GNU GPL, version 2 or above
|
|
|
|
|
|
|
|
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
|
|
|
Stability : alpha
|
|
|
|
Portability : portable
|
|
|
|
|
|
|
|
Conversion of HTML to 'Pandoc' document.
|
|
|
|
-}
|
|
|
|
module Text.Pandoc.Readers.HTML (
|
|
|
|
readHtml,
|
|
|
|
rawHtmlInline,
|
|
|
|
rawHtmlBlock,
|
|
|
|
anyHtmlBlockTag,
|
|
|
|
anyHtmlInlineTag,
|
|
|
|
anyHtmlTag,
|
|
|
|
anyHtmlEndTag,
|
|
|
|
htmlEndTag,
|
|
|
|
extractTagType,
|
2008-03-22 20:41:56 +00:00
|
|
|
htmlBlockElement,
|
|
|
|
unsanitaryURI
|
2007-11-03 23:27:58 +00:00
|
|
|
) where
|
|
|
|
|
|
|
|
import Text.ParserCombinators.Parsec
|
|
|
|
import Text.Pandoc.Definition
|
|
|
|
import Text.Pandoc.Shared
|
2008-01-16 02:18:23 +00:00
|
|
|
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
|
2007-11-03 23:27:58 +00:00
|
|
|
import Data.Maybe ( fromMaybe )
|
|
|
|
import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf )
|
2008-01-16 02:18:23 +00:00
|
|
|
import Data.Char ( toLower, isAlphaNum )
|
2008-03-22 20:41:56 +00:00
|
|
|
import Network.URI ( parseURIReference, URI (..) )
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
-- | Convert HTML-formatted string to 'Pandoc' document.
|
|
|
|
readHtml :: ParserState -- ^ Parser state
|
|
|
|
-> String -- ^ String to parse
|
|
|
|
-> Pandoc
|
|
|
|
readHtml = readWith parseHtml
|
|
|
|
|
|
|
|
--
|
|
|
|
-- Constants
|
|
|
|
--
|
|
|
|
|
|
|
|
eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
|
2007-12-31 00:45:54 +00:00
|
|
|
"map", "area", "object"]
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
|
|
|
|
"br", "cite", "code", "dfn", "em", "font", "i", "img",
|
|
|
|
"input", "kbd", "label", "q", "s", "samp", "select",
|
|
|
|
"small", "span", "strike", "strong", "sub", "sup",
|
|
|
|
"textarea", "tt", "u", "var"] ++ eitherBlockOrInline
|
|
|
|
|
2008-04-20 03:12:42 +00:00
|
|
|
blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
|
2007-11-03 23:27:58 +00:00
|
|
|
"dl", "fieldset", "form", "h1", "h2", "h3", "h4",
|
2008-04-20 03:12:42 +00:00
|
|
|
"h5", "h6", "hr", "html", "isindex", "menu", "noframes",
|
2007-11-03 23:27:58 +00:00
|
|
|
"noscript", "ol", "p", "pre", "table", "ul", "dd",
|
|
|
|
"dt", "frameset", "li", "tbody", "td", "tfoot",
|
2007-12-31 00:45:54 +00:00
|
|
|
"th", "thead", "tr", "script"] ++ eitherBlockOrInline
|
2007-11-03 23:27:58 +00:00
|
|
|
|
2008-01-03 21:32:32 +00:00
|
|
|
sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
|
|
|
|
"blockquote", "br", "button", "caption", "center",
|
|
|
|
"cite", "code", "col", "colgroup", "dd", "del", "dfn",
|
|
|
|
"dir", "div", "dl", "dt", "em", "fieldset", "font",
|
|
|
|
"form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
|
|
|
|
"i", "img", "input", "ins", "kbd", "label", "legend",
|
|
|
|
"li", "map", "menu", "ol", "optgroup", "option", "p",
|
|
|
|
"pre", "q", "s", "samp", "select", "small", "span",
|
|
|
|
"strike", "strong", "sub", "sup", "table", "tbody",
|
|
|
|
"td", "textarea", "tfoot", "th", "thead", "tr", "tt",
|
|
|
|
"u", "ul", "var"]
|
|
|
|
|
|
|
|
sanitaryAttributes = ["abbr", "accept", "accept-charset",
|
|
|
|
"accesskey", "action", "align", "alt", "axis",
|
|
|
|
"border", "cellpadding", "cellspacing", "char",
|
|
|
|
"charoff", "charset", "checked", "cite", "class",
|
|
|
|
"clear", "cols", "colspan", "color", "compact",
|
|
|
|
"coords", "datetime", "dir", "disabled",
|
|
|
|
"enctype", "for", "frame", "headers", "height",
|
|
|
|
"href", "hreflang", "hspace", "id", "ismap",
|
|
|
|
"label", "lang", "longdesc", "maxlength", "media",
|
|
|
|
"method", "multiple", "name", "nohref", "noshade",
|
|
|
|
"nowrap", "prompt", "readonly", "rel", "rev",
|
|
|
|
"rows", "rowspan", "rules", "scope", "selected",
|
|
|
|
"shape", "size", "span", "src", "start",
|
|
|
|
"summary", "tabindex", "target", "title", "type",
|
|
|
|
"usemap", "valign", "value", "vspace", "width"]
|
|
|
|
|
2007-11-03 23:27:58 +00:00
|
|
|
--
|
|
|
|
-- HTML utility functions
|
|
|
|
--
|
|
|
|
|
2008-01-03 21:32:32 +00:00
|
|
|
-- | Returns @True@ if sanitization is specified and the specified tag is
|
|
|
|
-- not on the sanitized tag list.
|
|
|
|
unsanitaryTag tag = do
|
|
|
|
st <- getState
|
2008-03-22 20:41:56 +00:00
|
|
|
return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
|
2008-01-03 21:32:32 +00:00
|
|
|
|
|
|
|
-- | returns @True@ if sanitization is specified and the specified attribute
|
|
|
|
-- is not on the sanitized attribute list.
|
2008-03-22 20:41:56 +00:00
|
|
|
unsanitaryAttribute (attr, val, _) = do
|
2008-01-03 21:32:32 +00:00
|
|
|
st <- getState
|
2008-03-22 20:41:56 +00:00
|
|
|
return $ stateSanitizeHTML st &&
|
|
|
|
(attr `notElem` sanitaryAttributes ||
|
|
|
|
(attr `elem` ["href","src"] && unsanitaryURI val))
|
|
|
|
|
|
|
|
-- | Returns @True@ if the specified URI is potentially a security risk.
|
|
|
|
unsanitaryURI uri =
|
|
|
|
let safeURISchemes = [ "", "http", "https", "ftp", "mailto", "file",
|
|
|
|
"telnet", "gopher", "aaa", "aaas", "acap", "cap", "cid",
|
|
|
|
"crid", "dav", "dict", "dns", "fax", "go", "h323", "im",
|
|
|
|
"imap", "ldap", "mid", "news", "nfs", "nntp", "pop",
|
|
|
|
"pres", "sip", "sips", "snmp", "tel", "urn", "wais",
|
|
|
|
"xmpp", "z39.50r", "z39.50s", "aim", "callto", "cvs",
|
|
|
|
"ed2k", "feed", "fish", "gg", "irc", "ircs", "lastfm",
|
|
|
|
"ldaps", "magnet", "mms", "msnim", "notes", "rsync",
|
|
|
|
"secondlife", "skype", "ssh", "sftp", "smb", "sms",
|
|
|
|
"snews", "webcal", "ymsgr"]
|
|
|
|
in case parseURIReference uri of
|
|
|
|
Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes
|
|
|
|
Nothing -> True
|
2008-01-03 21:32:32 +00:00
|
|
|
|
2007-11-03 23:27:58 +00:00
|
|
|
-- | Read blocks until end tag.
|
|
|
|
blocksTilEnd tag = do
|
|
|
|
blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
|
|
|
|
return $ filter (/= Null) blocks
|
|
|
|
|
|
|
|
-- | Read inlines until end tag.
|
|
|
|
inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
|
|
|
|
|
|
|
|
-- | Parse blocks between open and close tag.
|
|
|
|
blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag
|
|
|
|
|
|
|
|
-- | Parse inlines between open and close tag.
|
|
|
|
inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag
|
|
|
|
|
|
|
|
-- | Extract type from a tag: e.g. @br@ from @\<br\>@
|
|
|
|
extractTagType :: String -> String
|
|
|
|
extractTagType ('<':rest) =
|
|
|
|
let isSpaceOrSlash c = c `elem` "/ \n\t" in
|
|
|
|
map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest
|
|
|
|
extractTagType _ = ""
|
|
|
|
|
|
|
|
-- | Parse any HTML tag (opening or self-closing) and return text of tag
|
|
|
|
anyHtmlTag = try $ do
|
|
|
|
char '<'
|
|
|
|
spaces
|
|
|
|
tag <- many1 alphaNum
|
|
|
|
attribs <- many htmlAttribute
|
|
|
|
spaces
|
|
|
|
ender <- option "" (string "/")
|
|
|
|
let ender' = if null ender then "" else " /"
|
|
|
|
spaces
|
|
|
|
char '>'
|
2008-01-03 21:32:32 +00:00
|
|
|
let result = "<" ++ tag ++
|
|
|
|
concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
|
|
|
|
unsanitary <- unsanitaryTag tag
|
|
|
|
if unsanitary
|
2008-01-08 04:53:01 +00:00
|
|
|
then return $ "<!-- unsafe HTML removed -->"
|
2008-01-03 21:32:32 +00:00
|
|
|
else return result
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
anyHtmlEndTag = try $ do
|
|
|
|
char '<'
|
|
|
|
spaces
|
|
|
|
char '/'
|
|
|
|
spaces
|
2008-01-03 21:32:32 +00:00
|
|
|
tag <- many1 alphaNum
|
2007-11-03 23:27:58 +00:00
|
|
|
spaces
|
|
|
|
char '>'
|
2008-01-03 21:32:32 +00:00
|
|
|
let result = "</" ++ tag ++ ">"
|
|
|
|
unsanitary <- unsanitaryTag tag
|
|
|
|
if unsanitary
|
2008-01-08 04:53:01 +00:00
|
|
|
then return $ "<!-- unsafe HTML removed -->"
|
2008-01-03 21:32:32 +00:00
|
|
|
else return result
|
2007-11-03 23:27:58 +00:00
|
|
|
|
2008-01-03 21:32:32 +00:00
|
|
|
htmlTag :: String -> GenParser Char ParserState (String, [(String, String)])
|
2007-11-03 23:27:58 +00:00
|
|
|
htmlTag tag = try $ do
|
|
|
|
char '<'
|
|
|
|
spaces
|
|
|
|
stringAnyCase tag
|
|
|
|
attribs <- many htmlAttribute
|
|
|
|
spaces
|
|
|
|
optional (string "/")
|
|
|
|
spaces
|
|
|
|
char '>'
|
|
|
|
return (tag, (map (\(name, content, raw) -> (name, content)) attribs))
|
|
|
|
|
|
|
|
-- parses a quoted html attribute value
|
|
|
|
quoted quoteChar = do
|
|
|
|
result <- between (char quoteChar) (char quoteChar)
|
|
|
|
(many (noneOf [quoteChar]))
|
|
|
|
return (result, [quoteChar])
|
|
|
|
|
2008-01-03 21:32:32 +00:00
|
|
|
nullAttribute = ("", "", "")
|
|
|
|
|
|
|
|
htmlAttribute = do
|
|
|
|
attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
|
|
|
|
unsanitary <- unsanitaryAttribute attr
|
|
|
|
if unsanitary
|
|
|
|
then return nullAttribute
|
|
|
|
else return attr
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
-- minimized boolean attribute
|
|
|
|
htmlMinimizedAttribute = try $ do
|
|
|
|
many1 space
|
|
|
|
name <- many1 (choice [letter, oneOf ".-_:"])
|
|
|
|
return (name, name, name)
|
|
|
|
|
|
|
|
htmlRegularAttribute = try $ do
|
|
|
|
many1 space
|
|
|
|
name <- many1 (choice [letter, oneOf ".-_:"])
|
|
|
|
spaces
|
|
|
|
char '='
|
|
|
|
spaces
|
|
|
|
(content, quoteStr) <- choice [ (quoted '\''),
|
|
|
|
(quoted '"'),
|
|
|
|
(do
|
|
|
|
a <- many (alphaNum <|> (oneOf "-._:"))
|
|
|
|
return (a,"")) ]
|
|
|
|
return (name, content,
|
|
|
|
(name ++ "=" ++ quoteStr ++ content ++ quoteStr))
|
|
|
|
|
|
|
|
-- | Parse an end tag of type 'tag'
|
|
|
|
htmlEndTag tag = try $ do
|
|
|
|
char '<'
|
|
|
|
spaces
|
|
|
|
char '/'
|
|
|
|
spaces
|
|
|
|
stringAnyCase tag
|
|
|
|
spaces
|
|
|
|
char '>'
|
|
|
|
return $ "</" ++ tag ++ ">"
|
|
|
|
|
|
|
|
-- | Returns @True@ if the tag is (or can be) an inline tag.
|
|
|
|
isInline tag = (extractTagType tag) `elem` inlineHtmlTags
|
|
|
|
|
|
|
|
-- | Returns @True@ if the tag is (or can be) a block tag.
|
|
|
|
isBlock tag = (extractTagType tag) `elem` blockHtmlTags
|
|
|
|
|
|
|
|
anyHtmlBlockTag = try $ do
|
|
|
|
tag <- anyHtmlTag <|> anyHtmlEndTag
|
2008-04-20 03:12:42 +00:00
|
|
|
if isBlock tag then return tag else fail "not a block tag"
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
anyHtmlInlineTag = try $ do
|
|
|
|
tag <- anyHtmlTag <|> anyHtmlEndTag
|
2008-04-20 03:12:42 +00:00
|
|
|
if not (isBlock tag) then return tag else fail "not an inline tag"
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
-- | Parses material between script tags.
|
|
|
|
-- Scripts must be treated differently, because they can contain '<>' etc.
|
|
|
|
htmlScript = try $ do
|
|
|
|
open <- string "<script"
|
|
|
|
rest <- manyTill anyChar (htmlEndTag "script")
|
2008-01-03 21:32:32 +00:00
|
|
|
st <- getState
|
|
|
|
if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
|
2008-01-08 04:53:01 +00:00
|
|
|
then return "<!-- unsafe HTML removed -->"
|
2008-01-03 21:32:32 +00:00
|
|
|
else return $ open ++ rest ++ "</script>"
|
2007-11-03 23:27:58 +00:00
|
|
|
|
2007-12-31 00:05:03 +00:00
|
|
|
-- | Parses material between style tags.
|
|
|
|
-- Style tags must be treated differently, because they can contain CSS
|
|
|
|
htmlStyle = try $ do
|
|
|
|
open <- string "<style"
|
|
|
|
rest <- manyTill anyChar (htmlEndTag "style")
|
2008-01-03 21:32:32 +00:00
|
|
|
st <- getState
|
|
|
|
if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
|
2008-01-08 04:53:01 +00:00
|
|
|
then return "<!-- unsafe HTML removed -->"
|
2008-01-03 21:32:32 +00:00
|
|
|
else return $ open ++ rest ++ "</style>"
|
2007-12-31 00:05:03 +00:00
|
|
|
|
|
|
|
htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
rawHtmlBlock = try $ do
|
2008-01-03 21:32:32 +00:00
|
|
|
body <- htmlBlockElement <|> anyHtmlBlockTag
|
2007-11-03 23:27:58 +00:00
|
|
|
state <- getState
|
2007-12-31 01:02:44 +00:00
|
|
|
if stateParseRaw state then return (RawHtml body) else return Null
|
2007-11-03 23:27:58 +00:00
|
|
|
|
2007-12-23 03:46:12 +00:00
|
|
|
-- We don't want to parse </body> or </html> as raw HTML, since these
|
|
|
|
-- are handled in parseHtml.
|
|
|
|
rawHtmlBlock' = do notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
|
|
|
|
rawHtmlBlock
|
|
|
|
|
2007-11-03 23:27:58 +00:00
|
|
|
-- | Parses an HTML comment.
|
|
|
|
htmlComment = try $ do
|
|
|
|
string "<!--"
|
|
|
|
comment <- manyTill anyChar (try (string "-->"))
|
|
|
|
return $ "<!--" ++ comment ++ "-->"
|
|
|
|
|
|
|
|
--
|
|
|
|
-- parsing documents
|
|
|
|
--
|
|
|
|
|
|
|
|
xmlDec = try $ do
|
|
|
|
string "<?"
|
|
|
|
rest <- manyTill anyChar (char '>')
|
|
|
|
return $ "<?" ++ rest ++ ">"
|
|
|
|
|
|
|
|
definition = try $ do
|
|
|
|
string "<!"
|
|
|
|
rest <- manyTill anyChar (char '>')
|
|
|
|
return $ "<!" ++ rest ++ ">"
|
|
|
|
|
2008-01-03 21:32:32 +00:00
|
|
|
nonTitleNonHead = try $ do
|
|
|
|
notFollowedBy $ (htmlTag "title" >> return ' ') <|>
|
|
|
|
(htmlEndTag "head" >> return ' ')
|
|
|
|
(rawHtmlBlock >> return ' ') <|> anyChar
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
parseTitle = try $ do
|
|
|
|
(tag, _) <- htmlTag "title"
|
|
|
|
contents <- inlinesTilEnd tag
|
|
|
|
spaces
|
|
|
|
return contents
|
|
|
|
|
|
|
|
-- parse header and return meta-information (for now, just title)
|
|
|
|
parseHead = try $ do
|
|
|
|
htmlTag "head"
|
|
|
|
spaces
|
|
|
|
skipMany nonTitleNonHead
|
|
|
|
contents <- option [] parseTitle
|
|
|
|
skipMany nonTitleNonHead
|
2008-01-03 21:32:32 +00:00
|
|
|
htmlEndTag "head"
|
2007-11-03 23:27:58 +00:00
|
|
|
return (contents, [], "")
|
|
|
|
|
|
|
|
skipHtmlTag tag = optional (htmlTag tag)
|
|
|
|
|
|
|
|
-- h1 class="title" representation of title in body
|
|
|
|
bodyTitle = try $ do
|
|
|
|
(tag, attribs) <- htmlTag "h1"
|
|
|
|
cl <- case (extractAttribute "class" attribs) of
|
|
|
|
Just "title" -> return ""
|
|
|
|
otherwise -> fail "not title"
|
|
|
|
inlinesTilEnd "h1"
|
|
|
|
|
|
|
|
parseHtml = do
|
|
|
|
sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
|
|
|
|
skipHtmlTag "html"
|
|
|
|
spaces
|
|
|
|
(title, authors, date) <- option ([], [], "") parseHead
|
|
|
|
spaces
|
|
|
|
skipHtmlTag "body"
|
|
|
|
spaces
|
|
|
|
optional bodyTitle -- skip title in body, because it's represented in meta
|
|
|
|
blocks <- parseBlocks
|
|
|
|
spaces
|
|
|
|
optional (htmlEndTag "body")
|
|
|
|
spaces
|
|
|
|
optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
|
|
|
|
eof
|
|
|
|
return $ Pandoc (Meta title authors date) blocks
|
|
|
|
|
|
|
|
--
|
|
|
|
-- parsing blocks
|
|
|
|
--
|
|
|
|
|
|
|
|
parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
|
|
|
|
|
|
|
|
block = choice [ codeBlock
|
|
|
|
, header
|
|
|
|
, hrule
|
|
|
|
, list
|
|
|
|
, blockQuote
|
|
|
|
, para
|
|
|
|
, plain
|
2007-12-23 03:46:12 +00:00
|
|
|
, rawHtmlBlock'
|
|
|
|
] <?> "block"
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
--
|
|
|
|
-- header blocks
|
|
|
|
--
|
|
|
|
|
|
|
|
header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
|
|
|
|
|
|
|
|
headerLevel n = try $ do
|
|
|
|
let level = "h" ++ show n
|
|
|
|
(tag, attribs) <- htmlTag level
|
|
|
|
contents <- inlinesTilEnd level
|
|
|
|
return $ Header n (normalizeSpaces contents)
|
|
|
|
|
|
|
|
--
|
|
|
|
-- hrule block
|
|
|
|
--
|
|
|
|
|
|
|
|
hrule = try $ do
|
|
|
|
(tag, attribs) <- htmlTag "hr"
|
|
|
|
state <- getState
|
|
|
|
if not (null attribs) && stateParseRaw state
|
|
|
|
then unexpected "attributes in hr" -- parse as raw in this case
|
|
|
|
else return HorizontalRule
|
|
|
|
|
|
|
|
--
|
|
|
|
-- code blocks
|
|
|
|
--
|
|
|
|
|
|
|
|
-- Note: HTML tags in code blocks (e.g. for syntax highlighting) are
|
|
|
|
-- skipped, because they are not portable to output formats other than HTML.
|
|
|
|
codeBlock = try $ do
|
|
|
|
htmlTag "pre"
|
|
|
|
result <- manyTill
|
|
|
|
(many1 (satisfy (/= '<')) <|>
|
|
|
|
((anyHtmlTag <|> anyHtmlEndTag) >> return ""))
|
|
|
|
(htmlEndTag "pre")
|
|
|
|
let result' = concat result
|
|
|
|
-- drop leading newline if any
|
|
|
|
let result'' = if "\n" `isPrefixOf` result'
|
|
|
|
then drop 1 result'
|
|
|
|
else result'
|
|
|
|
-- drop trailing newline if any
|
|
|
|
let result''' = if "\n" `isSuffixOf` result''
|
|
|
|
then init result''
|
|
|
|
else result''
|
2008-02-09 03:19:43 +00:00
|
|
|
return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result'''
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
--
|
|
|
|
-- block quotes
|
|
|
|
--
|
|
|
|
|
|
|
|
blockQuote = try $ htmlTag "blockquote" >> spaces >>
|
|
|
|
blocksTilEnd "blockquote" >>= (return . BlockQuote)
|
|
|
|
|
|
|
|
--
|
|
|
|
-- list blocks
|
|
|
|
--
|
|
|
|
|
|
|
|
list = choice [ bulletList, orderedList, definitionList ] <?> "list"
|
|
|
|
|
|
|
|
orderedList = try $ do
|
|
|
|
(_, attribs) <- htmlTag "ol"
|
|
|
|
(start, style) <- option (1, DefaultStyle) $
|
|
|
|
do failIfStrict
|
|
|
|
let sta = fromMaybe "1" $
|
|
|
|
lookup "start" attribs
|
|
|
|
let sty = fromMaybe (fromMaybe "" $
|
|
|
|
lookup "style" attribs) $
|
|
|
|
lookup "class" attribs
|
|
|
|
let sty' = case sty of
|
|
|
|
"lower-roman" -> LowerRoman
|
|
|
|
"upper-roman" -> UpperRoman
|
|
|
|
"lower-alpha" -> LowerAlpha
|
|
|
|
"upper-alpha" -> UpperAlpha
|
|
|
|
"decimal" -> Decimal
|
|
|
|
_ -> DefaultStyle
|
|
|
|
return (read sta, sty')
|
|
|
|
spaces
|
|
|
|
items <- sepEndBy1 (blocksIn "li") spaces
|
|
|
|
htmlEndTag "ol"
|
|
|
|
return $ OrderedList (start, style, DefaultDelim) items
|
|
|
|
|
|
|
|
bulletList = try $ do
|
|
|
|
htmlTag "ul"
|
|
|
|
spaces
|
|
|
|
items <- sepEndBy1 (blocksIn "li") spaces
|
|
|
|
htmlEndTag "ul"
|
|
|
|
return $ BulletList items
|
|
|
|
|
|
|
|
definitionList = try $ do
|
|
|
|
failIfStrict -- def lists not part of standard markdown
|
|
|
|
tag <- htmlTag "dl"
|
|
|
|
spaces
|
|
|
|
items <- sepEndBy1 definitionListItem spaces
|
|
|
|
htmlEndTag "dl"
|
|
|
|
return $ DefinitionList items
|
|
|
|
|
|
|
|
definitionListItem = try $ do
|
|
|
|
terms <- sepEndBy1 (inlinesIn "dt") spaces
|
|
|
|
defs <- sepEndBy1 (blocksIn "dd") spaces
|
|
|
|
let term = joinWithSep [LineBreak] terms
|
|
|
|
return (term, concat defs)
|
|
|
|
|
|
|
|
--
|
|
|
|
-- paragraph block
|
|
|
|
--
|
|
|
|
|
|
|
|
para = try $ htmlTag "p" >> inlinesTilEnd "p" >>=
|
|
|
|
return . Para . normalizeSpaces
|
|
|
|
|
|
|
|
--
|
|
|
|
-- plain block
|
|
|
|
--
|
|
|
|
|
|
|
|
plain = many1 inline >>= return . Plain . normalizeSpaces
|
|
|
|
|
|
|
|
--
|
|
|
|
-- inline
|
|
|
|
--
|
|
|
|
|
|
|
|
inline = choice [ charRef
|
|
|
|
, strong
|
|
|
|
, emph
|
|
|
|
, superscript
|
|
|
|
, subscript
|
|
|
|
, strikeout
|
|
|
|
, spanStrikeout
|
|
|
|
, code
|
|
|
|
, str
|
|
|
|
, linebreak
|
|
|
|
, whitespace
|
|
|
|
, link
|
|
|
|
, image
|
|
|
|
, rawHtmlInline
|
|
|
|
] <?> "inline"
|
|
|
|
|
|
|
|
code = try $ do
|
|
|
|
htmlTag "code"
|
|
|
|
result <- manyTill anyChar (htmlEndTag "code")
|
|
|
|
-- remove internal line breaks, leading and trailing space,
|
|
|
|
-- and decode character references
|
|
|
|
return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $
|
|
|
|
joinWithSep " " $ lines result
|
|
|
|
|
|
|
|
rawHtmlInline = do
|
2007-12-31 00:05:03 +00:00
|
|
|
result <- htmlScript <|> htmlStyle <|> htmlComment <|> anyHtmlInlineTag
|
2007-11-03 23:27:58 +00:00
|
|
|
state <- getState
|
|
|
|
if stateParseRaw state then return (HtmlInline result) else return (Str "")
|
|
|
|
|
|
|
|
betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>=
|
|
|
|
return . normalizeSpaces
|
|
|
|
|
2007-12-30 02:21:01 +00:00
|
|
|
emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph
|
2007-11-03 23:27:58 +00:00
|
|
|
|
|
|
|
strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
|
|
|
|
|
|
|
|
superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
|
|
|
|
|
|
|
|
subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
|
|
|
|
|
|
|
|
strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
|
|
|
|
return . Strikeout
|
|
|
|
|
|
|
|
spanStrikeout = try $ do
|
|
|
|
failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
|
|
|
|
(tag, attributes) <- htmlTag "span"
|
|
|
|
result <- case (extractAttribute "class" attributes) of
|
|
|
|
Just "strikeout" -> inlinesTilEnd "span"
|
|
|
|
_ -> fail "not a strikeout"
|
|
|
|
return $ Strikeout result
|
|
|
|
|
|
|
|
whitespace = many1 space >> return Space
|
|
|
|
|
|
|
|
-- hard line break
|
|
|
|
linebreak = htmlTag "br" >> optional newline >> return LineBreak
|
|
|
|
|
|
|
|
str = many1 (noneOf "<& \t\n") >>= return . Str
|
|
|
|
|
|
|
|
--
|
|
|
|
-- links and images
|
|
|
|
--
|
|
|
|
|
|
|
|
-- extract contents of attribute (attribute names are case-insensitive)
|
|
|
|
extractAttribute name [] = Nothing
|
|
|
|
extractAttribute name ((attrName, contents):rest) =
|
|
|
|
let name' = map toLower name
|
|
|
|
attrName' = map toLower attrName
|
|
|
|
in if attrName' == name'
|
|
|
|
then Just (decodeCharacterReferences contents)
|
|
|
|
else extractAttribute name rest
|
|
|
|
|
|
|
|
link = try $ do
|
|
|
|
(tag, attributes) <- htmlTag "a"
|
|
|
|
url <- case (extractAttribute "href" attributes) of
|
|
|
|
Just url -> return url
|
|
|
|
Nothing -> fail "no href"
|
|
|
|
let title = fromMaybe "" $ extractAttribute "title" attributes
|
|
|
|
label <- inlinesTilEnd "a"
|
|
|
|
return $ Link (normalizeSpaces label) (url, title)
|
|
|
|
|
|
|
|
image = try $ do
|
|
|
|
(tag, attributes) <- htmlTag "img"
|
|
|
|
url <- case (extractAttribute "src" attributes) of
|
|
|
|
Just url -> return url
|
|
|
|
Nothing -> fail "no src"
|
|
|
|
let title = fromMaybe "" $ extractAttribute "title" attributes
|
|
|
|
let alt = fromMaybe "" (extractAttribute "alt" attributes)
|
|
|
|
return $ Image [Str alt] (url, title)
|
|
|
|
|