The '--sanitize-html' option now examines URIs in markdown links
and images, and in HTML href and src attributes. If the URI scheme is not on a whitelist of safe schemes, it is rejected. The main point is to prevent cross-site scripting attacks using 'javascript:' URIs. See http://www.mail-archive.com/markdown-discuss@six.pairlist.net/msg01186.html and http://ha.ckers.org/xss.html. Resolves Issue #62. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1262 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
parent
4988441f3c
commit
8624ed9bd3
5 changed files with 39 additions and 19 deletions
3
README
3
README
|
@ -356,7 +356,8 @@ For further documentation, see the `pandoc(1)` man page.
|
|||
`--sanitize-html`
|
||||
: sanitizes HTML (in markdown or HTML input) using a whitelist.
|
||||
Unsafe tags are replaced by HTML comments; unsafe attributes
|
||||
are omitted.
|
||||
are omitted. URIs in links and images are also checked against a
|
||||
whitelist of URI schemes.
|
||||
|
||||
`--dump-args`
|
||||
: is intended to make it easier to create wrapper scripts that use
|
||||
|
|
|
@ -37,7 +37,8 @@ module Text.Pandoc.Readers.HTML (
|
|||
anyHtmlEndTag,
|
||||
htmlEndTag,
|
||||
extractTagType,
|
||||
htmlBlockElement
|
||||
htmlBlockElement,
|
||||
unsanitaryURI
|
||||
) where
|
||||
|
||||
import Text.ParserCombinators.Parsec
|
||||
|
@ -47,6 +48,7 @@ import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
|
|||
import Data.Maybe ( fromMaybe )
|
||||
import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf )
|
||||
import Data.Char ( toLower, isAlphaNum )
|
||||
import Network.URI ( parseURIReference, URI (..) )
|
||||
|
||||
-- | Convert HTML-formatted string to 'Pandoc' document.
|
||||
readHtml :: ParserState -- ^ Parser state
|
||||
|
@ -110,17 +112,31 @@ sanitaryAttributes = ["abbr", "accept", "accept-charset",
|
|||
-- not on the sanitized tag list.
|
||||
unsanitaryTag tag = do
|
||||
st <- getState
|
||||
if stateSanitizeHTML st && not (tag `elem` sanitaryTags)
|
||||
then return True
|
||||
else return False
|
||||
return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
|
||||
|
||||
-- | Returns @True@ if sanitization is specified and the specified attribute
|
||||
-- is not on the sanitized attribute list, or is an @href@ or @src@ attribute whose URI is unsanitary.
|
||||
unsanitaryAttribute (attr, _, _) = do
|
||||
unsanitaryAttribute (attr, val, _) = do
|
||||
st <- getState
|
||||
if stateSanitizeHTML st && not (attr `elem` sanitaryAttributes)
|
||||
then return True
|
||||
else return False
|
||||
return $ stateSanitizeHTML st &&
|
||||
(attr `notElem` sanitaryAttributes ||
|
||||
(attr `elem` ["href","src"] && unsanitaryURI val))
|
||||
|
||||
-- | Returns @True@ if the specified URI is potentially a security risk.
--
-- A URI counts as safe only when 'parseURIReference' accepts it and its
-- scheme, compared case-insensitively, is on the whitelist below.  A
-- relative reference has an empty scheme and is therefore accepted.
-- Anything that does not parse as a URI reference is reported as unsanitary.
unsanitaryURI :: String -> Bool
unsanitaryURI uri =
  case parseURIReference uri of
       Just p  -> schemeName p `notElem` safeURISchemes
       Nothing -> True
  where
    -- 'uriScheme' keeps the trailing colon (e.g. @"http:"@), while the
    -- whitelist holds bare scheme names.  Cut the scheme off at the colon
    -- before the lookup; without this every absolute URI would be rejected.
    schemeName = takeWhile (/= ':') . map toLower . uriScheme
    safeURISchemes =
      [ "", "http", "https", "ftp", "mailto", "file",
        "telnet", "gopher", "aaa", "aaas", "acap", "cap", "cid",
        "crid", "dav", "dict", "dns", "fax", "go", "h323", "im",
        "imap", "ldap", "mid", "news", "nfs", "nntp", "pop",
        "pres", "sip", "sips", "snmp", "tel", "urn", "wais",
        "xmpp", "z39.50r", "z39.50s", "aim", "callto", "cvs",
        "ed2k", "feed", "fish", "gg", "irc", "ircs", "lastfm",
        "ldaps", "magnet", "mms", "msnim", "notes", "rsync",
        "secondlife", "skype", "ssh", "sftp", "smb", "sms",
        "snews", "webcal", "ymsgr" ]
|
||||
|
||||
-- | Read blocks until end tag.
|
||||
blocksTilEnd tag = do
|
||||
|
|
|
@ -41,7 +41,7 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
|
|||
import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
|
||||
anyHtmlInlineTag, anyHtmlTag,
|
||||
anyHtmlEndTag, htmlEndTag, extractTagType,
|
||||
htmlBlockElement )
|
||||
htmlBlockElement, unsanitaryURI )
|
||||
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
|
||||
import Text.ParserCombinators.Parsec
|
||||
|
||||
|
@ -921,7 +921,10 @@ linkTitle = try $ do
|
|||
link = try $ do
|
||||
label <- reference
|
||||
src <- source <|> referenceLink label
|
||||
return $ Link label src
|
||||
sanitize <- getState >>= return . stateSanitizeHTML
|
||||
if sanitize && unsanitaryURI (fst src)
|
||||
then fail "Unsanitary URI"
|
||||
else return $ Link label src
|
||||
|
||||
-- a link like [this][ref] or [this][] or [this]
|
||||
referenceLink label = do
|
||||
|
@ -941,9 +944,12 @@ autoLink = try $ do
|
|||
then drop 7 src
|
||||
else src
|
||||
st <- getState
|
||||
return $ if stateStrict st
|
||||
then Link [Str src'] (src, "")
|
||||
else Link [Code src'] (src, "")
|
||||
let sanitize = stateSanitizeHTML st
|
||||
if sanitize && unsanitaryURI src
|
||||
then fail "Unsanitary URI"
|
||||
else return $ if stateStrict st
|
||||
then Link [Str src'] (src, "")
|
||||
else Link [Code src'] (src, "")
|
||||
|
||||
image = try $ do
|
||||
char '!'
|
||||
|
|
|
@ -51,10 +51,6 @@ a complete list. The following options are most relevant:
|
|||
\--no-wrap
|
||||
: Disable text wrapping in output. (Default is to wrap text.)
|
||||
|
||||
\--sanitize-html
|
||||
: Sanitizes HTML using a whitelist. Unsafe tags are replaced by HTML
|
||||
comments; unsafe attributes are omitted.
|
||||
|
||||
-H *FILE*, \--include-in-header=*FILE*
|
||||
: Include contents of *FILE* at the end of the header. Implies
|
||||
`-s`.
|
||||
|
|
|
@ -128,7 +128,8 @@ to Pandoc. Or use `html2markdown`(1), a wrapper around `pandoc`.
|
|||
\--sanitize-html
|
||||
: Sanitizes HTML (in markdown or HTML input) using a whitelist.
|
||||
Unsafe tags are replaced by HTML comments; unsafe attributes
|
||||
are omitted.
|
||||
are omitted. URIs in links and images are also checked against a
|
||||
whitelist of URI schemes.
|
||||
|
||||
\--toc, \--table-of-contents
|
||||
: Include an automatically generated table of contents (HTML, markdown,
|
||||
|
|
Loading…
Add table
Reference in a new issue