Removed HTML sanitization.
This is better done on the resulting HTML; use the xss-sanitize library for this. xss-sanitize is based on pandoc's sanitization, but improves it. - Removed stateSanitize from ParserState. - Removed --sanitize-html option.
This commit is contained in:
parent
17d48cf4af
commit
5770ceca36
5 changed files with 10 additions and 118 deletions
6
README
6
README
|
@ -269,12 +269,6 @@ Options
|
||||||
: Disable text wrapping in output. By default, text is wrapped
|
: Disable text wrapping in output. By default, text is wrapped
|
||||||
appropriately for the output format.
|
appropriately for the output format.
|
||||||
|
|
||||||
`--sanitize-html`
|
|
||||||
: Sanitizes HTML (in markdown or HTML input) using a whitelist.
|
|
||||||
Unsafe tags are replaced by HTML comments; unsafe attributes
|
|
||||||
are omitted. URIs in links and images are also checked against a
|
|
||||||
whitelist of URI schemes.
|
|
||||||
|
|
||||||
`--email-obfuscation=`*none|javascript|references*
|
`--email-obfuscation=`*none|javascript|references*
|
||||||
: Specify a method for obfuscating `mailto:` links in HTML documents.
|
: Specify a method for obfuscating `mailto:` links in HTML documents.
|
||||||
*none* leaves `mailto:` links as they are. *javascript* obfuscates
|
*none* leaves `mailto:` links as they are. *javascript* obfuscates
|
||||||
|
|
|
@ -586,7 +586,6 @@ data ParserState = ParserState
|
||||||
{ stateParseRaw :: Bool, -- ^ Parse raw HTML and LaTeX?
|
{ stateParseRaw :: Bool, -- ^ Parse raw HTML and LaTeX?
|
||||||
stateParserContext :: ParserContext, -- ^ Inside list?
|
stateParserContext :: ParserContext, -- ^ Inside list?
|
||||||
stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment?
|
stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment?
|
||||||
stateSanitizeHTML :: Bool, -- ^ Sanitize HTML?
|
|
||||||
stateKeys :: KeyTable, -- ^ List of reference keys
|
stateKeys :: KeyTable, -- ^ List of reference keys
|
||||||
stateCitations :: [String], -- ^ List of available citations
|
stateCitations :: [String], -- ^ List of available citations
|
||||||
stateNotes :: NoteTable, -- ^ List of notes
|
stateNotes :: NoteTable, -- ^ List of notes
|
||||||
|
@ -614,7 +613,6 @@ defaultParserState =
|
||||||
ParserState { stateParseRaw = False,
|
ParserState { stateParseRaw = False,
|
||||||
stateParserContext = NullState,
|
stateParserContext = NullState,
|
||||||
stateQuoteContext = NoQuote,
|
stateQuoteContext = NoQuote,
|
||||||
stateSanitizeHTML = False,
|
|
||||||
stateKeys = M.empty,
|
stateKeys = M.empty,
|
||||||
stateCitations = [],
|
stateCitations = [],
|
||||||
stateNotes = [],
|
stateNotes = [],
|
||||||
|
|
|
@ -40,7 +40,6 @@ module Text.Pandoc.Readers.HTML (
|
||||||
extractTagType,
|
extractTagType,
|
||||||
htmlBlockElement,
|
htmlBlockElement,
|
||||||
htmlComment,
|
htmlComment,
|
||||||
unsanitaryURI
|
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Text.ParserCombinators.Parsec
|
import Text.ParserCombinators.Parsec
|
||||||
|
@ -51,7 +50,6 @@ import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
|
||||||
import Data.Maybe ( fromMaybe )
|
import Data.Maybe ( fromMaybe )
|
||||||
import Data.List ( isPrefixOf, isSuffixOf, intercalate )
|
import Data.List ( isPrefixOf, isSuffixOf, intercalate )
|
||||||
import Data.Char ( toLower, isAlphaNum )
|
import Data.Char ( toLower, isAlphaNum )
|
||||||
import Network.URI ( parseURIReference, URI (..) )
|
|
||||||
import Control.Monad ( liftM, when )
|
import Control.Monad ( liftM, when )
|
||||||
|
|
||||||
-- | Convert HTML-formatted string to 'Pandoc' document.
|
-- | Convert HTML-formatted string to 'Pandoc' document.
|
||||||
|
@ -85,36 +83,6 @@ blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
|
||||||
"dt", "frameset", "li", "tbody", "td", "tfoot",
|
"dt", "frameset", "li", "tbody", "td", "tfoot",
|
||||||
"th", "thead", "tr", "script", "style"]
|
"th", "thead", "tr", "script", "style"]
|
||||||
|
|
||||||
sanitaryTags :: [[Char]]
|
|
||||||
sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
|
|
||||||
"blockquote", "br", "button", "caption", "center",
|
|
||||||
"cite", "code", "col", "colgroup", "dd", "del", "dfn",
|
|
||||||
"dir", "div", "dl", "dt", "em", "fieldset", "font",
|
|
||||||
"form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
|
|
||||||
"i", "img", "input", "ins", "kbd", "label", "legend",
|
|
||||||
"li", "map", "menu", "ol", "optgroup", "option", "p",
|
|
||||||
"pre", "q", "s", "samp", "select", "small", "span",
|
|
||||||
"strike", "strong", "sub", "sup", "table", "tbody",
|
|
||||||
"td", "textarea", "tfoot", "th", "thead", "tr", "tt",
|
|
||||||
"u", "ul", "var"]
|
|
||||||
|
|
||||||
sanitaryAttributes :: [[Char]]
|
|
||||||
sanitaryAttributes = ["abbr", "accept", "accept-charset",
|
|
||||||
"accesskey", "action", "align", "alt", "axis",
|
|
||||||
"border", "cellpadding", "cellspacing", "char",
|
|
||||||
"charoff", "charset", "checked", "cite", "class",
|
|
||||||
"clear", "cols", "colspan", "color", "compact",
|
|
||||||
"coords", "datetime", "dir", "disabled",
|
|
||||||
"enctype", "for", "frame", "headers", "height",
|
|
||||||
"href", "hreflang", "hspace", "id", "ismap",
|
|
||||||
"label", "lang", "longdesc", "maxlength", "media",
|
|
||||||
"method", "multiple", "name", "nohref", "noshade",
|
|
||||||
"nowrap", "prompt", "readonly", "rel", "rev",
|
|
||||||
"rows", "rowspan", "rules", "scope", "selected",
|
|
||||||
"shape", "size", "span", "src", "start",
|
|
||||||
"summary", "tabindex", "target", "title", "type",
|
|
||||||
"usemap", "valign", "value", "vspace", "width"]
|
|
||||||
|
|
||||||
-- taken from HXT and extended
|
-- taken from HXT and extended
|
||||||
|
|
||||||
closes :: String -> String -> Bool
|
closes :: String -> String -> Bool
|
||||||
|
@ -153,41 +121,6 @@ _ `closes` _ = False
|
||||||
-- HTML utility functions
|
-- HTML utility functions
|
||||||
--
|
--
|
||||||
|
|
||||||
-- | Returns @True@ if sanitization is specified and the specified tag is
|
|
||||||
-- not on the sanitized tag list.
|
|
||||||
unsanitaryTag :: [Char]
|
|
||||||
-> GenParser tok ParserState Bool
|
|
||||||
unsanitaryTag tag = do
|
|
||||||
st <- getState
|
|
||||||
return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
|
|
||||||
|
|
||||||
-- | returns @True@ if sanitization is specified and the specified attribute
|
|
||||||
-- is not on the sanitized attribute list.
|
|
||||||
unsanitaryAttribute :: ([Char], String, t)
|
|
||||||
-> GenParser tok ParserState Bool
|
|
||||||
unsanitaryAttribute (attr, val, _) = do
|
|
||||||
st <- getState
|
|
||||||
return $ stateSanitizeHTML st &&
|
|
||||||
(attr `notElem` sanitaryAttributes ||
|
|
||||||
(attr `elem` ["href","src"] && unsanitaryURI val))
|
|
||||||
|
|
||||||
-- | Returns @True@ if the specified URI is potentially a security risk.
|
|
||||||
unsanitaryURI :: String -> Bool
|
|
||||||
unsanitaryURI u =
|
|
||||||
let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:",
|
|
||||||
"telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:",
|
|
||||||
"crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:",
|
|
||||||
"imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:",
|
|
||||||
"pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:",
|
|
||||||
"xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:",
|
|
||||||
"ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:",
|
|
||||||
"ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:",
|
|
||||||
"secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:",
|
|
||||||
"snews:", "webcal:", "ymsgr:"]
|
|
||||||
in case parseURIReference (escapeURI u) of
|
|
||||||
Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes
|
|
||||||
Nothing -> True
|
|
||||||
|
|
||||||
-- | Read blocks until end tag.
|
-- | Read blocks until end tag.
|
||||||
blocksTilEnd :: String -> GenParser Char ParserState [Block]
|
blocksTilEnd :: String -> GenParser Char ParserState [Block]
|
||||||
blocksTilEnd tag = do
|
blocksTilEnd tag = do
|
||||||
|
@ -240,10 +173,7 @@ anyHtmlTag = try $ do
|
||||||
char '>'
|
char '>'
|
||||||
let result = "<" ++ tag ++
|
let result = "<" ++ tag ++
|
||||||
concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
|
concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
|
||||||
unsanitary <- unsanitaryTag tag
|
return result
|
||||||
if unsanitary
|
|
||||||
then return $ "<!-- unsafe HTML removed -->"
|
|
||||||
else return result
|
|
||||||
|
|
||||||
anyHtmlEndTag :: GenParser Char ParserState [Char]
|
anyHtmlEndTag :: GenParser Char ParserState [Char]
|
||||||
anyHtmlEndTag = try $ do
|
anyHtmlEndTag = try $ do
|
||||||
|
@ -255,10 +185,7 @@ anyHtmlEndTag = try $ do
|
||||||
spaces
|
spaces
|
||||||
char '>'
|
char '>'
|
||||||
let result = "</" ++ tag ++ ">"
|
let result = "</" ++ tag ++ ">"
|
||||||
unsanitary <- unsanitaryTag tag
|
return result
|
||||||
if unsanitary
|
|
||||||
then return $ "<!-- unsafe HTML removed -->"
|
|
||||||
else return result
|
|
||||||
|
|
||||||
htmlTag :: Bool
|
htmlTag :: Bool
|
||||||
-> String
|
-> String
|
||||||
|
@ -294,16 +221,10 @@ quoted quoteChar = do
|
||||||
(many (noneOf [quoteChar]))
|
(many (noneOf [quoteChar]))
|
||||||
return (result, [quoteChar])
|
return (result, [quoteChar])
|
||||||
|
|
||||||
nullAttribute :: ([Char], [Char], [Char])
|
|
||||||
nullAttribute = ("", "", "")
|
|
||||||
|
|
||||||
htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char])
|
htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char])
|
||||||
htmlAttribute = do
|
htmlAttribute = do
|
||||||
attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
|
attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
|
||||||
unsanitary <- unsanitaryAttribute attr
|
return attr
|
||||||
if unsanitary
|
|
||||||
then return nullAttribute
|
|
||||||
else return attr
|
|
||||||
|
|
||||||
-- minimized boolean attribute
|
-- minimized boolean attribute
|
||||||
htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char])
|
htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char])
|
||||||
|
@ -364,10 +285,7 @@ htmlScript = try $ do
|
||||||
lookAhead $ htmlOpenTag "script"
|
lookAhead $ htmlOpenTag "script"
|
||||||
open <- anyHtmlTag
|
open <- anyHtmlTag
|
||||||
rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script")
|
rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script")
|
||||||
st <- getState
|
return $ open ++ rest ++ "</script>"
|
||||||
if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
|
|
||||||
then return "<!-- unsafe HTML removed -->"
|
|
||||||
else return $ open ++ rest ++ "</script>"
|
|
||||||
|
|
||||||
scriptChunk :: GenParser Char ParserState [Char]
|
scriptChunk :: GenParser Char ParserState [Char]
|
||||||
scriptChunk = jsComment <|> jsString <|> jsChars
|
scriptChunk = jsComment <|> jsString <|> jsChars
|
||||||
|
@ -399,10 +317,7 @@ htmlStyle = try $ do
|
||||||
lookAhead $ htmlOpenTag "style"
|
lookAhead $ htmlOpenTag "style"
|
||||||
open <- anyHtmlTag
|
open <- anyHtmlTag
|
||||||
rest <- manyTill anyChar (htmlEndTag "style")
|
rest <- manyTill anyChar (htmlEndTag "style")
|
||||||
st <- getState
|
return $ open ++ rest ++ "</style>"
|
||||||
if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
|
|
||||||
then return "<!-- unsafe HTML removed -->"
|
|
||||||
else return $ open ++ rest ++ "</style>"
|
|
||||||
|
|
||||||
htmlBlockElement :: GenParser Char ParserState [Char]
|
htmlBlockElement :: GenParser Char ParserState [Char]
|
||||||
htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
|
htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
|
||||||
|
|
|
@ -41,7 +41,7 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' )
|
||||||
import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
|
import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
|
||||||
anyHtmlInlineTag, anyHtmlTag,
|
anyHtmlInlineTag, anyHtmlTag,
|
||||||
anyHtmlEndTag, htmlEndTag, extractTagType,
|
anyHtmlEndTag, htmlEndTag, extractTagType,
|
||||||
htmlBlockElement, htmlComment, unsanitaryURI )
|
htmlBlockElement, htmlComment )
|
||||||
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
|
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
|
||||||
import Text.ParserCombinators.Parsec
|
import Text.ParserCombinators.Parsec
|
||||||
import Control.Monad (when, liftM, guard)
|
import Control.Monad (when, liftM, guard)
|
||||||
|
@ -1152,10 +1152,7 @@ link :: GenParser Char ParserState Inline
|
||||||
link = try $ do
|
link = try $ do
|
||||||
lab <- reference
|
lab <- reference
|
||||||
(src, tit) <- source <|> referenceLink lab
|
(src, tit) <- source <|> referenceLink lab
|
||||||
sanitize <- getState >>= return . stateSanitizeHTML
|
return $ Link lab (src, tit)
|
||||||
if sanitize && unsanitaryURI src
|
|
||||||
then fail "Unsanitary URI"
|
|
||||||
else return $ Link lab (src, tit)
|
|
||||||
|
|
||||||
-- a link like [this][ref] or [this][] or [this]
|
-- a link like [this][ref] or [this][] or [this]
|
||||||
referenceLink :: [Inline]
|
referenceLink :: [Inline]
|
||||||
|
@ -1175,12 +1172,9 @@ autoLink = try $ do
|
||||||
(orig, src) <- uri <|> emailAddress
|
(orig, src) <- uri <|> emailAddress
|
||||||
char '>'
|
char '>'
|
||||||
st <- getState
|
st <- getState
|
||||||
let sanitize = stateSanitizeHTML st
|
return $ if stateStrict st
|
||||||
if sanitize && unsanitaryURI src
|
then Link [Str orig] (src, "")
|
||||||
then fail "Unsanitary URI"
|
else Link [Code orig] (src, "")
|
||||||
else return $ if stateStrict st
|
|
||||||
then Link [Str orig] (src, "")
|
|
||||||
else Link [Code orig] (src, "")
|
|
||||||
|
|
||||||
image :: GenParser Char ParserState Inline
|
image :: GenParser Char ParserState Inline
|
||||||
image = try $ do
|
image = try $ do
|
||||||
|
|
|
@ -155,7 +155,6 @@ data Opt = Opt
|
||||||
, optStrict :: Bool -- ^ Use strict markdown syntax
|
, optStrict :: Bool -- ^ Use strict markdown syntax
|
||||||
, optReferenceLinks :: Bool -- ^ Use reference links in writing markdown, rst
|
, optReferenceLinks :: Bool -- ^ Use reference links in writing markdown, rst
|
||||||
, optWrapText :: Bool -- ^ Wrap text
|
, optWrapText :: Bool -- ^ Wrap text
|
||||||
, optSanitizeHTML :: Bool -- ^ Sanitize HTML
|
|
||||||
, optPlugins :: [Pandoc -> IO Pandoc] -- ^ Plugins to apply
|
, optPlugins :: [Pandoc -> IO Pandoc] -- ^ Plugins to apply
|
||||||
, optEmailObfuscation :: ObfuscationMethod
|
, optEmailObfuscation :: ObfuscationMethod
|
||||||
, optIdentifierPrefix :: String
|
, optIdentifierPrefix :: String
|
||||||
|
@ -194,7 +193,6 @@ defaultOpts = Opt
|
||||||
, optStrict = False
|
, optStrict = False
|
||||||
, optReferenceLinks = False
|
, optReferenceLinks = False
|
||||||
, optWrapText = True
|
, optWrapText = True
|
||||||
, optSanitizeHTML = False
|
|
||||||
, optPlugins = []
|
, optPlugins = []
|
||||||
, optEmailObfuscation = JavascriptObfuscation
|
, optEmailObfuscation = JavascriptObfuscation
|
||||||
, optIdentifierPrefix = ""
|
, optIdentifierPrefix = ""
|
||||||
|
@ -344,11 +342,6 @@ options =
|
||||||
(\opt -> return opt { optWrapText = False }))
|
(\opt -> return opt { optWrapText = False }))
|
||||||
"" -- "Do not wrap text in output"
|
"" -- "Do not wrap text in output"
|
||||||
|
|
||||||
, Option "" ["sanitize-html"]
|
|
||||||
(NoArg
|
|
||||||
(\opt -> return opt { optSanitizeHTML = True }))
|
|
||||||
"" -- "Sanitize HTML"
|
|
||||||
|
|
||||||
, Option "" ["email-obfuscation"]
|
, Option "" ["email-obfuscation"]
|
||||||
(ReqArg
|
(ReqArg
|
||||||
(\arg opt -> do
|
(\arg opt -> do
|
||||||
|
@ -673,7 +666,6 @@ main = do
|
||||||
, optStrict = strict
|
, optStrict = strict
|
||||||
, optReferenceLinks = referenceLinks
|
, optReferenceLinks = referenceLinks
|
||||||
, optWrapText = wrap
|
, optWrapText = wrap
|
||||||
, optSanitizeHTML = sanitize
|
|
||||||
, optEmailObfuscation = obfuscationMethod
|
, optEmailObfuscation = obfuscationMethod
|
||||||
, optIdentifierPrefix = idPrefix
|
, optIdentifierPrefix = idPrefix
|
||||||
, optIndentedCodeClasses = codeBlockClasses
|
, optIndentedCodeClasses = codeBlockClasses
|
||||||
|
@ -772,7 +764,6 @@ main = do
|
||||||
let startParserState =
|
let startParserState =
|
||||||
defaultParserState { stateParseRaw = parseRaw,
|
defaultParserState { stateParseRaw = parseRaw,
|
||||||
stateTabStop = tabStop,
|
stateTabStop = tabStop,
|
||||||
stateSanitizeHTML = sanitize,
|
|
||||||
stateLiterateHaskell = "+lhs" `isSuffixOf` readerName' ||
|
stateLiterateHaskell = "+lhs" `isSuffixOf` readerName' ||
|
||||||
lhsExtension sources,
|
lhsExtension sources,
|
||||||
stateStandalone = standalone',
|
stateStandalone = standalone',
|
||||||
|
|
Loading…
Add table
Reference in a new issue