Added optional HTML sanitization using a whitelist.

When this option is specified (--sanitize-html on the command line),
unsafe HTML tags will be replaced by HTML comments, and unsafe HTML
attributes will be removed.  This option should be especially useful
for those who want to use pandoc libraries in web applications, where
users will provide the input.

+ Main.hs:  Added --sanitize-html option.
+ Text.Pandoc.Shared:  Added stateSanitizeHTML to ParserState.
+ Text.Pandoc.Readers.HTML:
  - Added whitelists of sanitaryTags and sanitaryAttributes.
  - Added parsers to check these lists (and state) to see if a given
    tag or attribute should be counted unsafe.
  - Modified anyHtmlTag and anyHtmlEndTag to replace unsafe tags
    with comments.
  - Modified htmlAttribute to remove unsafe attributes.
  - Modified htmlScript and htmlStyle to remove these elements if
    unsafe.
  - Modified rawHtmlBlock to use anyHtmlBlockTag instead of anyHtmlTag
    and anyHtmlEndTag.  This fixes a bug in markdown parsing, where
    inline tags would be included in raw HTML blocks.
  - Modified anyHtmlBlockTag to test for (not inline) rather than
    directly for block.  This allows us to handle e.g. docbook in
    the markdown reader.
  - Minor tweaks in nonTitleNonHead  and parseTitle.
+ Text.Pandoc.Readers.Markdown:
  - In non-strict mode use rawHtmlBlocks instead of htmlBlock.
    Simplified htmlBlock, since we know it's only called in strict
    mode.
+ Modified README and man pages to document new option.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@1166 788f1e2b-df1e-0410-8736-df70ead52e1b
This commit is contained in:
fiddlosopher 2008-01-03 21:32:32 +00:00
parent a505f70f0b
commit 5df912b162
7 changed files with 119 additions and 30 deletions

23
Main.hs
View file

@ -104,6 +104,7 @@ data Opt = Opt
, optStrict :: Bool -- ^ Use strict markdown syntax
, optReferenceLinks :: Bool -- ^ Use reference links in writing markdown, rst
, optWrapText :: Bool -- ^ Wrap text
, optSanitizeHTML :: Bool -- ^ Sanitize HTML
}
-- | Defaults for command-line options.
@ -132,6 +133,7 @@ defaultOpts = Opt
, optStrict = False
, optReferenceLinks = False
, optWrapText = True
, optSanitizeHTML = False
}
-- | A list of functions, each transforming the options data structure
@ -226,6 +228,11 @@ options =
(\opt -> return opt { optWrapText = False }))
"" -- "Do not wrap text in output"
, Option "" ["sanitize-html"]
(NoArg
(\opt -> return opt { optSanitizeHTML = True }))
"" -- "Sanitize HTML"
, Option "" ["toc", "table-of-contents"]
(NoArg
(\opt -> return opt { optTableOfContents = True }))
@ -424,6 +431,7 @@ main = do
, optStrict = strict
, optReferenceLinks = referenceLinks
, optWrapText = wrap
, optSanitizeHTML = sanitize
} = opts
if dumpArgs
@ -476,13 +484,14 @@ main = do
x:(tabFilter (spsToNextStop - 1) xs)
let startParserState =
defaultParserState { stateParseRaw = parseRaw,
stateTabStop = tabStop,
stateStandalone = standalone && (not strict),
stateSmart = smart || writerName' `elem`
["latex", "context"],
stateColumns = columns,
stateStrict = strict }
defaultParserState { stateParseRaw = parseRaw,
stateTabStop = tabStop,
stateSanitizeHTML = sanitize,
stateStandalone = standalone && (not strict),
stateSmart = smart || writerName' `elem`
["latex", "context"],
stateColumns = columns,
stateStrict = strict }
let csslink = if (css == "")
then ""
else "<link rel=\"stylesheet\" href=\"" ++ css ++

5
README
View file

@ -345,6 +345,11 @@ For further documentation, see the `pandoc(1)` man page.
: disables text-wrapping in output. By default, text is wrapped
appropriately for the output format.
`--sanitize-html`
: sanitizes HTML (in markdown or HTML input) using a whitelist.
Unsafe tags are replaced by HTML comments; unsafe attributes
are omitted.
`--dump-args`
: is intended to make it easier to create wrapper scripts that use
Pandoc. It causes Pandoc to dump information about the arguments

View file

@ -75,10 +75,54 @@ blockHtmlTags = ["address", "blockquote", "center", "dir", "div",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr", "script"] ++ eitherBlockOrInline
sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
"blockquote", "br", "button", "caption", "center",
"cite", "code", "col", "colgroup", "dd", "del", "dfn",
"dir", "div", "dl", "dt", "em", "fieldset", "font",
"form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
"i", "img", "input", "ins", "kbd", "label", "legend",
"li", "map", "menu", "ol", "optgroup", "option", "p",
"pre", "q", "s", "samp", "select", "small", "span",
"strike", "strong", "sub", "sup", "table", "tbody",
"td", "textarea", "tfoot", "th", "thead", "tr", "tt",
"u", "ul", "var"]
sanitaryAttributes = ["abbr", "accept", "accept-charset",
"accesskey", "action", "align", "alt", "axis",
"border", "cellpadding", "cellspacing", "char",
"charoff", "charset", "checked", "cite", "class",
"clear", "cols", "colspan", "color", "compact",
"coords", "datetime", "dir", "disabled",
"enctype", "for", "frame", "headers", "height",
"href", "hreflang", "hspace", "id", "ismap",
"label", "lang", "longdesc", "maxlength", "media",
"method", "multiple", "name", "nohref", "noshade",
"nowrap", "prompt", "readonly", "rel", "rev",
"rows", "rowspan", "rules", "scope", "selected",
"shape", "size", "span", "src", "start",
"summary", "tabindex", "target", "title", "type",
"usemap", "valign", "value", "vspace", "width"]
--
-- HTML utility functions
--
-- | Returns @True@ if sanitization is specified and the specified tag is
-- not on the sanitized tag list.
unsanitaryTag tag = do
st <- getState
if stateSanitizeHTML st && not (tag `elem` sanitaryTags)
then return True
else return False
-- | returns @True@ if sanitization is specified and the specified attribute
-- is not on the sanitized attribute list.
unsanitaryAttribute (attr, _, _) = do
st <- getState
if stateSanitizeHTML st && not (attr `elem` sanitaryAttributes)
then return True
else return False
-- | Read blocks until end tag.
blocksTilEnd tag = do
blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
@ -111,20 +155,28 @@ anyHtmlTag = try $ do
let ender' = if null ender then "" else " /"
spaces
char '>'
return $ "<" ++ tag ++
concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
let result = "<" ++ tag ++
concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
unsanitary <- unsanitaryTag tag
if unsanitary
then return $ "<!-- unsafe tag " ++ result ++ " omitted -->"
else return result
anyHtmlEndTag = try $ do
char '<'
spaces
char '/'
spaces
tagType <- many1 alphaNum
tag <- many1 alphaNum
spaces
char '>'
return $ "</" ++ tagType ++ ">"
let result = "</" ++ tag ++ ">"
unsanitary <- unsanitaryTag tag
if unsanitary
then return $ "<!-- unsafe tag " ++ result ++ " omitted -->"
else return result
htmlTag :: String -> GenParser Char st (String, [(String, String)])
htmlTag :: String -> GenParser Char ParserState (String, [(String, String)])
htmlTag tag = try $ do
char '<'
spaces
@ -142,7 +194,14 @@ quoted quoteChar = do
(many (noneOf [quoteChar]))
return (result, [quoteChar])
htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
nullAttribute = ("", "", "")
htmlAttribute = do
attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
unsanitary <- unsanitaryAttribute attr
if unsanitary
then return nullAttribute
else return attr
-- minimized boolean attribute
htmlMinimizedAttribute = try $ do
@ -183,7 +242,7 @@ isBlock tag = (extractTagType tag) `elem` blockHtmlTags
anyHtmlBlockTag = try $ do
tag <- anyHtmlTag <|> anyHtmlEndTag
if isBlock tag then return tag else fail "not a block tag"
if not (isInline tag) then return tag else fail "not a block tag"
anyHtmlInlineTag = try $ do
tag <- anyHtmlTag <|> anyHtmlEndTag
@ -194,19 +253,25 @@ anyHtmlInlineTag = try $ do
htmlScript = try $ do
open <- string "<script"
rest <- manyTill anyChar (htmlEndTag "script")
return $ open ++ rest ++ "</script>"
st <- getState
if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
then return "<!-- unsafe script omitted -->"
else return $ open ++ rest ++ "</script>"
-- | Parses material between style tags.
-- Style tags must be treated differently, because they can contain CSS
htmlStyle = try $ do
open <- string "<style"
rest <- manyTill anyChar (htmlEndTag "style")
return $ open ++ rest ++ "</style>"
st <- getState
if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
then return "<!-- unsafe style omitted -->"
else return $ open ++ rest ++ "</style>"
htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
rawHtmlBlock = try $ do
body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag
body <- htmlBlockElement <|> anyHtmlBlockTag
state <- getState
if stateParseRaw state then return (RawHtml body) else return Null
@ -235,8 +300,10 @@ definition = try $ do
rest <- manyTill anyChar (char '>')
return $ "<!" ++ rest ++ ">"
nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >>
((rawHtmlBlock >> return ' ') <|> anyChar)
nonTitleNonHead = try $ do
notFollowedBy $ (htmlTag "title" >> return ' ') <|>
(htmlEndTag "head" >> return ' ')
(rawHtmlBlock >> return ' ') <|> anyChar
parseTitle = try $ do
(tag, _) <- htmlTag "title"
@ -251,7 +318,7 @@ parseHead = try $ do
skipMany nonTitleNonHead
contents <- option [] parseTitle
skipMany nonTitleNonHead
htmlTag "/head"
htmlEndTag "head"
return (contents, [], "")
skipHtmlTag tag = optional (htmlTag tag)

View file

@ -249,7 +249,7 @@ block = do
, blockQuote
, rawLaTeXEnvironment
, para
, htmlBlock
, rawHtmlBlocks
, plain
, nullBlock ]) <?> "block"
@ -482,15 +482,12 @@ plain = many1 inline >>= return . Plain . normalizeSpaces
htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
htmlBlock = do
st <- getState
if stateStrict st
then try $ do failUnlessBeginningOfLine
first <- htmlElement
finalSpace <- many (oneOf spaceChars)
finalNewlines <- many newline
return $ RawHtml $ first ++ finalSpace ++ finalNewlines
else rawHtmlBlocks
htmlBlock = try $ do
failUnlessBeginningOfLine
first <- htmlElement
finalSpace <- many (oneOf spaceChars)
finalNewlines <- many newline
return $ RawHtml $ first ++ finalSpace ++ finalNewlines
-- True if tag is self-closing
isSelfClosing tag =

View file

@ -595,6 +595,7 @@ data ParserState = ParserState
{ stateParseRaw :: Bool, -- ^ Parse raw HTML and LaTeX?
stateParserContext :: ParserContext, -- ^ Inside list?
stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment?
stateSanitizeHTML :: Bool, -- ^ Sanitize HTML?
stateKeys :: KeyTable, -- ^ List of reference keys
stateNotes :: NoteTable, -- ^ List of notes
stateTabStop :: Int, -- ^ Tab stop
@ -614,6 +615,7 @@ defaultParserState =
ParserState { stateParseRaw = False,
stateParserContext = NullState,
stateQuoteContext = NoQuote,
stateSanitizeHTML = False,
stateKeys = [],
stateNotes = [],
stateTabStop = 4,

View file

@ -51,6 +51,10 @@ a complete list. The following options are most relevant:
\--no-wrap
: Disable text wrapping in output. (Default is to wrap text.)
\--sanitize-html
: Sanitizes HTML using a whitelist. Unsafe tags are replaced by HTML
comments; unsafe attributes are omitted.
-H *FILE*, \--include-in-header=*FILE*
: Include contents of *FILE* at the end of the header. Implies
`-s`.

View file

@ -126,6 +126,11 @@ to Pandoc. Or use `html2markdown`(1), a wrapper around `pandoc`.
\--no-wrap
: Disable text wrapping in output. (Default is to wrap text.)
\--sanitize-html
: Sanitizes HTML (in markdown or HTML input) using a whitelist.
Unsafe tags are replaced by HTML comments; unsafe attributes
are omitted.
\--toc, \--table-of-contents
: Include an automatically generated table of contents (HTML, markdown,
RTF) or an instruction to create one (LaTeX, reStructuredText).