Removed HTML sanitization.

This is better done on the resulting HTML; use the xss-sanitize library for this. xss-sanitize is based on pandoc's sanitization, but improves it. - Removed stateSanitize from ParserState. - Removed --sanitize-html option.
2010-12-10 12:26:03 -08:00 · 2010-12-10 12:26:03 -08:00 · 5770ceca36
commit 5770ceca36
parent 17d48cf4af
5 changed files with 10 additions and 118 deletions
--- a/6
+++ b/6
@ -269,12 +269,6 @@ Options
 :   Disable text wrapping in output. By default, text is wrapped
    appropriately for the output format.

-`--sanitize-html`
-:   Sanitizes HTML (in markdown or HTML input) using a whitelist.
-    Unsafe tags are replaced by HTML comments; unsafe attributes
-    are omitted.  URIs in links and images are also checked against a
-    whitelist of URI schemes.
-
 `--email-obfuscation=`*none|javascript|references*
 :   Specify a method for obfuscating `mailto:` links in HTML documents.
    *none* leaves `mailto:` links as they are.  *javascript* obfuscates
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@ -586,7 +586,6 @@ data ParserState = ParserState
    { stateParseRaw        :: Bool,          -- ^ Parse raw HTML and LaTeX?
      stateParserContext   :: ParserContext, -- ^ Inside list?
      stateQuoteContext    :: QuoteContext,  -- ^ Inside quoted environment?
-      stateSanitizeHTML    :: Bool,          -- ^ Sanitize HTML?
      stateKeys            :: KeyTable,      -- ^ List of reference keys
      stateCitations       :: [String],      -- ^ List of available citations
      stateNotes           :: NoteTable,     -- ^ List of notes
@ -614,7 +613,6 @@ defaultParserState =
    ParserState { stateParseRaw        = False,
                  stateParserContext   = NullState,
                  stateQuoteContext    = NoQuote,
-                  stateSanitizeHTML    = False,
                  stateKeys            = M.empty,
                  stateCitations       = [],
                  stateNotes           = [],
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@ -40,7 +40,6 @@ module Text.Pandoc.Readers.HTML (
                                 extractTagType,
                                 htmlBlockElement,
                                 htmlComment,
-                                 unsanitaryURI
                                ) where

 import Text.ParserCombinators.Parsec
@ -51,7 +50,6 @@ import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
 import Data.Maybe ( fromMaybe )
 import Data.List ( isPrefixOf, isSuffixOf, intercalate )
 import Data.Char ( toLower, isAlphaNum )
-import Network.URI ( parseURIReference, URI (..) )
 import Control.Monad ( liftM, when )

 -- | Convert HTML-formatted string to 'Pandoc' document.
@ -85,36 +83,6 @@ blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
                 "dt", "frameset", "li", "tbody", "td", "tfoot",
                 "th", "thead", "tr", "script", "style"]

-sanitaryTags :: [[Char]]
-sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
-                "blockquote", "br", "button", "caption", "center",
-                "cite", "code", "col", "colgroup", "dd", "del", "dfn",
-                "dir", "div", "dl", "dt", "em", "fieldset", "font",
-                "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
-                "i", "img", "input", "ins", "kbd", "label", "legend",
-                "li", "map", "menu", "ol", "optgroup", "option", "p",
-                "pre", "q", "s", "samp", "select", "small", "span",
-                "strike", "strong", "sub", "sup", "table", "tbody",
-                "td", "textarea", "tfoot", "th", "thead", "tr", "tt",
-                "u", "ul", "var"]
-
-sanitaryAttributes :: [[Char]]
-sanitaryAttributes = ["abbr", "accept", "accept-charset",
-                      "accesskey", "action", "align", "alt", "axis",
-                      "border", "cellpadding", "cellspacing", "char",
-                      "charoff", "charset", "checked", "cite", "class",
-                      "clear", "cols", "colspan", "color", "compact",
-                      "coords", "datetime", "dir", "disabled",
-                      "enctype", "for", "frame", "headers", "height",
-                      "href", "hreflang", "hspace", "id", "ismap",
-                      "label", "lang", "longdesc", "maxlength", "media",
-                      "method", "multiple", "name", "nohref", "noshade",
-                      "nowrap", "prompt", "readonly", "rel", "rev",
-                      "rows", "rowspan", "rules", "scope", "selected",
-                      "shape", "size", "span", "src", "start",
-                      "summary", "tabindex", "target", "title", "type",
-                      "usemap", "valign", "value", "vspace", "width"]
-
 -- taken from HXT and extended

 closes :: String -> String -> Bool
@ -153,41 +121,6 @@ _ `closes` _ = False
 -- HTML utility functions
 --

-- | Returns @True@ if sanitization is specified and the specified tag is 
--  not on the sanitized tag list.
-unsanitaryTag :: [Char]
-              -> GenParser tok ParserState Bool
-unsanitaryTag tag = do
-  st <- getState
-  return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
-
-- | returns @True@ if sanitization is specified and the specified attribute
--  is not on the sanitized attribute list.
-unsanitaryAttribute :: ([Char], String, t)
-                    -> GenParser tok ParserState Bool
-unsanitaryAttribute (attr, val, _) = do
-  st <- getState
-  return $ stateSanitizeHTML st &&
-           (attr `notElem` sanitaryAttributes ||
-             (attr `elem` ["href","src"] && unsanitaryURI val))
-
-- | Returns @True@ if the specified URI is potentially a security risk.
-unsanitaryURI :: String -> Bool
-unsanitaryURI u =
-  let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:",
-             "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:",
-             "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:",
-             "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:",
-             "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:",
-             "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:",
-             "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:",
-             "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:",
-             "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:",
-             "snews:", "webcal:", "ymsgr:"]
-  in  case parseURIReference (escapeURI u) of
-           Just p  -> (map toLower $ uriScheme p) `notElem` safeURISchemes
-           Nothing -> True
-
 -- | Read blocks until end tag.
 blocksTilEnd :: String -> GenParser Char ParserState [Block]
 blocksTilEnd tag = do
@ -240,10 +173,7 @@ anyHtmlTag = try $ do
  char '>'
  let result = "<" ++ tag ++ 
               concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
-  unsanitary <- unsanitaryTag tag
-  if unsanitary
-     then return $ "<!-- unsafe HTML removed -->"
-     else return result
+  return result

 anyHtmlEndTag :: GenParser Char ParserState [Char]
 anyHtmlEndTag = try $ do
@ -255,10 +185,7 @@ anyHtmlEndTag = try $ do
  spaces
  char '>'
  let result = "</" ++ tag ++ ">"
-  unsanitary <- unsanitaryTag tag
-  if unsanitary
-     then return $ "<!-- unsafe HTML removed -->"
-     else return result 
+  return result 

 htmlTag :: Bool
        -> String
@ -294,16 +221,10 @@ quoted quoteChar = do
                    (many (noneOf [quoteChar]))
  return (result, [quoteChar])

-nullAttribute :: ([Char], [Char], [Char])
-nullAttribute = ("", "", "") 
-
 htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char])
 htmlAttribute = do
  attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
-  unsanitary <- unsanitaryAttribute attr
-  if unsanitary
-     then return nullAttribute
-     else return attr
+  return attr

 -- minimized boolean attribute
 htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char])
@ -364,10 +285,7 @@ htmlScript = try $ do
  lookAhead $ htmlOpenTag "script"
  open <- anyHtmlTag
  rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script")
-  st <- getState
-  if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
-     then return "<!-- unsafe HTML removed -->"
-     else return $ open ++ rest ++ "</script>"
+  return $ open ++ rest ++ "</script>"

 scriptChunk :: GenParser Char ParserState [Char]
 scriptChunk = jsComment <|> jsString <|> jsChars
@ -399,10 +317,7 @@ htmlStyle = try $ do
  lookAhead $ htmlOpenTag "style"
  open <- anyHtmlTag
  rest <- manyTill anyChar (htmlEndTag "style")
-  st <- getState
-  if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
-     then return "<!-- unsafe HTML removed -->"
-     else return $ open ++ rest ++ "</style>"
+  return $ open ++ rest ++ "</style>"

 htmlBlockElement :: GenParser Char ParserState [Char]
 htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@ -41,7 +41,7 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' )
 import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, 
                                  anyHtmlInlineTag, anyHtmlTag,
                                  anyHtmlEndTag, htmlEndTag, extractTagType,
-                                  htmlBlockElement, htmlComment, unsanitaryURI )
+                                  htmlBlockElement, htmlComment )
 import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
 import Text.ParserCombinators.Parsec
 import Control.Monad (when, liftM, guard)
@ -1152,10 +1152,7 @@ link :: GenParser Char ParserState Inline
 link = try $ do
  lab <- reference
  (src, tit) <- source <|> referenceLink lab
-  sanitize <- getState >>= return . stateSanitizeHTML
-  if sanitize && unsanitaryURI src
-     then fail "Unsanitary URI"
-     else return $ Link lab (src, tit)
+  return $ Link lab (src, tit)

 -- a link like [this][ref] or [this][] or [this]
 referenceLink :: [Inline]
@ -1175,10 +1172,7 @@ autoLink = try $ do
  (orig, src) <- uri <|> emailAddress
  char '>'
  st <- getState
-  let sanitize = stateSanitizeHTML st
-  if sanitize && unsanitaryURI src
-     then fail "Unsanitary URI"
-     else return $ if stateStrict st
+  return $ if stateStrict st
              then Link [Str orig] (src, "")
              else Link [Code orig] (src, "")

--- a/src/pandoc.hs
+++ b/src/pandoc.hs
@ -155,7 +155,6 @@ data Opt = Opt
    , optStrict            :: Bool    -- ^ Use strict markdown syntax
    , optReferenceLinks    :: Bool    -- ^ Use reference links in writing markdown, rst
    , optWrapText          :: Bool    -- ^ Wrap text
-    , optSanitizeHTML      :: Bool    -- ^ Sanitize HTML
    , optPlugins           :: [Pandoc -> IO Pandoc] -- ^ Plugins to apply
    , optEmailObfuscation  :: ObfuscationMethod
    , optIdentifierPrefix  :: String
@ -194,7 +193,6 @@ defaultOpts = Opt
    , optStrict            = False
    , optReferenceLinks    = False
    , optWrapText          = True
-    , optSanitizeHTML      = False
    , optPlugins           = []
    , optEmailObfuscation  = JavascriptObfuscation
    , optIdentifierPrefix  = ""
@ -344,11 +342,6 @@ options =
                  (\opt -> return opt { optWrapText = False }))
                 "" -- "Do not wrap text in output"

-    , Option "" ["sanitize-html"]
-                 (NoArg
-                  (\opt -> return opt { optSanitizeHTML = True }))
-                 "" -- "Sanitize HTML"
-
    , Option "" ["email-obfuscation"]
                 (ReqArg
                  (\arg opt -> do
@ -673,7 +666,6 @@ main = do
              , optStrict            = strict
              , optReferenceLinks    = referenceLinks
              , optWrapText          = wrap
-              , optSanitizeHTML      = sanitize
              , optEmailObfuscation  = obfuscationMethod
              , optIdentifierPrefix  = idPrefix
              , optIndentedCodeClasses = codeBlockClasses
@ -772,7 +764,6 @@ main = do
  let startParserState =
         defaultParserState { stateParseRaw        = parseRaw,
                              stateTabStop         = tabStop,
-                              stateSanitizeHTML    = sanitize,
                              stateLiterateHaskell = "+lhs" `isSuffixOf` readerName' ||
                                                     lhsExtension sources,
                              stateStandalone      = standalone',