From 5df912b162575cb9daf6702bb7f2c2a5858c0b00 Mon Sep 17 00:00:00 2001
From: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>
Date: Thu, 3 Jan 2008 21:32:32 +0000
Subject: [PATCH] Added optional HTML sanitization using a whitelist. When this
 option is specified (--sanitize-html on the command line), unsafe HTML tags
 will be replaced by HTML comments, and unsafe HTML attributes will be
 removed.  This option should be especially useful for those who want to use
 pandoc libraries in web applications, where users will provide the input.

+ Main.hs:  Added --sanitize-html option.
+ Text.Pandoc.Shared:  Added stateSanitizeHTML to ParserState.
+ Text.Pandoc.Readers.HTML:
  - Added whitelists of sanitaryTags and sanitaryAttributes.
  - Added parsers to check these lists (and state) to see if a given
    tag or attribute should be counted unsafe.
  - Modified anyHtmlTag and anyHtmlEndTag to replace unsafe tags
    with comments.
  - Modified htmlAttribute to remove unsafe attributes.
  - Modified htmlScript and htmlStyle to remove these elements if
    unsafe.
  - Modified rawHtmlBlock to use anyHtmlBlockTag instead of anyHtmlTag
    and anyHtmlEndTag.  This fixes a bug in markdown parsing, where
    inline tags would be included in raw HTML blocks.
  - Modified anyHtmlBlockTag to test for (not inline) rather than
    directly for block.  This allows us to handle e.g. docbook in
    the markdown reader.
  - Minor tweaks in nonTitleNonHead  and parseTitle.
+ Text.Pandoc.Readers.Markdown:
  - In non-strict mode use rawHtmlBlocks instead of htmlBlock.
    Simplified htmlBlock, since we know it's only called in strict
    mode.
+ Modified README and man pages to document new option.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@1166 788f1e2b-df1e-0410-8736-df70ead52e1b
---
 Main.hs                         | 23 +++++---
 README                          |  5 ++
 Text/Pandoc/Readers/HTML.hs     | 93 ++++++++++++++++++++++++++++-----
 Text/Pandoc/Readers/Markdown.hs | 17 +++---
 Text/Pandoc/Shared.hs           |  2 +
 man/man1/html2markdown.1.md     |  4 ++
 man/man1/pandoc.1.md            |  5 ++
 7 files changed, 119 insertions(+), 30 deletions(-)

diff --git a/Main.hs b/Main.hs
index 5938fefc4..98bc0897f 100644
--- a/Main.hs
+++ b/Main.hs
@@ -104,6 +104,7 @@ data Opt = Opt
     , optStrict            :: Bool    -- ^ Use strict markdown syntax
     , optReferenceLinks    :: Bool    -- ^ Use reference links in writing markdown, rst
     , optWrapText          :: Bool    -- ^ Wrap text
+    , optSanitizeHTML      :: Bool    -- ^ Sanitize HTML
     }
 
 -- | Defaults for command-line options.
@@ -132,6 +133,7 @@ defaultOpts = Opt
     , optStrict            = False
     , optReferenceLinks    = False
     , optWrapText          = True
+    , optSanitizeHTML      = False
     }
 
 -- | A list of functions, each transforming the options data structure
@@ -226,6 +228,11 @@ options =
                   (\opt -> return opt { optWrapText = False }))
                  "" -- "Do not wrap text in output"
 
+    , Option "" ["sanitize-html"]
+                 (NoArg
+                  (\opt -> return opt { optSanitizeHTML = True }))
+                 "" -- "Sanitize HTML"
+
     , Option "" ["toc", "table-of-contents"]
                 (NoArg
                  (\opt -> return opt { optTableOfContents = True }))
@@ -424,6 +431,7 @@ main = do
               , optStrict            = strict
               , optReferenceLinks    = referenceLinks
               , optWrapText          = wrap
+              , optSanitizeHTML      = sanitize
              } = opts
 
   if dumpArgs
@@ -476,13 +484,14 @@ main = do
         x:(tabFilter (spsToNextStop - 1) xs)
 
   let startParserState = 
-         defaultParserState { stateParseRaw    = parseRaw,
-                              stateTabStop     = tabStop, 
-                              stateStandalone  = standalone && (not strict),
-                              stateSmart       = smart || writerName' `elem` 
-                                                          ["latex", "context"],
-                              stateColumns     = columns,
-                              stateStrict      = strict }
+         defaultParserState { stateParseRaw     = parseRaw,
+                              stateTabStop      = tabStop, 
+                              stateSanitizeHTML = sanitize,
+                              stateStandalone   = standalone && (not strict),
+                              stateSmart        = smart || writerName' `elem` 
+                                                           ["latex", "context"],
+                              stateColumns      = columns,
+                              stateStrict       = strict }
   let csslink = if (css == "")
                    then "" 
                    else "<link rel=\"stylesheet\" href=\"" ++ css ++ 
diff --git a/README b/README
index cd6b30cda..0f17b3b5e 100644
--- a/README
+++ b/README
@@ -345,6 +345,11 @@ For further documentation, see the `pandoc(1)` man page.
 :   disables text-wrapping in output.  By default, text is wrapped
     appropriately for the output format.
 
+`--sanitize-html`
+:   sanitizes HTML (in markdown or HTML input) using a whitelist.
+    Unsafe tags are replaced by HTML comments; unsafe attributes
+    are omitted.
+
 `--dump-args`
 :   is intended to make it easier to create wrapper scripts that use
     Pandoc. It causes Pandoc to dump information about the arguments
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs
index 1fff4705f..42a085f63 100644
--- a/Text/Pandoc/Readers/HTML.hs
+++ b/Text/Pandoc/Readers/HTML.hs
@@ -75,10 +75,54 @@ blockHtmlTags = ["address", "blockquote", "center", "dir", "div",
                  "dt", "frameset", "li", "tbody", "td", "tfoot",
                  "th", "thead", "tr", "script"] ++ eitherBlockOrInline
 
+sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
+                "blockquote", "br", "button", "caption", "center",
+                "cite", "code", "col", "colgroup", "dd", "del", "dfn",
+                "dir", "div", "dl", "dt", "em", "fieldset", "font",
+                "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
+                "i", "img", "input", "ins", "kbd", "label", "legend",
+                "li", "map", "menu", "ol", "optgroup", "option", "p",
+                "pre", "q", "s", "samp", "select", "small", "span",
+                "strike", "strong", "sub", "sup", "table", "tbody",
+                "td", "textarea", "tfoot", "th", "thead", "tr", "tt",
+                "u", "ul", "var"]
+
+sanitaryAttributes = ["abbr", "accept", "accept-charset",
+                      "accesskey", "action", "align", "alt", "axis",
+                      "border", "cellpadding", "cellspacing", "char",
+                      "charoff", "charset", "checked", "cite", "class",
+                      "clear", "cols", "colspan", "color", "compact",
+                      "coords", "datetime", "dir", "disabled",
+                      "enctype", "for", "frame", "headers", "height",
+                      "href", "hreflang", "hspace", "id", "ismap",
+                      "label", "lang", "longdesc", "maxlength", "media",
+                      "method", "multiple", "name", "nohref", "noshade",
+                      "nowrap", "prompt", "readonly", "rel", "rev",
+                      "rows", "rowspan", "rules", "scope", "selected",
+                      "shape", "size", "span", "src", "start",
+                      "summary", "tabindex", "target", "title", "type",
+                      "usemap", "valign", "value", "vspace", "width"]
+
 --
 -- HTML utility functions
 --
 
+-- | Returns @True@ if sanitization is specified and the specified tag is 
+--  not on the sanitized tag list.
+unsanitaryTag tag = do
+  st <- getState
+  if stateSanitizeHTML st && not (tag `elem` sanitaryTags)
+     then return True
+     else return False
+
+-- | returns @True@ if sanitization is specified and the specified attribute
+--  is not on the sanitized attribute list.
+unsanitaryAttribute (attr, _, _) = do
+  st <- getState
+  if stateSanitizeHTML st && not (attr `elem` sanitaryAttributes)
+    then return True
+    else return False
+
 -- | Read blocks until end tag.
 blocksTilEnd tag = do
   blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
@@ -111,20 +155,28 @@ anyHtmlTag = try $ do
   let ender' = if null ender then "" else " /"
   spaces
   char '>'
-  return $ "<" ++ tag ++ 
-           concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
+  let result = "<" ++ tag ++ 
+               concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
+  unsanitary <- unsanitaryTag tag
+  if unsanitary
+     then return $ "<!-- unsafe tag " ++ result ++ " omitted -->"
+     else return result
 
 anyHtmlEndTag = try $ do
   char '<'   
   spaces
   char '/'
   spaces
-  tagType <- many1 alphaNum
+  tag <- many1 alphaNum
   spaces
   char '>'
-  return $ "</" ++ tagType ++ ">"
+  let result = "</" ++ tag ++ ">"
+  unsanitary <- unsanitaryTag tag
+  if unsanitary
+     then return $ "<!-- unsafe tag " ++ result ++ " omitted -->"
+     else return result 
 
-htmlTag :: String -> GenParser Char st (String, [(String, String)])
+htmlTag :: String -> GenParser Char ParserState (String, [(String, String)])
 htmlTag tag = try $ do
   char '<'
   spaces
@@ -142,7 +194,14 @@ quoted quoteChar = do
                     (many (noneOf [quoteChar]))
   return (result, [quoteChar])
 
-htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
+nullAttribute = ("", "", "") 
+
+htmlAttribute = do
+  attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
+  unsanitary <- unsanitaryAttribute attr
+  if unsanitary
+     then return nullAttribute
+     else return attr
 
 -- minimized boolean attribute
 htmlMinimizedAttribute = try $ do
@@ -183,7 +242,7 @@ isBlock tag = (extractTagType tag) `elem` blockHtmlTags
 
 anyHtmlBlockTag = try $ do
   tag <- anyHtmlTag <|> anyHtmlEndTag
-  if isBlock tag then return tag else fail "not a block tag"
+  if not (isInline tag) then return tag else fail "not a block tag"
 
 anyHtmlInlineTag = try $ do
   tag <- anyHtmlTag <|> anyHtmlEndTag
@@ -194,19 +253,25 @@ anyHtmlInlineTag = try $ do
 htmlScript = try $ do
   open <- string "<script"
   rest <- manyTill anyChar (htmlEndTag "script")
-  return $ open ++ rest ++ "</script>"
+  st <- getState
+  if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
+     then return "<!-- unsafe script omitted -->"
+     else return $ open ++ rest ++ "</script>"
 
 -- | Parses material between style tags.
 -- Style tags must be treated differently, because they can contain CSS
 htmlStyle = try $ do
   open <- string "<style"
   rest <- manyTill anyChar (htmlEndTag "style")
-  return $ open ++ rest ++ "</style>"
+  st <- getState
+  if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
+     then return "<!-- unsafe style omitted -->"
+     else return $ open ++ rest ++ "</style>"
 
 htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
 
 rawHtmlBlock = try $ do
-  body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag
+  body <- htmlBlockElement <|> anyHtmlBlockTag
   state <- getState
   if stateParseRaw state then return (RawHtml body) else return Null
 
@@ -235,8 +300,10 @@ definition = try $ do
   rest <- manyTill anyChar (char '>')
   return $ "<!" ++ rest ++ ">"
 
-nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >>
-  ((rawHtmlBlock >> return ' ') <|> anyChar)
+nonTitleNonHead = try $ do
+  notFollowedBy $ (htmlTag "title" >> return ' ') <|> 
+                  (htmlEndTag "head" >> return ' ')
+  (rawHtmlBlock >> return ' ') <|> anyChar
 
 parseTitle = try $ do
   (tag, _) <- htmlTag "title"
@@ -251,7 +318,7 @@ parseHead = try $ do
   skipMany nonTitleNonHead
   contents <- option [] parseTitle
   skipMany nonTitleNonHead
-  htmlTag "/head"
+  htmlEndTag "head"
   return (contents, [], "")
 
 skipHtmlTag tag = optional (htmlTag tag)
diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs
index 6455dcd9d..2d1fa7583 100644
--- a/Text/Pandoc/Readers/Markdown.hs
+++ b/Text/Pandoc/Readers/Markdown.hs
@@ -249,7 +249,7 @@ block = do
                    , blockQuote
                    , rawLaTeXEnvironment
                    , para
-                   , htmlBlock
+                   , rawHtmlBlocks
                    , plain
                    , nullBlock ]) <?> "block"
 
@@ -482,15 +482,12 @@ plain = many1 inline >>= return . Plain . normalizeSpaces
 
 htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
 
-htmlBlock = do
-  st <- getState
-  if stateStrict st
-    then try $ do failUnlessBeginningOfLine
-                  first <- htmlElement
-                  finalSpace <- many (oneOf spaceChars)
-                  finalNewlines <- many newline
-                  return $ RawHtml $ first ++ finalSpace ++ finalNewlines
-    else rawHtmlBlocks
+htmlBlock = try $ do
+    failUnlessBeginningOfLine
+    first <- htmlElement
+    finalSpace <- many (oneOf spaceChars)
+    finalNewlines <- many newline
+    return $ RawHtml $ first ++ finalSpace ++ finalNewlines
 
 -- True if tag is self-closing
 isSelfClosing tag = 
diff --git a/Text/Pandoc/Shared.hs b/Text/Pandoc/Shared.hs
index 7086ca452..477d86464 100644
--- a/Text/Pandoc/Shared.hs
+++ b/Text/Pandoc/Shared.hs
@@ -595,6 +595,7 @@ data ParserState = ParserState
     { stateParseRaw        :: Bool,          -- ^ Parse raw HTML and LaTeX?
       stateParserContext   :: ParserContext, -- ^ Inside list?
       stateQuoteContext    :: QuoteContext,  -- ^ Inside quoted environment?
+      stateSanitizeHTML    :: Bool,          -- ^ Sanitize HTML?
       stateKeys            :: KeyTable,      -- ^ List of reference keys
       stateNotes           :: NoteTable,     -- ^ List of notes
       stateTabStop         :: Int,           -- ^ Tab stop
@@ -614,6 +615,7 @@ defaultParserState =
     ParserState { stateParseRaw        = False,
                   stateParserContext   = NullState,
                   stateQuoteContext    = NoQuote,
+                  stateSanitizeHTML    = False,
                   stateKeys            = [],
                   stateNotes           = [],
                   stateTabStop         = 4,
diff --git a/man/man1/html2markdown.1.md b/man/man1/html2markdown.1.md
index 6c5d2dcc8..19d5104af 100644
--- a/man/man1/html2markdown.1.md
+++ b/man/man1/html2markdown.1.md
@@ -51,6 +51,10 @@ a complete list.  The following options are most relevant:
 \--no-wrap
 :   Disable text wrapping in output.  (Default is to wrap text.)
 
+\--sanitize-html
+:   Sanitizes HTML using a whitelist. Unsafe tags are replaced by HTML
+    comments; unsafe attributes are omitted.
+
 -H *FILE*, \--include-in-header=*FILE*
 :   Include contents of *FILE* at the end of the header.  Implies
     `-s`.
diff --git a/man/man1/pandoc.1.md b/man/man1/pandoc.1.md
index 37d3dc262..427004419 100644
--- a/man/man1/pandoc.1.md
+++ b/man/man1/pandoc.1.md
@@ -126,6 +126,11 @@ to Pandoc.  Or use `html2markdown`(1), a wrapper around `pandoc`.
 \--no-wrap
 :   Disable text wrapping in output. (Default is to wrap text.)
 
+\--sanitize-html
+:   Sanitizes HTML (in markdown or HTML input) using a whitelist.
+    Unsafe tags are replaced by HTML comments; unsafe attributes
+    are omitted.
+
 \--toc, \--table-of-contents
 :   Include an automatically generated table of contents (HTML, markdown,
     RTF) or an instruction to create one (LaTeX, reStructuredText).