From 8624ed9bd3c38c1907070a3b7de244fd487976c4 Mon Sep 17 00:00:00 2001
From: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>
Date: Sat, 22 Mar 2008 20:41:56 +0000
Subject: [PATCH] The '--sanitize-html' option now examines URIs in markdown
 links and images, and in HTML href and src attributes.  If the URI scheme is
 not on a whitelist of safe schemes, it is rejected.  The main point is to
 prevent cross-site scripting attacks using 'javascript:' URIs. See
 http://www.mail-archive.com/markdown-discuss@six.pairlist.net/msg01186.html
 and http://ha.ckers.org/xss.html.  Resolves Issue #62.

git-svn-id: https://pandoc.googlecode.com/svn/trunk@1262 788f1e2b-df1e-0410-8736-df70ead52e1b
---
 README                          |  3 ++-
 Text/Pandoc/Readers/HTML.hs     | 32 ++++++++++++++++++++++++--------
 Text/Pandoc/Readers/Markdown.hs | 16 +++++++++++-----
 man/man1/html2markdown.1.md     |  4 ----
 man/man1/pandoc.1.md            |  3 ++-
 5 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/README b/README
index e54821ba3..75d482333 100644
--- a/README
+++ b/README
@@ -356,7 +356,8 @@ For further documentation, see the `pandoc(1)` man page.
 `--sanitize-html`
 :   sanitizes HTML (in markdown or HTML input) using a whitelist.
     Unsafe tags are replaced by HTML comments; unsafe attributes
-    are omitted.
+    are omitted. URIs in links and images are also checked against a
+    whitelist of URI schemes.
 
 `--dump-args`
 :   is intended to make it easier to create wrapper scripts that use
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs
index 359ff3021..7bd76d983 100644
--- a/Text/Pandoc/Readers/HTML.hs
+++ b/Text/Pandoc/Readers/HTML.hs
@@ -37,7 +37,8 @@ module Text.Pandoc.Readers.HTML (
                                  anyHtmlEndTag,
                                  htmlEndTag,
                                  extractTagType,
-                                 htmlBlockElement 
+                                 htmlBlockElement,
+                                 unsanitaryURI
                                 ) where
 
 import Text.ParserCombinators.Parsec
@@ -47,6 +48,7 @@ import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
 import Data.Maybe ( fromMaybe )
 import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf )
 import Data.Char ( toLower, isAlphaNum )
+import Network.URI ( parseURIReference, URI (..) )
 
 -- | Convert HTML-formatted string to 'Pandoc' document.
 readHtml :: ParserState   -- ^ Parser state
@@ -110,17 +112,31 @@ sanitaryAttributes = ["abbr", "accept", "accept-charset",
 --  not on the sanitized tag list.
 unsanitaryTag tag = do
   st <- getState
-  if stateSanitizeHTML st && not (tag `elem` sanitaryTags)
-     then return True
-     else return False
+  return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
 
 -- | returns @True@ if sanitization is specified and the specified attribute
 --  is not on the sanitized attribute list.
-unsanitaryAttribute (attr, _, _) = do
+unsanitaryAttribute (attr, val, _) = do
   st <- getState
-  if stateSanitizeHTML st && not (attr `elem` sanitaryAttributes)
-    then return True
-    else return False
+  return $ stateSanitizeHTML st &&
+           (attr `notElem` sanitaryAttributes ||
+             (attr `elem` ["href","src"] && unsanitaryURI val))
+
+-- | Returns @True@ if the specified URI is potentially a security risk.
+-- Unparseable URIs are unsafe; the colon in 'uriScheme' (@"http:"@) is dropped.
+unsanitaryURI uri =
+  let safeURISchemes = [ "", "http", "https", "ftp", "mailto", "file",
+             "telnet", "gopher", "aaa", "aaas", "acap", "cap", "cid",
+             "crid", "dav", "dict", "dns", "fax", "go", "h323", "im",
+             "imap", "ldap", "mid", "news", "nfs", "nntp", "pop",
+             "pres", "sip", "sips", "snmp", "tel", "urn", "wais",
+             "xmpp", "z39.50r", "z39.50s", "aim", "callto", "cvs",
+             "ed2k", "feed", "fish", "gg", "irc", "ircs", "lastfm",
+             "ldaps", "magnet", "mms", "msnim", "notes", "rsync",
+             "secondlife", "skype", "ssh", "sftp", "smb", "sms",
+             "snews", "webcal", "ymsgr"]
+      scheme = takeWhile (/= ':') . map toLower . uriScheme
+  in  maybe True ((`notElem` safeURISchemes) . scheme) (parseURIReference uri)
 
 -- | Read blocks until end tag.
 blocksTilEnd tag = do
diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs
index e6f09f97a..2dbf9e189 100644
--- a/Text/Pandoc/Readers/Markdown.hs
+++ b/Text/Pandoc/Readers/Markdown.hs
@@ -41,7 +41,7 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
 import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, 
                                   anyHtmlInlineTag, anyHtmlTag,
                                   anyHtmlEndTag, htmlEndTag, extractTagType,
-                                  htmlBlockElement )
+                                  htmlBlockElement, unsanitaryURI )
 import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
 import Text.ParserCombinators.Parsec
 
@@ -921,7 +921,10 @@ linkTitle = try $ do
 link = try $ do
   label <- reference
   src <- source <|> referenceLink label
-  return $ Link label src
+  sanitize <- getState >>= return . stateSanitizeHTML
+  if sanitize && unsanitaryURI (fst src)
+     then fail "Unsanitary URI"
+     else return $ Link label src
 
 -- a link like [this][ref] or [this][] or [this]
 referenceLink label = do
@@ -941,9 +944,12 @@ autoLink = try $ do
                 then drop 7 src
                 else src 
   st <- getState
-  return $ if stateStrict st
-              then Link [Str src'] (src, "")
-              else Link [Code src'] (src, "")
+  let sanitize = stateSanitizeHTML st
+  if sanitize && unsanitaryURI src
+     then fail "Unsanitary URI"
+     else return $ if stateStrict st
+                      then Link [Str src'] (src, "")
+                      else Link [Code src'] (src, "")
 
 image = try $ do
   char '!'
diff --git a/man/man1/html2markdown.1.md b/man/man1/html2markdown.1.md
index 1db37cf47..905bdd0d0 100644
--- a/man/man1/html2markdown.1.md
+++ b/man/man1/html2markdown.1.md
@@ -51,10 +51,6 @@ a complete list.  The following options are most relevant:
 \--no-wrap
 :   Disable text wrapping in output.  (Default is to wrap text.)
 
-\--sanitize-html
-:   Sanitizes HTML using a whitelist. Unsafe tags are replaced by HTML
-    comments; unsafe attributes are omitted.
-
 -H *FILE*, \--include-in-header=*FILE*
 :   Include contents of *FILE* at the end of the header.  Implies
     `-s`.
diff --git a/man/man1/pandoc.1.md b/man/man1/pandoc.1.md
index 5bf734d5a..e3ca8e591 100644
--- a/man/man1/pandoc.1.md
+++ b/man/man1/pandoc.1.md
@@ -128,7 +128,8 @@ to Pandoc.  Or use `html2markdown`(1), a wrapper around `pandoc`.
 \--sanitize-html
 :   Sanitizes HTML (in markdown or HTML input) using a whitelist.
     Unsafe tags are replaced by HTML comments; unsafe attributes
-    are omitted.
+    are omitted.  URIs in links and images are also checked against a
+    whitelist of URI schemes.
 
 \--toc, \--table-of-contents
 :   Include an automatically generated table of contents (HTML, markdown,