diff --git a/Main.hs b/Main.hs index 5938fefc4..98bc0897f 100644 --- a/Main.hs +++ b/Main.hs @@ -104,6 +104,7 @@ data Opt = Opt , optStrict :: Bool -- ^ Use strict markdown syntax , optReferenceLinks :: Bool -- ^ Use reference links in writing markdown, rst , optWrapText :: Bool -- ^ Wrap text + , optSanitizeHTML :: Bool -- ^ Sanitize HTML } -- | Defaults for command-line options. @@ -132,6 +133,7 @@ defaultOpts = Opt , optStrict = False , optReferenceLinks = False , optWrapText = True + , optSanitizeHTML = False } -- | A list of functions, each transforming the options data structure @@ -226,6 +228,11 @@ options = (\opt -> return opt { optWrapText = False })) "" -- "Do not wrap text in output" + , Option "" ["sanitize-html"] + (NoArg + (\opt -> return opt { optSanitizeHTML = True })) + "" -- "Sanitize HTML" + , Option "" ["toc", "table-of-contents"] (NoArg (\opt -> return opt { optTableOfContents = True })) @@ -424,6 +431,7 @@ main = do , optStrict = strict , optReferenceLinks = referenceLinks , optWrapText = wrap + , optSanitizeHTML = sanitize } = opts if dumpArgs @@ -476,13 +484,14 @@ main = do x:(tabFilter (spsToNextStop - 1) xs) let startParserState = - defaultParserState { stateParseRaw = parseRaw, - stateTabStop = tabStop, - stateStandalone = standalone && (not strict), - stateSmart = smart || writerName' `elem` - ["latex", "context"], - stateColumns = columns, - stateStrict = strict } + defaultParserState { stateParseRaw = parseRaw, + stateTabStop = tabStop, + stateSanitizeHTML = sanitize, + stateStandalone = standalone && (not strict), + stateSmart = smart || writerName' `elem` + ["latex", "context"], + stateColumns = columns, + stateStrict = strict } let csslink = if (css == "") then "" else ">~ spaces) (htmlEndTag tag) @@ -111,20 +155,28 @@ anyHtmlTag = try $ do let ender' = if null ender then "" else " /" spaces char '>' - return $ "<" ++ tag ++ - concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" + let result = "<" ++ tag ++ + concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" + unsanitary <- unsanitaryTag tag + if unsanitary + then return $ "" + else return result anyHtmlEndTag = try $ do char '<' spaces char '/' spaces - tagType <- many1 alphaNum + tag <- many1 alphaNum spaces char '>' - return $ "" + let result = "" + unsanitary <- unsanitaryTag tag + if unsanitary + then return $ "" + else return result -htmlTag :: String -> GenParser Char st (String, [(String, String)]) +htmlTag :: String -> GenParser Char ParserState (String, [(String, String)]) htmlTag tag = try $ do char '<' spaces @@ -142,7 +194,14 @@ quoted quoteChar = do (many (noneOf [quoteChar])) return (result, [quoteChar]) -htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute +nullAttribute = ("", "", "") + +htmlAttribute = do + attr <- htmlRegularAttribute <|> htmlMinimizedAttribute + unsanitary <- unsanitaryAttribute attr + if unsanitary + then return nullAttribute + else return attr -- minimized boolean attribute htmlMinimizedAttribute = try $ do @@ -183,7 +242,7 @@ isBlock tag = (extractTagType tag) `elem` blockHtmlTags anyHtmlBlockTag = try $ do tag <- anyHtmlTag <|> anyHtmlEndTag - if isBlock tag then return tag else fail "not a block tag" + if not (isInline tag) then return tag else fail "not a block tag" anyHtmlInlineTag = try $ do tag <- anyHtmlTag <|> anyHtmlEndTag @@ -194,19 +253,25 @@ anyHtmlInlineTag = try $ do htmlScript = try $ do open <- string "" + st <- getState + if stateSanitizeHTML st && not ("script" `elem` sanitaryTags) + then return "" + else return $ open ++ rest ++ "" -- | Parses material between style tags. -- Style tags must be treated differently, because they can contain CSS htmlStyle = try $ do open <- string "" + st <- getState + if stateSanitizeHTML st && not ("style" `elem` sanitaryTags) + then return "" + else return $ open ++ rest ++ "" htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ] rawHtmlBlock = try $ do - body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag + body <- htmlBlockElement <|> anyHtmlBlockTag state <- getState if stateParseRaw state then return (RawHtml body) else return Null @@ -235,8 +300,10 @@ definition = try $ do rest <- manyTill anyChar (char '>') return $ "" -nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >> - ((rawHtmlBlock >> return ' ') <|> anyChar) +nonTitleNonHead = try $ do + notFollowedBy $ (htmlTag "title" >> return ' ') <|> + (htmlEndTag "head" >> return ' ') + (rawHtmlBlock >> return ' ') <|> anyChar parseTitle = try $ do (tag, _) <- htmlTag "title" @@ -251,7 +318,7 @@ parseHead = try $ do skipMany nonTitleNonHead contents <- option [] parseTitle skipMany nonTitleNonHead - htmlTag "/head" + htmlEndTag "head" return (contents, [], "") skipHtmlTag tag = optional (htmlTag tag) diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs index 6455dcd9d..2d1fa7583 100644 --- a/Text/Pandoc/Readers/Markdown.hs +++ b/Text/Pandoc/Readers/Markdown.hs @@ -249,7 +249,7 @@ block = do , blockQuote , rawLaTeXEnvironment , para - , htmlBlock + , rawHtmlBlocks , plain , nullBlock ]) "block" @@ -482,15 +482,12 @@ plain = many1 inline >>= return . Plain . normalizeSpaces htmlElement = strictHtmlBlock <|> htmlBlockElement "html element" -htmlBlock = do - st <- getState - if stateStrict st - then try $ do failUnlessBeginningOfLine - first <- htmlElement - finalSpace <- many (oneOf spaceChars) - finalNewlines <- many newline - return $ RawHtml $ first ++ finalSpace ++ finalNewlines - else rawHtmlBlocks +htmlBlock = try $ do + failUnlessBeginningOfLine + first <- htmlElement + finalSpace <- many (oneOf spaceChars) + finalNewlines <- many newline + return $ RawHtml $ first ++ finalSpace ++ finalNewlines -- True if tag is self-closing isSelfClosing tag = diff --git a/Text/Pandoc/Shared.hs b/Text/Pandoc/Shared.hs index 7086ca452..477d86464 100644 --- a/Text/Pandoc/Shared.hs +++ b/Text/Pandoc/Shared.hs @@ -595,6 +595,7 @@ data ParserState = ParserState { stateParseRaw :: Bool, -- ^ Parse raw HTML and LaTeX? stateParserContext :: ParserContext, -- ^ Inside list? stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment? + stateSanitizeHTML :: Bool, -- ^ Sanitize HTML? stateKeys :: KeyTable, -- ^ List of reference keys stateNotes :: NoteTable, -- ^ List of notes stateTabStop :: Int, -- ^ Tab stop @@ -614,6 +615,7 @@ defaultParserState = ParserState { stateParseRaw = False, stateParserContext = NullState, stateQuoteContext = NoQuote, + stateSanitizeHTML = False, stateKeys = [], stateNotes = [], stateTabStop = 4, diff --git a/man/man1/html2markdown.1.md b/man/man1/html2markdown.1.md index 6c5d2dcc8..19d5104af 100644 --- a/man/man1/html2markdown.1.md +++ b/man/man1/html2markdown.1.md @@ -51,6 +51,10 @@ a complete list. The following options are most relevant: \--no-wrap : Disable text wrapping in output. (Default is to wrap text.) +\--sanitize-html +: Sanitizes HTML using a whitelist. Unsafe tags are replaced by HTML + comments; unsafe attributes are omitted. + -H *FILE*, \--include-in-header=*FILE* : Include contents of *FILE* at the end of the header. Implies `-s`. diff --git a/man/man1/pandoc.1.md b/man/man1/pandoc.1.md index 37d3dc262..427004419 100644 --- a/man/man1/pandoc.1.md +++ b/man/man1/pandoc.1.md @@ -126,6 +126,11 @@ to Pandoc. Or use `html2markdown`(1), a wrapper around `pandoc`. \--no-wrap : Disable text wrapping in output. (Default is to wrap text.) +\--sanitize-html +: Sanitizes HTML (in markdown or HTML input) using a whitelist. + Unsafe tags are replaced by HTML comments; unsafe attributes + are omitted. + \--toc, \--table-of-contents : Include an automatically generated table of contents (HTML, markdown, RTF) or an instruction to create one (LaTeX, reStructuredText).