pandoc/Main.hs
fiddlosopher 5df912b162 Added optional HTML sanitization using a whitelist.
When this option is specified (--sanitize-html on the command line),
unsafe HTML tags will be replaced by HTML comments, and unsafe HTML
attributes will be removed.  This option should be especially useful
for those who want to use pandoc libraries in web applications, where
users will provide the input.

+ Main.hs:  Added --sanitize-html option.
+ Text.Pandoc.Shared:  Added stateSanitizeHTML to ParserState.
+ Text.Pandoc.Readers.HTML:
  - Added whitelists of sanitaryTags and sanitaryAttributes.
  - Added parsers to check these lists (and state) to see if a given
    tag or attribute should be counted unsafe.
  - Modified anyHtmlTag and anyHtmlEndTag to replace unsafe tags
    with comments.
  - Modified htmlAttribute to remove unsafe attributes.
  - Modified htmlScript and htmlStyle to remove these elements if
    unsafe.
  - Modified rawHtmlBlock to use anyHtmlBlockTag instead of anyHtmlTag
    and anyHtmlEndTag.  This fixes a bug in markdown parsing, where
    inline tags would be included in raw HTML blocks.
  - Modified anyHtmlBlockTag to test for (not inline) rather than
    directly for block.  This allows us to handle e.g. docbook in
    the markdown reader.
  - Minor tweaks in nonTitleNonHead  and parseTitle.
+ Text.Pandoc.Readers.Markdown:
  - In non-strict mode use rawHtmlBlocks instead of htmlBlock.
    Simplified htmlBlock, since we know it's only called in strict
    mode.
+ Modified README and man pages to document new option.


git-svn-id: https://pandoc.googlecode.com/svn/trunk@1166 788f1e2b-df1e-0410-8736-df70ead52e1b
2008-01-03 21:32:32 +00:00

531 lines
21 KiB
Haskell

{-
Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Main
Copyright : Copyright (C) 2006-7 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley@edu>
Stability : alpha
Portability : portable
Parses command-line options and calls the appropriate readers and
writers.
-}
module Main where
import Text.Pandoc
import Text.Pandoc.UTF8
import Text.Pandoc.Shared ( joinWithSep, HTMLMathMethod (..) )
import Text.Regex ( mkRegex, matchRegex )
import System.Environment ( getArgs, getProgName, getEnvironment )
import System.Exit ( exitWith, ExitCode (..) )
import System.Console.GetOpt
import System.IO
import Data.Maybe ( fromMaybe )
import Data.List ( isPrefixOf )
import Data.Char ( toLower )
import Control.Monad ( (>>=) )
copyrightMessage :: String
copyrightMessage = "\nCopyright (C) 2006-7 John MacFarlane\n\
\Web: http://johnmacfarlane.net/pandoc\n\
\This is free software; see the source for copying conditions. There is no\n\
\warranty, not even for merchantability or fitness for a particular purpose."
-- | Association list of formats and readers.
readers :: [(String, ParserState -> String -> Pandoc)]
readers = [("native" , readPandoc)
,("markdown" , readMarkdown)
,("rst" , readRST)
,("html" , readHtml)
,("latex" , readLaTeX)
]
-- | Reader for native Pandoc format.
readPandoc :: ParserState -> String -> Pandoc
readPandoc state input = read input
-- | Association list of formats and pairs of writers and default headers.
writers :: [ ( String, ( WriterOptions -> Pandoc -> String, String ) ) ]
writers = [("native" , (writeDoc, ""))
,("html" , (writeHtmlString, ""))
,("s5" , (writeS5String, defaultS5Header))
,("docbook" , (writeDocbook, defaultDocbookHeader))
,("latex" , (writeLaTeX, defaultLaTeXHeader))
,("context" , (writeConTeXt, defaultConTeXtHeader))
,("man" , (writeMan, ""))
,("markdown" , (writeMarkdown, ""))
,("rst" , (writeRST, ""))
,("rtf" , (writeRTF, defaultRTFHeader))
]
-- | Writer for Pandoc native format.
writeDoc :: WriterOptions -> Pandoc -> String
writeDoc options = prettyPandoc
-- | Data structure for command line options.
data Opt = Opt
{ optPreserveTabs :: Bool -- ^ Convert tabs to spaces
, optTabStop :: Int -- ^ Number of spaces per tab
, optStandalone :: Bool -- ^ Include header, footer
, optReader :: String -- ^ Reader format
, optWriter :: String -- ^ Writer format
, optParseRaw :: Bool -- ^ Parse unconvertable HTML and TeX
, optCSS :: String -- ^ CSS file to link to
, optTableOfContents :: Bool -- ^ Include table of contents
, optIncludeInHeader :: String -- ^ File to include in header
, optIncludeBeforeBody :: String -- ^ File to include at top of body
, optIncludeAfterBody :: String -- ^ File to include at end of body
, optCustomHeader :: String -- ^ Custom header to use, or "DEFAULT"
, optTitlePrefix :: String -- ^ Optional prefix for HTML title
, optOutputFile :: String -- ^ Name of output file
, optNumberSections :: Bool -- ^ Number sections in LaTeX
, optIncremental :: Bool -- ^ Use incremental lists in S5
, optSmart :: Bool -- ^ Use smart typography
, optHTMLMathMethod :: HTMLMathMethod -- ^ Method to print HTML math
, optDumpArgs :: Bool -- ^ Output command-line arguments
, optIgnoreArgs :: Bool -- ^ Ignore command-line arguments
, optStrict :: Bool -- ^ Use strict markdown syntax
, optReferenceLinks :: Bool -- ^ Use reference links in writing markdown, rst
, optWrapText :: Bool -- ^ Wrap text
, optSanitizeHTML :: Bool -- ^ Sanitize HTML
}
-- | Defaults for command-line options.
defaultOpts :: Opt
defaultOpts = Opt
{ optPreserveTabs = False
, optTabStop = 4
, optStandalone = False
, optReader = "" -- null for default reader
, optWriter = "" -- null for default writer
, optParseRaw = False
, optCSS = ""
, optTableOfContents = False
, optIncludeInHeader = ""
, optIncludeBeforeBody = ""
, optIncludeAfterBody = ""
, optCustomHeader = "DEFAULT"
, optTitlePrefix = ""
, optOutputFile = "-" -- "-" means stdout
, optNumberSections = False
, optIncremental = False
, optSmart = False
, optHTMLMathMethod = PlainMath
, optDumpArgs = False
, optIgnoreArgs = False
, optStrict = False
, optReferenceLinks = False
, optWrapText = True
, optSanitizeHTML = False
}
-- | A list of functions, each transforming the options data structure
-- in response to a command-line option.
options :: [OptDescr (Opt -> IO Opt)]
options =
[ Option "fr" ["from","read"]
(ReqArg
(\arg opt -> return opt { optReader = map toLower arg })
"FORMAT")
"" -- ("(" ++ (joinWithSep ", " $ map fst readers) ++ ")")
, Option "tw" ["to","write"]
(ReqArg
(\arg opt -> return opt { optWriter = map toLower arg })
"FORMAT")
"" -- ("(" ++ (joinWithSep ", " $ map fst writers) ++ ")")
, Option "s" ["standalone"]
(NoArg
(\opt -> return opt { optStandalone = True }))
"" -- "Include needed header and footer on output"
, Option "o" ["output"]
(ReqArg
(\arg opt -> return opt { optOutputFile = arg })
"FILENAME")
"" -- "Name of output file"
, Option "p" ["preserve-tabs"]
(NoArg
(\opt -> return opt { optPreserveTabs = True }))
"" -- "Preserve tabs instead of converting to spaces"
, Option "" ["tab-stop"]
(ReqArg
(\arg opt -> return opt { optTabStop = (read arg) } )
"TABSTOP")
"" -- "Tab stop (default 4)"
, Option "" ["strict"]
(NoArg
(\opt -> return opt { optStrict = True } ))
"" -- "Disable markdown syntax extensions"
, Option "" ["reference-links"]
(NoArg
(\opt -> return opt { optReferenceLinks = True } ))
"" -- "Use reference links in parsing HTML"
, Option "R" ["parse-raw"]
(NoArg
(\opt -> return opt { optParseRaw = True }))
"" -- "Parse untranslatable HTML codes and LaTeX environments as raw"
, Option "S" ["smart"]
(NoArg
(\opt -> return opt { optSmart = True }))
"" -- "Use smart quotes, dashes, and ellipses"
, Option "m" ["asciimathml"]
(OptArg
(\arg opt -> return opt { optHTMLMathMethod =
ASCIIMathML arg })
"URL")
"" -- "Use ASCIIMathML script in html output"
, Option "" ["mimetex"]
(OptArg
(\arg opt -> return opt { optHTMLMathMethod = MimeTeX
(fromMaybe "/cgi-bin/mimetex.cgi" arg)})
"URL")
"" -- "Use mimetex for HTML math"
, Option "" ["gladtex"]
(NoArg
(\opt -> return opt { optHTMLMathMethod = GladTeX }))
"" -- "Use gladtex for HTML math"
, Option "i" ["incremental"]
(NoArg
(\opt -> return opt { optIncremental = True }))
"" -- "Make list items display incrementally in S5"
, Option "N" ["number-sections"]
(NoArg
(\opt -> return opt { optNumberSections = True }))
"" -- "Number sections in LaTeX"
, Option "" ["no-wrap"]
(NoArg
(\opt -> return opt { optWrapText = False }))
"" -- "Do not wrap text in output"
, Option "" ["sanitize-html"]
(NoArg
(\opt -> return opt { optSanitizeHTML = True }))
"" -- "Sanitize HTML"
, Option "" ["toc", "table-of-contents"]
(NoArg
(\opt -> return opt { optTableOfContents = True }))
"" -- "Include table of contents"
, Option "c" ["css"]
(ReqArg
(\arg opt -> return opt { optCSS = arg,
optStandalone = True })
"CSS")
"" -- "Link to CSS style sheet"
, Option "H" ["include-in-header"]
(ReqArg
(\arg opt -> do
let old = optIncludeInHeader opt
text <- readFile arg
return opt { optIncludeInHeader = old ++ fromUTF8 text,
optStandalone = True })
"FILENAME")
"" -- "File to include at end of header (implies -s)"
, Option "B" ["include-before-body"]
(ReqArg
(\arg opt -> do
let old = optIncludeBeforeBody opt
text <- readFile arg
return opt { optIncludeBeforeBody = old ++ fromUTF8 text })
"FILENAME")
"" -- "File to include before document body"
, Option "A" ["include-after-body"]
(ReqArg
(\arg opt -> do
let old = optIncludeAfterBody opt
text <- readFile arg
return opt { optIncludeAfterBody = old ++ fromUTF8 text })
"FILENAME")
"" -- "File to include after document body"
, Option "C" ["custom-header"]
(ReqArg
(\arg opt -> do
text <- readFile arg
return opt { optCustomHeader = fromUTF8 text,
optStandalone = True })
"FILENAME")
"" -- "File to use for custom header (implies -s)"
, Option "T" ["title-prefix"]
(ReqArg
(\arg opt -> return opt { optTitlePrefix = arg,
optStandalone = True })
"STRING")
"" -- "String to prefix to HTML window title"
, Option "D" ["print-default-header"]
(ReqArg
(\arg opt -> do
let header = case (lookup arg writers) of
Just (writer, head) -> head
Nothing -> error ("Unknown reader: " ++ arg)
hPutStr stdout header
exitWith ExitSuccess)
"FORMAT")
"" -- "Print default header for FORMAT"
, Option "" ["dump-args"]
(NoArg
(\opt -> return opt { optDumpArgs = True }))
"" -- "Print output filename and arguments to stdout."
, Option "" ["ignore-args"]
(NoArg
(\opt -> return opt { optIgnoreArgs = True }))
"" -- "Ignore command-line arguments."
, Option "v" ["version"]
(NoArg
(\_ -> do
prg <- getProgName
hPutStrLn stderr (prg ++ " " ++ pandocVersion ++
copyrightMessage)
exitWith $ ExitFailure 4))
"" -- "Print version"
, Option "h" ["help"]
(NoArg
(\_ -> do
prg <- getProgName
hPutStr stderr (usageMessage prg options)
exitWith $ ExitFailure 2))
"" -- "Show help"
]
-- Returns usage message
usageMessage :: String -> [OptDescr (Opt -> IO Opt)] -> String
usageMessage programName options = usageInfo
(programName ++ " [OPTIONS] [FILES]" ++ "\nInput formats: " ++
(joinWithSep ", " $ map fst readers) ++ "\nOutput formats: " ++
(joinWithSep ", " $ map fst writers) ++ "\nOptions:")
options
-- Determine default reader based on source file extensions
defaultReaderName :: [String] -> String
defaultReaderName [] = "markdown"
defaultReaderName (x:xs) =
let x' = map toLower x in
case (matchRegex (mkRegex ".*\\.(.*)") x') of
Nothing -> defaultReaderName xs -- no extension
Just ["xhtml"] -> "html"
Just ["html"] -> "html"
Just ["htm"] -> "html"
Just ["tex"] -> "latex"
Just ["latex"] -> "latex"
Just ["ltx"] -> "latex"
Just ["rst"] -> "rst"
Just ["native"] -> "native"
Just _ -> "markdown"
-- Determine default writer based on output file extension
defaultWriterName :: String -> String
defaultWriterName "-" = "html" -- no output file
defaultWriterName x =
let x' = map toLower x in
case (matchRegex (mkRegex ".*\\.(.*)") x') of
Nothing -> "markdown" -- no extension
Just [""] -> "markdown" -- empty extension
Just ["tex"] -> "latex"
Just ["latex"] -> "latex"
Just ["ltx"] -> "latex"
Just ["context"] -> "context"
Just ["ctx"] -> "context"
Just ["rtf"] -> "rtf"
Just ["rst"] -> "rst"
Just ["s5"] -> "s5"
Just ["native"] -> "native"
Just ["txt"] -> "markdown"
Just ["text"] -> "markdown"
Just ["md"] -> "markdown"
Just ["markdown"] -> "markdown"
Just ["db"] -> "docbook"
Just ["xml"] -> "docbook"
Just ["sgml"] -> "docbook"
Just [[x]] | x `elem` ['1'..'9'] -> "man"
Just _ -> "html"
main = do
rawArgs <- getArgs
prg <- getProgName
let compatMode = (prg == "hsmarkdown")
let (actions, args, errors) = if compatMode
then ([], rawArgs, [])
else getOpt Permute options rawArgs
if (not (null errors))
then do
name <- getProgName
mapM (\e -> hPutStrLn stderr e) errors
hPutStr stderr (usageMessage name options)
exitWith $ ExitFailure 2
else
return ()
let defaultOpts' = if compatMode
then defaultOpts { optReader = "markdown"
, optWriter = "html"
, optStrict = True }
else defaultOpts
-- thread option data structure through all supplied option actions
opts <- foldl (>>=) (return defaultOpts') actions
let Opt { optPreserveTabs = preserveTabs
, optTabStop = tabStop
, optStandalone = standalone
, optReader = readerName
, optWriter = writerName
, optParseRaw = parseRaw
, optCSS = css
, optTableOfContents = toc
, optIncludeInHeader = includeHeader
, optIncludeBeforeBody = includeBefore
, optIncludeAfterBody = includeAfter
, optCustomHeader = customHeader
, optTitlePrefix = titlePrefix
, optOutputFile = outputFile
, optNumberSections = numberSections
, optIncremental = incremental
, optSmart = smart
, optHTMLMathMethod = mathMethod
, optDumpArgs = dumpArgs
, optIgnoreArgs = ignoreArgs
, optStrict = strict
, optReferenceLinks = referenceLinks
, optWrapText = wrap
, optSanitizeHTML = sanitize
} = opts
if dumpArgs
then do
hPutStrLn stdout outputFile
mapM (\arg -> hPutStrLn stdout arg) args
exitWith $ ExitSuccess
else return ()
let sources = if ignoreArgs then [] else args
-- assign reader and writer based on options and filenames
let readerName' = if null readerName
then defaultReaderName sources
else readerName
let writerName' = if null writerName
then defaultWriterName outputFile
else writerName
reader <- case (lookup readerName' readers) of
Just r -> return r
Nothing -> error ("Unknown reader: " ++ readerName')
(writer, defaultHeader) <- case (lookup writerName' writers) of
Just (w,h) -> return (w, h)
Nothing -> error ("Unknown writer: " ++ writerName')
output <- if (outputFile == "-")
then return stdout
else openFile outputFile WriteMode
environment <- getEnvironment
let columns = case lookup "COLUMNS" environment of
Just cols -> read cols
Nothing -> stateColumns defaultParserState
let tabFilter _ [] = ""
tabFilter _ ('\n':xs) = '\n':(tabFilter tabStop xs)
-- remove DOS line endings
tabFilter _ ('\r':'\n':xs) = '\n':(tabFilter tabStop xs)
tabFilter _ ('\r':xs) = '\n':(tabFilter tabStop xs)
tabFilter spsToNextStop ('\t':xs) =
if preserveTabs
then '\t':(tabFilter tabStop xs)
else replicate spsToNextStop ' ' ++ tabFilter tabStop xs
tabFilter 1 (x:xs) =
x:(tabFilter tabStop xs)
tabFilter spsToNextStop (x:xs) =
x:(tabFilter (spsToNextStop - 1) xs)
let startParserState =
defaultParserState { stateParseRaw = parseRaw,
stateTabStop = tabStop,
stateSanitizeHTML = sanitize,
stateStandalone = standalone && (not strict),
stateSmart = smart || writerName' `elem`
["latex", "context"],
stateColumns = columns,
stateStrict = strict }
let csslink = if (css == "")
then ""
else "<link rel=\"stylesheet\" href=\"" ++ css ++
"\" type=\"text/css\" media=\"all\" />\n"
let header = (if (customHeader == "DEFAULT")
then defaultHeader
else customHeader) ++ csslink ++ includeHeader
let writerOptions = WriterOptions { writerStandalone = standalone &&
(not strict),
writerHeader = header,
writerTitlePrefix = titlePrefix,
writerTabStop = tabStop,
writerTableOfContents = toc &&
(not strict) &&
writerName/="s5",
writerHTMLMathMethod = mathMethod,
writerS5 = (writerName=="s5"),
writerIgnoreNotes = False,
writerIncremental = incremental,
writerNumberSections = numberSections,
writerIncludeBefore = includeBefore,
writerIncludeAfter = includeAfter,
writerStrictMarkdown = strict,
writerReferenceLinks = referenceLinks,
writerWrapText = wrap }
(readSources sources) >>= (hPutStrLn output . toUTF8 .
(writer writerOptions) .
(reader startParserState) . tabFilter tabStop .
fromUTF8 . (joinWithSep "\n")) >>
hClose output
where
readSources [] = mapM readSource ["-"]
readSources sources = mapM readSource sources
readSource "-" = getContents
readSource source = readFile source