-- pandoc/src/Text/Pandoc/Readers/Markdown.hs


{-
Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.Markdown
Copyright : Copyright (C) 2006-2010 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Conversion of markdown-formatted plain text to 'Pandoc' document.
-}
module Text.Pandoc.Readers.Markdown ( readMarkdown ) where
import Data.List ( transpose, sortBy, findIndex, intercalate )
import qualified Data.Map as M
import Data.Ord ( comparing )
import Data.Char ( isAlphaNum )
import Data.Maybe
import Text.Pandoc.Definition
import Text.Pandoc.Generic
import Text.Pandoc.Shared
import Text.Pandoc.Parsing
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' )
import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag,
isTextTag, isCommentTag )
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
import Text.ParserCombinators.Parsec
import Control.Monad (when, liftM, guard)
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match (tagOpen)
-- | Read markdown from an input string and return a Pandoc document.
readMarkdown :: ParserState -- ^ Parser state, including options for parser
-> String -- ^ String to parse (assuming @'\n'@ line endings)
-> Pandoc
readMarkdown state s = (readWith parseMarkdown) state (s ++ "\n\n")
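-- An illustrative usage sketch (assumes 'defaultParserState' from
-- Text.Pandoc.Parsing is in scope):
--
-- > readMarkdown defaultParserState "Hello *world*"
-- >   -- Pandoc (Meta [] [] []) [Para [Str "Hello",Space,Emph [Str "world"]]]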
--
-- Constants and data structure definitions
--
isBulletListMarker :: Char -> Bool
isBulletListMarker '*' = True
isBulletListMarker '+' = True
isBulletListMarker '-' = True
isBulletListMarker _ = False
isHruleChar :: Char -> Bool
isHruleChar '*' = True
isHruleChar '-' = True
isHruleChar '_' = True
isHruleChar _ = False
setextHChars :: [Char]
setextHChars = "=-"
isBlank :: Char -> Bool
isBlank ' ' = True
isBlank '\t' = True
isBlank '\n' = True
isBlank _ = False
--
-- auxiliary functions
--
indentSpaces :: GenParser Char ParserState [Char]
indentSpaces = try $ do
state <- getState
let tabStop = stateTabStop state
count tabStop (char ' ') <|>
string "\t" <?> "indentation"
nonindentSpaces :: GenParser Char ParserState [Char]
nonindentSpaces = do
state <- getState
let tabStop = stateTabStop state
sps <- many (char ' ')
if length sps < tabStop
then return sps
else unexpected "indented line"
skipNonindentSpaces :: GenParser Char ParserState ()
skipNonindentSpaces = do
state <- getState
atMostSpaces (stateTabStop state - 1)
atMostSpaces :: Int -> GenParser Char ParserState ()
atMostSpaces 0 = notFollowedBy (char ' ')
atMostSpaces n = (char ' ' >> atMostSpaces (n-1)) <|> return ()
-- | Fail unless we're at beginning of a line.
failUnlessBeginningOfLine :: GenParser tok st ()
failUnlessBeginningOfLine = do
pos <- getPosition
if sourceColumn pos == 1 then return () else fail "not beginning of line"
-- | Parse a sequence of inline elements between square brackets,
-- including inlines between balanced pairs of square brackets.
inlinesInBalancedBrackets :: GenParser Char ParserState Inline
-> GenParser Char ParserState [Inline]
inlinesInBalancedBrackets parser = try $ do
char '['
result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser
guard (res == "[")
bal <- inlinesInBalancedBrackets parser
return $ [Str "["] ++ bal ++ [Str "]"])
<|> (count 1 parser))
(char ']')
return $ concat result
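-- For illustration: applied to the input "[a [b] c]" (with 'inline' as the
-- argument parser and no reference key named "b"), this consumes the outer
-- brackets and yields the inlines for "a [b] c", keeping the inner bracket
-- pair as literal Str "[" and Str "]" tokens.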
--
-- document structure
--
titleLine :: GenParser Char ParserState [Inline]
titleLine = try $ do
char '%'
skipSpaces
res <- many $ (notFollowedBy newline >> inline)
<|> try (endline >> whitespace)
newline
return $ normalizeSpaces res
authorsLine :: GenParser Char ParserState [[Inline]]
authorsLine = try $ do
char '%'
skipSpaces
authors <- sepEndBy (many (notFollowedBy (satisfy $ \c ->
c == ';' || c == '\n') >> inline))
(char ';' <|>
try (newline >> notFollowedBy blankline >> spaceChar))
newline
return $ filter (not . null) $ map normalizeSpaces authors
dateLine :: GenParser Char ParserState [Inline]
dateLine = try $ do
char '%'
skipSpaces
date <- manyTill inline newline
return $ normalizeSpaces date
titleBlock :: GenParser Char ParserState ([Inline], [[Inline]], [Inline])
titleBlock = try $ do
failIfStrict
title <- option [] titleLine
author <- option [] authorsLine
date <- option [] dateLine
optional blanklines
return (title, author, date)
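-- An illustrative title block in pandoc's extended syntax (rejected in
-- strict mode by failIfStrict above):
--
-- > % My Document Title
-- > % Author One; Author Two
-- > % July 4, 2011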
parseMarkdown :: GenParser Char ParserState Pandoc
parseMarkdown = do
-- markdown allows raw HTML
updateState (\state -> state { stateParseRaw = True })
startPos <- getPosition
-- go through once just to get list of reference keys and notes
-- docMinusKeys is the raw document with blanks where the keys/notes were...
st <- getState
let firstPassParser = referenceKey
<|> (if stateStrict st then pzero else noteBlock)
<|> lineClump
docMinusKeys <- liftM concat $ manyTill firstPassParser eof
setInput docMinusKeys
setPosition startPos
st' <- getState
let reversedNotes = stateNotes st'
updateState $ \s -> s { stateNotes = reverse reversedNotes }
-- now parse it for real...
(title, author, date) <- option ([],[],[]) titleBlock
blocks <- parseBlocks
let doc = Pandoc (Meta title author date) $ filter (/= Null) blocks
-- if there are labeled examples, change references into numbers
examples <- liftM stateExamples getState
let handleExampleRef :: Inline -> Inline
handleExampleRef z@(Str ('@':xs)) =
case M.lookup xs examples of
Just n -> Str (show n)
Nothing -> z
handleExampleRef z = z
if M.null examples
then return doc
else return $ bottomUp handleExampleRef doc
--
-- initial pass for references and notes
--
referenceKey :: GenParser Char ParserState [Char]
referenceKey = try $ do
startPos <- getPosition
skipNonindentSpaces
lab <- reference
char ':'
skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[')
let nl = char '\n' >> notFollowedBy blankline >> return ' '
let sourceURL = liftM unwords $ many $ try $ do
notFollowedBy' referenceTitle
skipMany spaceChar
optional nl
skipMany spaceChar
notFollowedBy' reference
many1 (satisfy $ not . isBlank)
let betweenAngles = try $ char '<' >>
manyTill (noneOf ">\n" <|> nl) (char '>')
src <- try betweenAngles <|> sourceURL
tit <- option "" referenceTitle
blanklines
endPos <- getPosition
let target = (escapeURI $ removeTrailingSpace src, tit)
st <- getState
let oldkeys = stateKeys st
updateState $ \s -> s { stateKeys = M.insert (toKey lab) target oldkeys }
-- return blanks so line count isn't affected
return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
referenceTitle :: GenParser Char st String
referenceTitle = try $ do
skipSpaces >> optional newline >> skipSpaces
tit <- (charsInBalanced '(' ')' >>= return . unwords . words)
<|> do delim <- char '\'' <|> char '"'
manyTill anyChar (try (char delim >> skipSpaces >>
notFollowedBy (noneOf ")\n")))
return $ decodeCharacterReferences tit
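-- Illustrative reference keys of the kind recognized by 'referenceKey' and
-- 'referenceTitle':
--
-- > [label]:  http://example.com/page  "An optional title"
-- > [label2]: <http://example.com/other>  'Single-quoted title'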
noteMarker :: GenParser Char ParserState [Char]
noteMarker = string "[^" >> many1Till (satisfy $ not . isBlank) (char ']')
rawLine :: GenParser Char ParserState [Char]
rawLine = do
notFollowedBy blankline
notFollowedBy' $ try $ skipNonindentSpaces >> noteMarker
contents <- many1 nonEndline
end <- option "" (newline >> optional indentSpaces >> return "\n")
return $ contents ++ end
rawLines :: GenParser Char ParserState [Char]
rawLines = many1 rawLine >>= return . concat
noteBlock :: GenParser Char ParserState [Char]
noteBlock = try $ do
startPos <- getPosition
skipNonindentSpaces
ref <- noteMarker
char ':'
optional blankline
optional indentSpaces
raw <- sepBy rawLines (try (blankline >> indentSpaces))
optional blanklines
endPos <- getPosition
let newnote = (ref, (intercalate "\n" raw) ++ "\n\n")
st <- getState
let oldnotes = stateNotes st
updateState $ \s -> s { stateNotes = newnote : oldnotes }
-- return blanks so line count isn't affected
return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
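-- An illustrative note block gathered by this first pass:
--
-- > [^longnote]: The note text starts here.
-- >
-- >     Indented paragraphs continue the same note.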
--
-- parsing blocks
--
parseBlocks :: GenParser Char ParserState [Block]
parseBlocks = manyTill block eof
block :: GenParser Char ParserState Block
block = do
st <- getState
choice (if stateStrict st
then [ header
, codeBlockIndented
, blockQuote
, hrule
, bulletList
, orderedList
, htmlBlock
, para
, plain
, nullBlock ]
else [ codeBlockDelimited
, macro
, header
, table
, codeBlockIndented
, lhsCodeBlock
, blockQuote
, hrule
, bulletList
, orderedList
, definitionList
, rawTeXBlock
, para
, rawHtmlBlocks
, plain
, nullBlock ]) <?> "block"
--
-- header blocks
--
header :: GenParser Char ParserState Block
header = setextHeader <|> atxHeader <?> "header"
atxHeader :: GenParser Char ParserState Block
atxHeader = try $ do
level <- many1 (char '#') >>= return . length
notFollowedBy (char '.' <|> char ')') -- this would be a list
skipSpaces
text <- manyTill inline atxClosing >>= return . normalizeSpaces
return $ Header level text
atxClosing :: GenParser Char st [Char]
atxClosing = try $ skipMany (char '#') >> blanklines
setextHeader :: GenParser Char ParserState Block
setextHeader = try $ do
-- This lookahead prevents us from wasting time parsing Inlines
-- unless necessary -- it gives a significant performance boost.
lookAhead $ anyLine >> many1 (oneOf setextHChars) >> blankline
text <- many1Till inline newline
underlineChar <- oneOf setextHChars
many (char underlineChar)
blanklines
let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1
return $ Header level (normalizeSpaces text)
--
-- hrule block
--
hrule :: GenParser Char st Block
hrule = try $ do
skipSpaces
start <- satisfy isHruleChar
count 2 (skipSpaces >> char start)
skipMany (spaceChar <|> char start)
newline
optional blanklines
return HorizontalRule
--
-- code blocks
--
indentedLine :: GenParser Char ParserState [Char]
indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n")
codeBlockDelimiter :: Maybe Int
-> GenParser Char st (Int, ([Char], [[Char]], [([Char], [Char])]))
codeBlockDelimiter len = try $ do
size <- case len of
Just l -> count l (char '~') >> many (char '~') >> return l
Nothing -> count 3 (char '~') >> many (char '~') >>=
return . (+ 3) . length
many spaceChar
attr <- option ([],[],[]) attributes
blankline
return (size, attr)
attributes :: GenParser Char st ([Char], [[Char]], [([Char], [Char])])
attributes = try $ do
char '{'
many spaceChar
attrs <- many (attribute >>~ many spaceChar)
char '}'
let (ids, classes, keyvals) = unzip3 attrs
let id' = if null ids then "" else head ids
return (id', concat classes, concat keyvals)
attribute :: GenParser Char st ([Char], [[Char]], [([Char], [Char])])
attribute = identifierAttr <|> classAttr <|> keyValAttr
identifier :: GenParser Char st [Char]
identifier = do
first <- letter
rest <- many $ alphaNum <|> oneOf "-_:."
return (first:rest)
identifierAttr :: GenParser Char st ([Char], [a], [a1])
identifierAttr = try $ do
char '#'
result <- identifier
return (result,[],[])
classAttr :: GenParser Char st ([Char], [[Char]], [a])
classAttr = try $ do
char '.'
result <- identifier
return ("",[result],[])
keyValAttr :: GenParser Char st ([Char], [a], [([Char], [Char])])
keyValAttr = try $ do
key <- identifier
char '='
char '"'
val <- manyTill (satisfy (/='\n')) (char '"')
return ("",[],[(key,val)])
codeBlockDelimited :: GenParser Char st Block
codeBlockDelimited = try $ do
(size, attr) <- codeBlockDelimiter Nothing
contents <- manyTill anyLine (codeBlockDelimiter (Just size))
blanklines
return $ CodeBlock attr $ intercalate "\n" contents
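-- An illustrative delimited code block with an attribute list, as accepted
-- by 'codeBlockDelimiter' and 'attributes':
--
-- > ~~~~ {#mycode .haskell .numberLines startFrom="10"}
-- > qsort []     = []
-- > qsort (x:xs) = qsort [y | y <- xs, y < x] ++ [x] ++ qsort [y | y <- xs, y >= x]
-- > ~~~~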
codeBlockIndented :: GenParser Char ParserState Block
codeBlockIndented = do
contents <- many1 (indentedLine <|>
try (do b <- blanklines
l <- indentedLine
return $ b ++ l))
optional blanklines
st <- getState
return $ CodeBlock ("", stateIndentedCodeClasses st, []) $
stripTrailingNewlines $ concat contents
lhsCodeBlock :: GenParser Char ParserState Block
lhsCodeBlock = do
failUnlessLHS
liftM (CodeBlock ("",["sourceCode","literate","haskell"],[]))
(lhsCodeBlockBird <|> lhsCodeBlockLaTeX)
<|> liftM (CodeBlock ("",["sourceCode","haskell"],[]))
lhsCodeBlockInverseBird
lhsCodeBlockLaTeX :: GenParser Char ParserState String
lhsCodeBlockLaTeX = try $ do
string "\\begin{code}"
manyTill spaceChar newline
contents <- many1Till anyChar (try $ string "\\end{code}")
blanklines
return $ stripTrailingNewlines contents
lhsCodeBlockBird :: GenParser Char ParserState String
lhsCodeBlockBird = lhsCodeBlockBirdWith '>'
lhsCodeBlockInverseBird :: GenParser Char ParserState String
lhsCodeBlockInverseBird = lhsCodeBlockBirdWith '<'
lhsCodeBlockBirdWith :: Char -> GenParser Char ParserState String
lhsCodeBlockBirdWith c = try $ do
pos <- getPosition
when (sourceColumn pos /= 1) $ fail "Not in first column"
lns <- many1 $ birdTrackLine c
-- if (as is normal) there is always a space after >, drop it
let lns' = if all (\ln -> null ln || take 1 ln == " ") lns
then map (drop 1) lns
else lns
blanklines
return $ intercalate "\n" lns'
birdTrackLine :: Char -> GenParser Char st [Char]
birdTrackLine c = do
char c
manyTill anyChar newline
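-- An illustrative bird-track literate Haskell block, as handled by
-- 'lhsCodeBlockBird' when the input format enables literate Haskell
-- (see failUnlessLHS):
--
-- > > fact :: Integer -> Integer
-- > > fact 0 = 1
-- > > fact n = n * fact (n - 1)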
--
-- block quotes
--
emailBlockQuoteStart :: GenParser Char ParserState Char
emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' >>~ optional (char ' ')
emailBlockQuote :: GenParser Char ParserState [[Char]]
emailBlockQuote = try $ do
emailBlockQuoteStart
raw <- sepBy (many (nonEndline <|>
(try (endline >> notFollowedBy emailBlockQuoteStart >>
return '\n'))))
(try (newline >> emailBlockQuoteStart))
newline <|> (eof >> return '\n')
optional blanklines
return raw
blockQuote :: GenParser Char ParserState Block
blockQuote = do
raw <- emailBlockQuote
-- parse the extracted block, which may contain various block elements:
contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n"
return $ BlockQuote contents
--
-- list blocks
--
bulletListStart :: GenParser Char ParserState ()
bulletListStart = try $ do
optional newline -- if preceded by a Plain block in a list context
skipNonindentSpaces
notFollowedBy' hrule -- because hrules start out just like lists
satisfy isBulletListMarker
spaceChar
skipSpaces
anyOrderedListStart :: GenParser Char ParserState (Int, ListNumberStyle, ListNumberDelim)
anyOrderedListStart = try $ do
optional newline -- if preceded by a Plain block in a list context
skipNonindentSpaces
notFollowedBy $ string "p." >> spaceChar >> digit -- page number
state <- getState
if stateStrict state
then do many1 digit
char '.'
spaceChar
return (1, DefaultStyle, DefaultDelim)
else do (num, style, delim) <- anyOrderedListMarker
-- if it could be an abbreviated first name, insist on more than one space
if delim == Period && (style == UpperAlpha || (style == UpperRoman &&
num `elem` [1, 5, 10, 50, 100, 500, 1000]))
then char '\t' <|> (try $ char ' ' >> spaceChar)
else spaceChar
skipSpaces
return (num, style, delim)
listStart :: GenParser Char ParserState ()
listStart = bulletListStart <|> (anyOrderedListStart >> return ())
-- parse a line of a list item (not a new list start or blank line)
listLine :: GenParser Char ParserState [Char]
listLine = try $ do
notFollowedBy' listStart
notFollowedBy blankline
notFollowedBy' (do indentSpaces
many (spaceChar)
listStart)
chunks <- manyTill (liftM snd (htmlTag isCommentTag) <|> count 1 anyChar) newline
return $ concat chunks ++ "\n"
-- parse raw text for one list item, excluding start marker and continuations
rawListItem :: GenParser Char ParserState [Char]
rawListItem = try $ do
listStart
result <- many1 listLine
blanks <- many blankline
return $ concat result ++ blanks
-- continuation of a list item - indented and separated by blankline
-- or (in compact lists) endline.
-- note: nested lists are parsed as continuations
listContinuation :: GenParser Char ParserState [Char]
listContinuation = try $ do
lookAhead indentSpaces
result <- many1 listContinuationLine
blanks <- many blankline
return $ concat result ++ blanks
listContinuationLine :: GenParser Char ParserState [Char]
listContinuationLine = try $ do
notFollowedBy blankline
notFollowedBy' listStart
optional indentSpaces
result <- manyTill anyChar newline
return $ result ++ "\n"
listItem :: GenParser Char ParserState [Block]
listItem = try $ do
first <- rawListItem
continuations <- many listContinuation
-- parsing with ListItemState forces markers at beginning of lines to
-- count as list item markers, even if not separated by blank space.
-- see definition of "endline"
state <- getState
let oldContext = stateParserContext state
setState $ state {stateParserContext = ListItemState}
-- parse the extracted block, which may contain various block elements:
let raw = concat (first:continuations)
contents <- parseFromString parseBlocks raw
updateState (\st -> st {stateParserContext = oldContext})
return contents
orderedList :: GenParser Char ParserState Block
orderedList = try $ do
(start, style, delim) <- lookAhead anyOrderedListStart
items <- many1 listItem
return $ OrderedList (start, style, delim) $ compactify items
bulletList :: GenParser Char ParserState Block
bulletList = try $ do
lookAhead bulletListStart
many1 listItem >>= return . BulletList . compactify
-- definition lists
defListMarker :: GenParser Char ParserState ()
defListMarker = do
sps <- nonindentSpaces
char ':' <|> char '~'
st <- getState
let tabStop = stateTabStop st
let remaining = tabStop - (length sps + 1)
if remaining > 0
then count remaining (char ' ') <|> string "\t"
else pzero
return ()
definitionListItem :: GenParser Char ParserState ([Inline], [[Block]])
definitionListItem = try $ do
-- first, see if this has any chance of being a definition list:
lookAhead (anyLine >> optional blankline >> defListMarker)
term <- manyTill inline newline
optional blankline
raw <- many1 defRawBlock
state <- getState
let oldContext = stateParserContext state
-- parse the extracted block, which may contain various block elements:
contents <- mapM (parseFromString parseBlocks) raw
updateState (\st -> st {stateParserContext = oldContext})
return ((normalizeSpaces term), contents)
defRawBlock :: GenParser Char ParserState [Char]
defRawBlock = try $ do
defListMarker
firstline <- anyLine
rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine)
trailing <- option "" blanklines
cont <- liftM concat $ many $ do
lns <- many1 $ notFollowedBy blankline >> indentSpaces >> anyLine
trl <- option "" blanklines
return $ unlines lns ++ trl
return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont
definitionList :: GenParser Char ParserState Block
definitionList = do
items <- many1 definitionListItem
-- "compactify" the definition list:
let defs = map snd items
let defBlocks = reverse $ concat $ concat defs
let isPara (Para _) = True
isPara _ = False
let items' = case take 1 defBlocks of
[Para x] -> if not $ any isPara (drop 1 defBlocks)
then let (t,ds) = last items
lastDef = last ds
ds' = init ds ++
[init lastDef ++ [Plain x]]
in init items ++ [(t, ds')]
else items
_ -> items
return $ DefinitionList items'
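-- An illustrative definition list, assuming the default tab stop of 4:
--
-- > Term 1
-- >
-- > :   Definition of term 1.
-- >
-- >     A second paragraph of the definition, indented to the tab stop.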
--
-- paragraph block
--
isHtmlOrBlank :: Inline -> Bool
isHtmlOrBlank (HtmlInline _) = True
isHtmlOrBlank (Space) = True
isHtmlOrBlank (LineBreak) = True
isHtmlOrBlank _ = False
para :: GenParser Char ParserState Block
para = try $ do
result <- liftM normalizeSpaces $ many1 inline
guard $ not . all isHtmlOrBlank $ result
option (Plain result) $ try $ do
newline
blanklines <|>
(getState >>= guard . stateStrict >>
lookAhead (blockQuote <|> header) >> return "")
return $ Para result
plain :: GenParser Char ParserState Block
plain = many1 inline >>~ spaces >>= return . Plain . normalizeSpaces
--
-- raw html
--
htmlElement :: GenParser Char ParserState [Char]
htmlElement = strictHtmlBlock <|> liftM snd (htmlTag isBlockTag)
htmlBlock :: GenParser Char ParserState Block
htmlBlock = try $ do
failUnlessBeginningOfLine
first <- htmlElement
finalSpace <- many spaceChar
finalNewlines <- many newline
return $ RawHtml $ first ++ finalSpace ++ finalNewlines
strictHtmlBlock :: GenParser Char ParserState [Char]
strictHtmlBlock = do
failUnlessBeginningOfLine
htmlInBalanced (not . isInlineTag)
rawVerbatimBlock :: GenParser Char ParserState String
rawVerbatimBlock = try $ do
(TagOpen tag _, open) <- htmlTag (tagOpen (\t ->
t == "pre" || t == "style" || t == "script")
(const True))
contents <- manyTill anyChar (htmlTag (~== TagClose tag))
return $ open ++ contents ++ renderTags [TagClose tag]
rawTeXBlock :: GenParser Char ParserState Block
rawTeXBlock = do
failIfStrict
result <- rawLaTeXEnvironment' <|> rawConTeXtEnvironment'
spaces
return $ Para [TeX result]
rawHtmlBlocks :: GenParser Char ParserState Block
rawHtmlBlocks = do
htmlBlocks <- many1 $ do blk <- rawVerbatimBlock <|>
liftM snd (htmlTag isBlockTag)
sps <- do sp1 <- many spaceChar
sp2 <- option "" (blankline >> return "\n")
sp3 <- many spaceChar
sp4 <- option "" blanklines
return $ sp1 ++ sp2 ++ sp3 ++ sp4
-- note: we want raw html to be able to
-- precede a code block, when separated
-- by a blank line
return $ blk ++ sps
let combined = concat htmlBlocks
let combined' = if last combined == '\n' then init combined else combined
return $ RawHtml combined'
--
-- Tables
--
-- Parse a dashed line with optional trailing spaces; return its length
-- and the length including trailing space.
dashedLine :: Char
-> GenParser Char st (Int, Int)
dashedLine ch = do
dashes <- many1 (char ch)
sp <- many spaceChar
return $ (length dashes, length $ dashes ++ sp)
-- Parse a table header with dashed lines of '-' preceded by
-- one (or zero) line of text.
simpleTableHeader :: Bool -- ^ Headerless table
-> GenParser Char ParserState ([[Block]], [Alignment], [Int])
simpleTableHeader headless = try $ do
rawContent <- if headless
then return ""
else anyLine
initSp <- nonindentSpaces
dashes <- many1 (dashedLine '-')
newline
let (lengths, lines') = unzip dashes
let indices = scanl (+) (length initSp) lines'
-- If no header, calculate alignment on basis of first row of text
rawHeads <- liftM (tail . splitByIndices (init indices)) $
if headless
then lookAhead anyLine
else return rawContent
let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths
let rawHeads' = if headless
then replicate (length dashes) ""
else rawHeads
heads <- mapM (parseFromString (many plain)) $
map removeLeadingTrailingSpace rawHeads'
return (heads, aligns, indices)
-- Parse a table footer - dashed lines followed by blank line.
tableFooter :: GenParser Char ParserState [Char]
tableFooter = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> blanklines
-- Parse a table separator - dashed line.
tableSep :: GenParser Char ParserState Char
tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> char '\n'
-- Parse a raw line and split it into chunks by indices.
rawTableLine :: [Int]
-> GenParser Char ParserState [String]
rawTableLine indices = do
notFollowedBy' (blanklines <|> tableFooter)
line <- many1Till anyChar newline
return $ map removeLeadingTrailingSpace $ tail $
splitByIndices (init indices) line
-- Parse a table line and return a list of lists of blocks (columns).
tableLine :: [Int]
-> GenParser Char ParserState [[Block]]
tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain))
-- Parse a multiline table row and return a list of blocks (columns).
multilineRow :: [Int]
-> GenParser Char ParserState [[Block]]
multilineRow indices = do
colLines <- many1 (rawTableLine indices)
let cols = map unlines $ transpose colLines
mapM (parseFromString (many plain)) cols
-- Parses a table caption: inlines beginning with ':' or 'Table:'
-- and followed by blank lines.
tableCaption :: GenParser Char ParserState [Inline]
tableCaption = try $ do
skipNonindentSpaces
string ":" <|> string "Table:"
result <- many1 inline
blanklines
return $ normalizeSpaces result
-- Parse a simple table with '---' header and one line per row.
simpleTable :: Bool -- ^ Headerless table
-> GenParser Char ParserState Block
simpleTable headless = do
Table c a _w h l <- tableWith (simpleTableHeader headless) tableLine
(return ())
(if headless then tableFooter else tableFooter <|> blanklines)
tableCaption
-- Simple tables get 0s for relative column widths (i.e., use default)
return $ Table c a (replicate (length a) 0) h l
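-- An illustrative simple table; alignments are inferred from where the
-- header text sits relative to the dashed line beneath it:
--
-- >   Right  Left     Center   Default
-- > -------  ------ ---------- -------
-- >      12  12         12       12
-- >     123  123       123      123
-- >
-- > Table: Demonstration of a simple table.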
-- Parse a multiline table: starts with row of '-' on top, then header
-- (which may be multiline), then the rows,
-- which may be multiline, separated by blank lines, and
-- ending with a footer (dashed line followed by blank line).
multilineTable :: Bool -- ^ Headerless table
-> GenParser Char ParserState Block
multilineTable headless =
tableWith (multilineTableHeader headless) multilineRow blanklines tableFooter tableCaption
multilineTableHeader :: Bool -- ^ Headerless table
-> GenParser Char ParserState ([[Block]], [Alignment], [Int])
multilineTableHeader headless = try $ do
if headless
then return '\n'
else tableSep
rawContent <- if headless
then return $ repeat ""
else many1
(notFollowedBy tableSep >> many1Till anyChar newline)
initSp <- nonindentSpaces
dashes <- many1 (dashedLine '-')
newline
let (lengths, lines') = unzip dashes
let indices = scanl (+) (length initSp) lines'
rawHeadsList <- if headless
then liftM (map (:[]) . tail .
splitByIndices (init indices)) $ lookAhead anyLine
else return $ transpose $ map
(\ln -> tail $ splitByIndices (init indices) ln)
rawContent
let aligns = zipWith alignType rawHeadsList lengths
let rawHeads = if headless
then replicate (length dashes) ""
else map (intercalate " ") rawHeadsList
heads <- mapM (parseFromString (many plain)) $
map removeLeadingTrailingSpace rawHeads
return (heads, aligns, indices)
-- Returns an alignment type for a table, based on a list of strings
-- (the rows of the column header) and a number (the length of the
-- dashed line under the rows).
alignType :: [String]
-> Int
-> Alignment
alignType [] _ = AlignDefault
alignType strLst len =
let s = head $ sortBy (comparing length) $
map removeTrailingSpace strLst
leftSpace = if null s then False else (s !! 0) `elem` " \t"
rightSpace = length s < len || (s !! (len - 1)) `elem` " \t"
in case (leftSpace, rightSpace) of
(True, False) -> AlignRight
(False, True) -> AlignLeft
(True, True) -> AlignCenter
(False, False) -> AlignDefault
gridTable :: Bool -- ^ Headerless table
-> GenParser Char ParserState Block
gridTable = gridTableWith block tableCaption
table :: GenParser Char ParserState Block
table = multilineTable False <|> simpleTable True <|>
simpleTable False <|> multilineTable True <|>
gridTable False <|> gridTable True <?> "table"
--
-- inline
--
inline :: GenParser Char ParserState Inline
inline = choice inlineParsers <?> "inline"
inlineParsers :: [GenParser Char ParserState Inline]
inlineParsers = [ str
, whitespace
, endline
, code
, (fourOrMore '*' <|> fourOrMore '_')
, strong
, emph
, note
, link
, cite
, image
, math
, strikeout
, superscript
, subscript
, inlineNote -- after superscript because of ^[link](/foo)^
, autoLink
, rawHtmlInline
, rawLaTeXInline'
, escapedChar
, exampleRef
, smartPunctuation inline
, charRef
, symbol
, ltSign ]
inlineNonLink :: GenParser Char ParserState Inline
inlineNonLink = (choice $
map (\parser -> try (parser >>= failIfLink)) inlineParsers)
<?> "inline (non-link)"
failIfLink :: Inline -> GenParser tok st Inline
failIfLink (Link _ _) = pzero
failIfLink elt = return elt
escapedChar :: GenParser Char ParserState Inline
escapedChar = do
char '\\'
state <- getState
result <- option '\\' $ if stateStrict state
then oneOf "\\`*_{}[]()>#+-.!~"
else satisfy (not . isAlphaNum)
return $ case result of
' ' -> Str "\160" -- "\ " is a nonbreaking space
'\n' -> LineBreak -- "\[newline]" is a linebreak
_ -> Str [result]
ltSign :: GenParser Char ParserState Inline
ltSign = do
st <- getState
if stateStrict st
then char '<'
else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html
return $ Str ['<']
exampleRef :: GenParser Char ParserState Inline
exampleRef = try $ do
char '@'
lab <- many1 (alphaNum <|> oneOf "-_")
-- We just return a Str. These are replaced with numbers
-- later. See the end of parseMarkdown.
return $ Str $ '@' : lab
symbol :: GenParser Char ParserState Inline
symbol = do
result <- noneOf "<\n\t "
return $ Str [result]
-- Parses inline code, delimited on each side by a run of n backticks.
code :: GenParser Char ParserState Inline
code = try $ do
starts <- many1 (char '`')
skipSpaces
result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|>
(char '\n' >> notFollowedBy' blankline >> return " "))
(try (skipSpaces >> count (length starts) (char '`') >>
notFollowedBy (char '`')))
return $ Code $ removeLeadingTrailingSpace $ concat result
mathWord :: GenParser Char st [Char]
mathWord = liftM concat $ many1 mathChunk
mathChunk :: GenParser Char st [Char]
mathChunk = do char '\\'
c <- anyChar
return ['\\',c]
<|> many1 (satisfy $ \c -> not (isBlank c || c == '\\' || c == '$'))
math :: GenParser Char ParserState Inline
math = (mathDisplay >>= applyMacros' >>= return . Math DisplayMath)
<|> (mathInline >>= applyMacros' >>= return . Math InlineMath)
mathDisplay :: GenParser Char ParserState String
mathDisplay = try $ do
failIfStrict
string "$$"
many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string "$$")
mathInline :: GenParser Char ParserState String
mathInline = try $ do
failIfStrict
char '$'
notFollowedBy space
words' <- sepBy1 mathWord (many1 (spaceChar <|> (newline >>~ notFollowedBy' blankline)))
char '$'
notFollowedBy digit
return $ intercalate " " words'
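-- Illustrative TeX math recognized by these parsers (non-strict mode only):
--
-- > inline:  $e^{i\pi} + 1 = 0$
-- > display: $$\int_0^1 x^2 \, dx = \frac{1}{3}$$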
-- to avoid performance problems, treat 4 or more _ or * in a row as a literal
-- rather than attempting to parse for emph/strong
fourOrMore :: Char -> GenParser Char st Inline
fourOrMore c = try $ count 4 (char c) >> many (char c) >>= \s ->
return (Str $ replicate 4 c ++ s)
emph :: GenParser Char ParserState Inline
emph = ((enclosed (char '*') (notFollowedBy' strong >> char '*') inline) <|>
(enclosed (char '_') (notFollowedBy' strong >> char '_' >>
notFollowedBy alphaNum) inline)) >>=
return . Emph . normalizeSpaces
strong :: GenParser Char ParserState Inline
strong = ((enclosed (string "**") (try $ string "**") inline) <|>
(enclosed (string "__") (try $ string "__") inline)) >>=
return . Strong . normalizeSpaces
strikeout :: GenParser Char ParserState Inline
strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>=
return . Strikeout . normalizeSpaces
superscript :: GenParser Char ParserState Inline
superscript = failIfStrict >> enclosed (char '^') (char '^')
(notFollowedBy spaceChar >> inline) >>= -- may not contain Space
return . Superscript
subscript :: GenParser Char ParserState Inline
subscript = failIfStrict >> enclosed (char '~') (char '~')
(notFollowedBy spaceChar >> inline) >>= -- may not contain Space
return . Subscript
whitespace :: GenParser Char ParserState Inline
whitespace = spaceChar >>
( (spaceChar >> skipMany spaceChar >> option Space (endline >> return LineBreak))
<|> (skipMany spaceChar >> return Space) ) <?> "whitespace"
nonEndline :: GenParser Char st Char
nonEndline = satisfy (/='\n')
str :: GenParser Char ParserState Inline
str = do
a <- alphaNum
as <- many $ alphaNum <|> (try $ char '_' >>~ lookAhead alphaNum)
let result = a:as
state <- getState
let spacesToNbr = map (\c -> if c == ' ' then '\160' else c)
if stateSmart state
then case likelyAbbrev result of
[] -> return $ Str result
xs -> choice (map (\x ->
try (string x >> oneOf " \n" >>
lookAhead alphaNum >>
return (Str $ result ++ spacesToNbr x ++ "\160"))) xs)
<|> (return $ Str result)
else return $ Str result
-- | If the string matches the beginning of an abbreviation (before
-- the first period), return the strings that would finish the abbreviation.
likelyAbbrev :: String -> [String]
likelyAbbrev x =
let abbrevs = [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
"Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
"vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
"Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.",
"ch.", "sec." ]
abbrPairs = map (break (=='.')) abbrevs
in map snd $ filter (\(y,_) -> y == x) abbrPairs
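-- For example, likelyAbbrev "Mr" == ["."], likelyAbbrev "Ph" == [".D."],
-- and likelyAbbrev "Foo" == [].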
-- an endline character that can be treated as a space, not a structural break
endline :: GenParser Char ParserState Inline
endline = try $ do
newline
notFollowedBy blankline
st <- getState
when (stateStrict st) $ do
notFollowedBy emailBlockQuoteStart
notFollowedBy (char '#') -- atx header
-- parse potential list-starts differently if in a list:
when (stateParserContext st == ListItemState) $ do
notFollowedBy' bulletListStart
notFollowedBy' anyOrderedListStart
return Space
--
-- links
--
-- a reference label for a link
reference :: GenParser Char ParserState [Inline]
reference = do notFollowedBy' (string "[^") -- footnote reference
result <- inlinesInBalancedBrackets inlineNonLink
return $ normalizeSpaces result
-- source for a link, with optional title
source :: GenParser Char st (String, [Char])
source =
(try $ charsInBalanced '(' ')' >>= parseFromString source') <|>
-- the following is needed for cases like: [ref](/url(a).
(enclosed (char '(') (char ')') anyChar >>=
parseFromString source')
-- auxiliary function for source
source' :: GenParser Char st (String, [Char])
source' = do
skipSpaces
let nl = char '\n' >>~ notFollowedBy blankline
let sourceURL = liftM unwords $ many $ try $ do
notFollowedBy' linkTitle
skipMany spaceChar
optional nl
skipMany spaceChar
many1 (satisfy $ not . isBlank)
let betweenAngles = try $ char '<' >>
manyTill (noneOf ">\n" <|> nl) (char '>')
src <- try betweenAngles <|> sourceURL
tit <- option "" linkTitle
skipSpaces
eof
return (escapeURI $ removeTrailingSpace src, tit)
linkTitle :: GenParser Char st String
linkTitle = try $ do
(many1 spaceChar >> option '\n' newline) <|> newline
skipSpaces
delim <- oneOf "'\""
tit <- manyTill (optional (char '\\') >> anyChar)
(try (char delim >> skipSpaces >> eof))
return $ decodeCharacterReferences tit
link :: GenParser Char ParserState Inline
link = try $ do
lab <- reference
(src, tit) <- source <|> referenceLink lab
return $ Link lab (src, tit)
-- a link like [this][ref] or [this][] or [this]
referenceLink :: [Inline]
-> GenParser Char ParserState (String, [Char])
referenceLink lab = do
ref <- option [] (try (optional (char ' ') >>
optional (newline >> skipSpaces) >> reference))
let ref' = if null ref then lab else ref
state <- getState
case lookupKeySrc (stateKeys state) (toKey ref') of
Nothing -> fail "no corresponding key"
Just target -> return target
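-- Illustrative link forms handled by 'link', 'source', and 'referenceLink'
-- (the last two require a matching reference key):
--
-- > [inline link](http://example.com/ "with a title")
-- > [reference link][mykey]
-- > [implicit reference]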
autoLink :: GenParser Char ParserState Inline
autoLink = try $ do
char '<'
(orig, src) <- uri <|> emailAddress
char '>'
st <- getState
return $ if stateStrict st
then Link [Str orig] (src, "")
else Link [Code orig] (src, "")
image :: GenParser Char ParserState Inline
image = try $ do
char '!'
(Link lab src) <- link
return $ Image lab src
note :: GenParser Char ParserState Inline
note = try $ do
failIfStrict
ref <- noteMarker
state <- getState
let notes = stateNotes state
case lookup ref notes of
Nothing -> fail "note not found"
Just raw -> liftM Note $ parseFromString parseBlocks raw
inlineNote :: GenParser Char ParserState Inline
inlineNote = try $ do
failIfStrict
char '^'
contents <- inlinesInBalancedBrackets inline
return $ Note [Para contents]
rawLaTeXInline' :: GenParser Char ParserState Inline
rawLaTeXInline' = do
failIfStrict
(rawConTeXtEnvironment' >>= return . TeX)
<|> (rawLaTeXEnvironment' >>= return . TeX)
<|> rawLaTeXInline
rawConTeXtEnvironment' :: GenParser Char st String
rawConTeXtEnvironment' = try $ do
string "\\start"
completion <- inBrackets (letter <|> digit <|> spaceChar)
<|> (many1 letter)
contents <- manyTill (rawConTeXtEnvironment' <|> (count 1 anyChar))
(try $ string "\\stop" >> string completion)
return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion
inBrackets :: (GenParser Char st Char) -> GenParser Char st String
inBrackets parser = do
char '['
contents <- many parser
char ']'
return $ "[" ++ contents ++ "]"
rawHtmlInline :: GenParser Char ParserState Inline
rawHtmlInline = do
st <- getState
(_,result) <- if stateStrict st
then htmlTag (not . isTextTag)
else htmlTag isInlineTag
return $ HtmlInline result
-- Citations
cite :: GenParser Char ParserState Inline
cite = do
failIfStrict
citations <- textualCite <|> normalCite
return $ Cite citations []
spnl :: GenParser Char st ()
spnl = try $ do
skipSpaces
optional newline
skipSpaces
notFollowedBy (char '\n')
textualCite :: GenParser Char ParserState [Citation]
textualCite = try $ do
(_, key) <- citeKey
let first = Citation{ citationId = key
, citationPrefix = []
, citationSuffix = []
, citationMode = AuthorInText
, citationNoteNum = 0
, citationHash = 0
}
rest <- option [] $ try $ spnl >> normalCite
if null rest
then option [first] $ bareloc first
else return $ first : rest
bareloc :: Citation -> GenParser Char ParserState [Citation]
bareloc c = try $ do
spnl
char '['
suff <- suffix
rest <- option [] $ try $ char ';' >> citeList
spnl
char ']'
return $ c{ citationSuffix = suff } : rest
normalCite :: GenParser Char ParserState [Citation]
normalCite = try $ do
char '['
spnl
citations <- citeList
spnl
char ']'
return citations
citeKey :: GenParser Char ParserState (Bool, String)
citeKey = try $ do
suppress_author <- option False (char '-' >> return True)
char '@'
first <- letter
rest <- many $ (noneOf ",;]@ \t\n")
let key = first:rest
st <- getState
guard $ key `elem` stateCitations st
return (suppress_author, key)
suffix :: GenParser Char ParserState [Inline]
suffix = try $ do
spnl
liftM normalizeSpaces $ many $ notFollowedBy (oneOf ";]") >> inline
prefix :: GenParser Char ParserState [Inline]
prefix = liftM normalizeSpaces $
manyTill inline (char ']' <|> liftM (const ']') (lookAhead citeKey))
citeList :: GenParser Char ParserState [Citation]
citeList = sepBy1 citation (try $ char ';' >> spnl)
citation :: GenParser Char ParserState Citation
citation = try $ do
pref <- prefix
(suppress_author, key) <- citeKey
suff <- suffix
return $ Citation{ citationId = key
, citationPrefix = pref
, citationSuffix = suff
, citationMode = if suppress_author
then SuppressAuthor
else NormalCitation
, citationNoteNum = 0
, citationHash = 0
}
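-- Illustrative citations accepted by these parsers; each key must occur in
-- stateCitations (i.e. be known from the bibliography):
--
-- > [see @doe99, pp. 33-35; also @smith04, ch. 1]
-- > @doe99 [p. 33]
-- > [-@doe99]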