583 lines
17 KiB
Haskell
583 lines
17 KiB
Haskell
|
-- | Convert markdown to Pandoc document.
|
||
|
module Text.Pandoc.Readers.Markdown (
|
||
|
readMarkdown
|
||
|
) where
|
||
|
|
||
|
import Text.ParserCombinators.Pandoc
|
||
|
import Text.Pandoc.Definition
|
||
|
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
|
||
|
import Text.Pandoc.Shared
|
||
|
import Text.Pandoc.Readers.HTML ( rawHtmlInline, rawHtmlBlock, anyHtmlBlockTag,
|
||
|
anyHtmlInlineTag )
|
||
|
import Text.Pandoc.HtmlEntities ( decodeEntities )
|
||
|
import Text.Regex ( matchRegex, mkRegex )
|
||
|
import Text.ParserCombinators.Parsec
|
||
|
|
||
|
-- | Read markdown from an input string and return a Pandoc document.
|
||
|
readMarkdown :: ParserState -> String -> Pandoc
|
||
|
readMarkdown = readWith parseMarkdown
|
||
|
|
||
|
-- | Parse markdown string with default options and print result (for testing).
|
||
|
testString :: String -> IO ()
|
||
|
testString = testStringWith parseMarkdown
|
||
|
|
||
|
--
|
||
|
-- Constants and data structure definitions
|
||
|
--
|
||
|
|
||
|
spaceChars = " \t"
|
||
|
endLineChars = "\n"
|
||
|
labelStart = '['
|
||
|
labelEnd = ']'
|
||
|
labelSep = ':'
|
||
|
srcStart = '('
|
||
|
srcEnd = ')'
|
||
|
imageStart = '!'
|
||
|
noteStart = '^'
|
||
|
codeStart = '`'
|
||
|
codeEnd = '`'
|
||
|
emphStart = '*'
|
||
|
emphEnd = '*'
|
||
|
emphStartAlt = '_'
|
||
|
emphEndAlt = '_'
|
||
|
autoLinkStart = '<'
|
||
|
autoLinkEnd = '>'
|
||
|
mathStart = '$'
|
||
|
mathEnd = '$'
|
||
|
bulletListMarkers = "*+-"
|
||
|
orderedListDelimiters = "."
|
||
|
escapeChar = '\\'
|
||
|
hruleChars = "*-_"
|
||
|
quoteChars = "'\""
|
||
|
atxHChar = '#'
|
||
|
titleOpeners = "\"'("
|
||
|
setextHChars = ['=','-']
|
||
|
blockQuoteChar = '>'
|
||
|
hyphenChar = '-'
|
||
|
|
||
|
-- treat these as potentially non-text when parsing inline:
|
||
|
specialChars = [escapeChar, labelStart, labelEnd, emphStart, emphEnd, emphStartAlt,
|
||
|
emphEndAlt, codeStart, codeEnd, autoLinkEnd, autoLinkStart, mathStart,
|
||
|
mathEnd, imageStart, noteStart, hyphenChar]
|
||
|
|
||
|
--
|
||
|
-- auxiliary functions
|
||
|
--
|
||
|
|
||
|
-- | Skip a single endline if there is one.
|
||
|
skipEndline = option Space endline
|
||
|
|
||
|
indentSpaces = do
|
||
|
state <- getState
|
||
|
let tabStop = stateTabStop state
|
||
|
oneOfStrings [ "\t", (replicate tabStop ' ') ] <?> "indentation"
|
||
|
|
||
|
skipNonindentSpaces = do
|
||
|
state <- getState
|
||
|
let tabStop = stateTabStop state
|
||
|
choice (map (\n -> (try (count n (char ' ')))) (reverse [0..(tabStop - 1)]))
|
||
|
|
||
|
--
|
||
|
-- document structure
|
||
|
--
|
||
|
|
||
|
titleLine = try (do
|
||
|
char '%'
|
||
|
skipSpaces
|
||
|
line <- manyTill inline newline
|
||
|
return line)
|
||
|
|
||
|
authorsLine = try (do
|
||
|
char '%'
|
||
|
skipSpaces
|
||
|
authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
|
||
|
newline
|
||
|
return (map removeLeadingTrailingSpace authors))
|
||
|
|
||
|
dateLine = try (do
|
||
|
char '%'
|
||
|
skipSpaces
|
||
|
date <- many (noneOf "\n")
|
||
|
newline
|
||
|
return (removeTrailingSpace date))
|
||
|
|
||
|
titleBlock = try (do
|
||
|
title <- option [] titleLine
|
||
|
author <- option [] authorsLine
|
||
|
date <- option "" dateLine
|
||
|
option "" blanklines
|
||
|
return (title, author, date))
|
||
|
|
||
|
parseMarkdown = do
|
||
|
updateState (\state -> state { stateParseRaw = True }) -- need to parse raw HTML
|
||
|
(title, author, date) <- option ([],[],"") titleBlock
|
||
|
blocks <- parseBlocks
|
||
|
state <- getState
|
||
|
let keys = reverse $ stateKeyBlocks state
|
||
|
return (Pandoc (Meta title author date) (blocks ++ keys))
|
||
|
|
||
|
--
|
||
|
-- parsing blocks
|
||
|
--
|
||
|
|
||
|
parseBlocks = do
|
||
|
result <- manyTill block eof
|
||
|
return result
|
||
|
|
||
|
block = choice [ codeBlock, referenceKey, note, header, hrule, list, blockQuote, rawHtmlBlocks,
|
||
|
rawLaTeXEnvironment, para, plain, blankBlock, nullBlock ] <?> "block"
|
||
|
|
||
|
--
|
||
|
-- header blocks
|
||
|
--
|
||
|
|
||
|
header = choice [ setextHeader, atxHeader ] <?> "header"
|
||
|
|
||
|
atxHeader = try (do
|
||
|
lead <- many1 (char atxHChar)
|
||
|
skipSpaces
|
||
|
txt <- many1 (do {notFollowedBy' atxClosing; inline})
|
||
|
atxClosing
|
||
|
return (Header (length lead) (normalizeSpaces txt)))
|
||
|
|
||
|
atxClosing = try (do
|
||
|
skipMany (char atxHChar)
|
||
|
skipSpaces
|
||
|
newline
|
||
|
option "" blanklines)
|
||
|
|
||
|
setextHeader = choice (map (\x -> setextH x) (enumFromTo 1 (length setextHChars)))
|
||
|
|
||
|
setextH n = try (do
|
||
|
txt <- many1 (do {notFollowedBy newline; inline})
|
||
|
endline
|
||
|
many1 (char (setextHChars !! (n-1)))
|
||
|
skipSpaces
|
||
|
newline
|
||
|
option "" blanklines
|
||
|
return (Header n (normalizeSpaces txt)))
|
||
|
|
||
|
--
|
||
|
-- hrule block
|
||
|
--
|
||
|
|
||
|
hruleWith chr =
|
||
|
try (do
|
||
|
skipSpaces
|
||
|
char chr
|
||
|
skipSpaces
|
||
|
char chr
|
||
|
skipSpaces
|
||
|
char chr
|
||
|
skipMany (oneOf (chr:spaceChars))
|
||
|
newline
|
||
|
option "" blanklines
|
||
|
return HorizontalRule)
|
||
|
|
||
|
hrule = choice (map hruleWith hruleChars) <?> "hrule"
|
||
|
|
||
|
--
|
||
|
-- code blocks
|
||
|
--
|
||
|
|
||
|
indentedLine = try (do
|
||
|
indentSpaces
|
||
|
result <- manyTill anyChar newline
|
||
|
return (result ++ "\n"))
|
||
|
|
||
|
-- two or more indented lines, possibly separated by blank lines
|
||
|
indentedBlock = try (do
|
||
|
res1 <- indentedLine
|
||
|
blanks <- many blankline
|
||
|
res2 <- choice [indentedBlock, indentedLine]
|
||
|
return (res1 ++ blanks ++ res2))
|
||
|
|
||
|
codeBlock = do
|
||
|
result <- choice [indentedBlock, indentedLine]
|
||
|
option "" blanklines
|
||
|
return (CodeBlock result)
|
||
|
|
||
|
--
|
||
|
-- note block
|
||
|
--
|
||
|
|
||
|
note = try (do
|
||
|
(NoteRef ref) <- noteRef
|
||
|
skipSpaces
|
||
|
raw <- sepBy (many (choice [nonEndline,
|
||
|
(try (do {endline; notFollowedBy (char noteStart); return '\n'}))
|
||
|
])) (try (do {newline; char noteStart; option ' ' (char ' ')}))
|
||
|
newline
|
||
|
blanklines
|
||
|
-- parse the extracted block, which may contain various block elements:
|
||
|
state <- getState
|
||
|
let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of
|
||
|
Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err
|
||
|
Right result -> result
|
||
|
return (Note ref parsed))
|
||
|
|
||
|
--
|
||
|
-- block quotes
|
||
|
--
|
||
|
|
||
|
emacsBoxQuote = try (do
|
||
|
string ",----"
|
||
|
manyTill anyChar newline
|
||
|
raw <- manyTill (try (do{ char '|';
|
||
|
option ' ' (char ' ');
|
||
|
result <- manyTill anyChar newline;
|
||
|
return result}))
|
||
|
(string "`----")
|
||
|
manyTill anyChar newline
|
||
|
option "" blanklines
|
||
|
return raw)
|
||
|
|
||
|
emailBlockQuoteStart = try (do
|
||
|
skipNonindentSpaces
|
||
|
char blockQuoteChar
|
||
|
option ' ' (char ' ')
|
||
|
return "> ")
|
||
|
|
||
|
emailBlockQuote = try (do
|
||
|
emailBlockQuoteStart
|
||
|
raw <- sepBy (many (choice [nonEndline,
|
||
|
(try (do{ endline;
|
||
|
notFollowedBy' emailBlockQuoteStart;
|
||
|
return '\n'}))]))
|
||
|
(try (do {newline; emailBlockQuoteStart}))
|
||
|
newline <|> (do{ eof; return '\n'})
|
||
|
option "" blanklines
|
||
|
return raw)
|
||
|
|
||
|
blockQuote = do
|
||
|
raw <- choice [ emailBlockQuote, emacsBoxQuote ]
|
||
|
-- parse the extracted block, which may contain various block elements:
|
||
|
state <- getState
|
||
|
let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of
|
||
|
Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err
|
||
|
Right result -> result
|
||
|
return (BlockQuote parsed)
|
||
|
|
||
|
--
|
||
|
-- list blocks
|
||
|
--
|
||
|
|
||
|
list = choice [ bulletList, orderedList ] <?> "list"
|
||
|
|
||
|
bulletListStart =
|
||
|
try (do
|
||
|
option ' ' newline -- if preceded by a Plain block in a list context
|
||
|
skipNonindentSpaces
|
||
|
notFollowedBy' hrule -- because hrules start out just like lists
|
||
|
oneOf bulletListMarkers
|
||
|
spaceChar
|
||
|
skipSpaces)
|
||
|
|
||
|
orderedListStart =
|
||
|
try (do
|
||
|
option ' ' newline -- if preceded by a Plain block in a list context
|
||
|
skipNonindentSpaces
|
||
|
many1 digit
|
||
|
oneOf orderedListDelimiters
|
||
|
oneOf spaceChars
|
||
|
skipSpaces)
|
||
|
|
||
|
-- parse a line of a list item (start = parser for beginning of list item)
|
||
|
listLine start = try (do
|
||
|
notFollowedBy' start
|
||
|
notFollowedBy blankline
|
||
|
notFollowedBy' (try (do{ indentSpaces;
|
||
|
many (spaceChar);
|
||
|
choice [bulletListStart, orderedListStart]}))
|
||
|
line <- manyTill anyChar newline
|
||
|
return (line ++ "\n"))
|
||
|
|
||
|
-- parse raw text for one list item, excluding start marker and continuations
|
||
|
rawListItem start =
|
||
|
try (do
|
||
|
start
|
||
|
result <- many1 (listLine start)
|
||
|
blanks <- many blankline
|
||
|
return ((concat result) ++ blanks))
|
||
|
|
||
|
-- continuation of a list item - indented and separated by blankline
|
||
|
-- or (in compact lists) endline.
|
||
|
-- note: nested lists are parsed as continuations
|
||
|
listContinuation start =
|
||
|
try (do
|
||
|
followedBy' indentSpaces
|
||
|
result <- many1 (listContinuationLine start)
|
||
|
blanks <- many blankline
|
||
|
return ((concat result) ++ blanks))
|
||
|
|
||
|
listContinuationLine start = try (do
|
||
|
notFollowedBy blankline
|
||
|
notFollowedBy' start
|
||
|
option "" indentSpaces
|
||
|
result <- manyTill anyChar newline
|
||
|
return (result ++ "\n"))
|
||
|
|
||
|
listItem start =
|
||
|
try (do
|
||
|
first <- rawListItem start
|
||
|
rest <- many (listContinuation start)
|
||
|
-- parsing with ListItemState forces markers at beginning of lines to
|
||
|
-- count as list item markers, even if not separated by blank space.
|
||
|
-- see definition of "endline"
|
||
|
state <- getState
|
||
|
let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState})
|
||
|
"block" raw of
|
||
|
Left err -> error $ "Raw block:\n" ++ raw ++ "\nError:\n" ++ show err
|
||
|
Right result -> result
|
||
|
where raw = concat (first:rest)
|
||
|
return parsed)
|
||
|
|
||
|
orderedList =
|
||
|
try (do
|
||
|
items <- many1 (listItem orderedListStart)
|
||
|
let items' = compactify items
|
||
|
return (OrderedList items'))
|
||
|
|
||
|
bulletList =
|
||
|
try (do
|
||
|
items <- many1 (listItem bulletListStart)
|
||
|
let items' = compactify items
|
||
|
return (BulletList items'))
|
||
|
|
||
|
--
|
||
|
-- paragraph block
|
||
|
--
|
||
|
|
||
|
para = try (do
|
||
|
result <- many1 inline
|
||
|
newline
|
||
|
choice [ (do{ followedBy' (oneOfStrings [">", ",----"]); return "" }), blanklines ]
|
||
|
let result' = normalizeSpaces result
|
||
|
return (Para result'))
|
||
|
|
||
|
plain = do
|
||
|
result <- many1 inline
|
||
|
let result' = normalizeSpaces result
|
||
|
return (Plain result')
|
||
|
|
||
|
--
|
||
|
-- raw html
|
||
|
--
|
||
|
|
||
|
rawHtmlBlocks = try (do
|
||
|
htmlBlocks <- many1 rawHtmlBlock
|
||
|
let combined = concatMap (\(RawHtml str) -> str) htmlBlocks
|
||
|
let combined' = if (last combined == '\n') then
|
||
|
init combined -- strip extra newline
|
||
|
else
|
||
|
combined
|
||
|
return (RawHtml combined'))
|
||
|
|
||
|
--
|
||
|
-- reference key
|
||
|
--
|
||
|
|
||
|
referenceKey =
|
||
|
try (do
|
||
|
skipSpaces
|
||
|
label <- reference
|
||
|
char labelSep
|
||
|
skipSpaces
|
||
|
option ' ' (char autoLinkStart)
|
||
|
src <- many (noneOf (titleOpeners ++ [autoLinkEnd] ++ endLineChars))
|
||
|
option ' ' (char autoLinkEnd)
|
||
|
tit <- option "" title
|
||
|
blanklines
|
||
|
return (Key label (Src (removeTrailingSpace src) tit)))
|
||
|
|
||
|
--
|
||
|
-- inline
|
||
|
--
|
||
|
|
||
|
text = choice [ math, strong, emph, code2, code1, str, linebreak, tabchar,
|
||
|
whitespace, endline ] <?> "text"
|
||
|
|
||
|
inline = choice [ rawLaTeXInline, escapedChar, special, hyphens, text, ltSign, symbol ] <?> "inline"
|
||
|
|
||
|
special = choice [ link, referenceLink, rawHtmlInline, autoLink,
|
||
|
image, noteRef ] <?> "link, inline html, note, or image"
|
||
|
|
||
|
escapedChar = escaped anyChar
|
||
|
|
||
|
ltSign = do
|
||
|
notFollowedBy' rawHtmlBlocks -- don't return < if it starts html
|
||
|
char '<'
|
||
|
return (Str ['<'])
|
||
|
|
||
|
specialCharsMinusLt = filter (/= '<') specialChars
|
||
|
|
||
|
symbol = do
|
||
|
result <- oneOf specialCharsMinusLt
|
||
|
return (Str [result])
|
||
|
|
||
|
hyphens = try (do
|
||
|
result <- many1 (char '-')
|
||
|
if (length result) == 1 then
|
||
|
skipEndline -- don't want to treat endline after hyphen as a space
|
||
|
else
|
||
|
do{ string ""; return Space }
|
||
|
return (Str result))
|
||
|
|
||
|
-- parses inline code, between codeStart and codeEnd
|
||
|
code1 =
|
||
|
try (do
|
||
|
char codeStart
|
||
|
result <- many (noneOf [codeEnd])
|
||
|
char codeEnd
|
||
|
let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines
|
||
|
return (Code result'))
|
||
|
|
||
|
-- parses inline code, between 2 codeStarts and 2 codeEnds
|
||
|
code2 =
|
||
|
try (do
|
||
|
string [codeStart, codeStart]
|
||
|
result <- manyTill anyChar (try (string [codeEnd, codeEnd]))
|
||
|
let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines
|
||
|
return (Code result'))
|
||
|
|
||
|
mathWord = many1 (choice [(noneOf (" \t\n\\" ++ [mathEnd])), (try (do {c <- char '\\'; notFollowedBy (char mathEnd); return c}))])
|
||
|
|
||
|
math = try (do
|
||
|
char mathStart
|
||
|
notFollowedBy space
|
||
|
words <- sepBy1 mathWord (many1 space)
|
||
|
char mathEnd
|
||
|
return (TeX ("$" ++ (joinWithSep " " words) ++ "$")))
|
||
|
|
||
|
emph = do
|
||
|
result <- choice [ (enclosed (char emphStart) (char emphEnd) inline),
|
||
|
(enclosed (char emphStartAlt) (char emphEndAlt) inline) ]
|
||
|
return (Emph (normalizeSpaces result))
|
||
|
|
||
|
strong = do
|
||
|
result <- choice [ (enclosed (count 2 (char emphStart)) (count 2 (char emphEnd)) inline),
|
||
|
(enclosed (count 2 (char emphStartAlt)) (count 2 (char emphEndAlt)) inline)]
|
||
|
return (Strong (normalizeSpaces result))
|
||
|
|
||
|
whitespace = do
|
||
|
many1 (oneOf spaceChars) <?> "whitespace"
|
||
|
return Space
|
||
|
|
||
|
tabchar = do
|
||
|
tab
|
||
|
return (Str "\t")
|
||
|
|
||
|
-- hard line break
|
||
|
linebreak = try (do
|
||
|
oneOf spaceChars
|
||
|
many1 (oneOf spaceChars)
|
||
|
endline
|
||
|
return LineBreak )
|
||
|
|
||
|
nonEndline = noneOf endLineChars
|
||
|
|
||
|
str = do
|
||
|
result <- many1 ((noneOf (specialChars ++ spaceChars ++ endLineChars)))
|
||
|
return (Str (decodeEntities result))
|
||
|
|
||
|
-- an endline character that can be treated as a space, not a structural break
|
||
|
endline =
|
||
|
try (do
|
||
|
newline
|
||
|
-- next line would allow block quotes without preceding blank line
|
||
|
-- Markdown.pl does allow this, but there's a chance of a wrapped
|
||
|
-- greater-than sign triggering a block quote by accident...
|
||
|
-- notFollowedBy (try (do { choice [emailBlockQuoteStart, string ",----"]; return ' ' }))
|
||
|
notFollowedBy blankline
|
||
|
-- parse potential list starts at beginning of line differently if in a list:
|
||
|
st <- getState
|
||
|
if (stateParserContext st) == ListItemState then
|
||
|
do
|
||
|
notFollowedBy' orderedListStart
|
||
|
notFollowedBy' bulletListStart
|
||
|
else
|
||
|
option () pzero
|
||
|
return Space)
|
||
|
|
||
|
--
|
||
|
-- links
|
||
|
--
|
||
|
|
||
|
-- a reference label for a link
|
||
|
reference = do
|
||
|
char labelStart
|
||
|
label <- manyTill inline (char labelEnd)
|
||
|
return (normalizeSpaces label)
|
||
|
|
||
|
-- source for a link, with optional title
|
||
|
source =
|
||
|
try (do
|
||
|
char srcStart
|
||
|
option ' ' (char autoLinkStart)
|
||
|
src <- many (noneOf ([srcEnd, autoLinkEnd] ++ titleOpeners))
|
||
|
option ' ' (char autoLinkEnd)
|
||
|
tit <- option "" title
|
||
|
skipSpaces
|
||
|
char srcEnd
|
||
|
return (Src (removeTrailingSpace src) tit))
|
||
|
|
||
|
titleWith startChar endChar =
|
||
|
try (do
|
||
|
skipSpaces
|
||
|
skipEndline -- a title can be on the next line from the source
|
||
|
skipSpaces
|
||
|
char startChar
|
||
|
tit <- manyTill (choice [ try (do {char '\\'; char endChar}),
|
||
|
(noneOf (endChar:endLineChars)) ]) (char endChar)
|
||
|
let tit' = gsub "\"" """ tit
|
||
|
return tit')
|
||
|
|
||
|
title = choice [titleWith '(' ')', titleWith '"' '"', titleWith '\'' '\''] <?> "title"
|
||
|
|
||
|
link = choice [explicitLink, referenceLink] <?> "link"
|
||
|
|
||
|
explicitLink =
|
||
|
try (do
|
||
|
label <- reference
|
||
|
src <- source
|
||
|
return (Link label src))
|
||
|
|
||
|
referenceLink = choice [referenceLinkDouble, referenceLinkSingle]
|
||
|
|
||
|
referenceLinkDouble = -- a link like [this][/url/]
|
||
|
try (do
|
||
|
label <- reference
|
||
|
skipSpaces
|
||
|
skipEndline
|
||
|
skipSpaces
|
||
|
ref <- reference
|
||
|
return (Link label (Ref ref)))
|
||
|
|
||
|
referenceLinkSingle = -- a link like [this]
|
||
|
try (do
|
||
|
label <- reference
|
||
|
return (Link label (Ref [])))
|
||
|
|
||
|
autoLink = -- a link <like.this.com>
|
||
|
try (do
|
||
|
notFollowedBy (do {anyHtmlBlockTag; return ' '})
|
||
|
src <- between (char autoLinkStart) (char autoLinkEnd)
|
||
|
(many (noneOf (spaceChars ++ endLineChars ++ [autoLinkEnd])))
|
||
|
case (matchRegex emailAddress src) of
|
||
|
Just _ -> return (Link [Str src] (Src ("mailto:" ++ src) ""))
|
||
|
Nothing -> return (Link [Str src] (Src src "")))
|
||
|
|
||
|
emailAddress = mkRegex "([^@:/]+)@(([^.]+[.]?)*([^.]+))" -- presupposes no whitespace
|
||
|
|
||
|
image =
|
||
|
try (do
|
||
|
char imageStart
|
||
|
(Link label src) <- link
|
||
|
return (Image label src))
|
||
|
|
||
|
noteRef = try (do
|
||
|
char noteStart
|
||
|
ref <- between (char '(') (char ')') (many1 (noneOf " \t\n)"))
|
||
|
return (NoteRef ref))
|
||
|
|