pandoc/src/Text/Pandoc/Readers/Markdown.hs
fiddlosopher f7b705b44c Implemented implicit reference-style links to section headers in markdown.
For example, if you have a header '# Supported architectures', you can
link to it with '[Supported architectures]'.  If there are multiple
headers with this label, the link will point to the first of them.
Implicit references are always overridden by explicitly specified references.
Addresses Issue #20.

+ Moved isPunctuation, uniqueIdentifiers, and inlineListToIdentifier from
  Text.Pandoc.Writers.HTML to Text.Pandoc.Shared.

+ Added stHeaders to ParserState.   This holds a list of header texts
  used in the document, and is used to construct implicit header references.

+ In Text.Pandoc.Readers.Markdown, added call to headerReference
  parser in initial parsing pass, constructing a list of section header
  labels. This is then passed to uniqueIdentifiers to produce
  identifiers, and a list of implicit references is constructed. This is
  added to the end of the explicitly specified references, so it will be
  overridden by explicitly specified references. All of this processing
  is skipped if --strict was specified.

+ Modified documentation in README.



git-svn-id: https://pandoc.googlecode.com/svn/trunk@1086 788f1e2b-df1e-0410-8736-df70ead52e1b
2007-11-22 17:14:21 +00:00

935 lines
29 KiB
Haskell

{-
Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.Markdown
Copyright : Copyright (C) 2006-7 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Conversion of markdown-formatted plain text to 'Pandoc' document.
-}
module Text.Pandoc.Readers.Markdown (
readMarkdown
) where
import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy )
import Data.Ord ( comparing )
import Data.Char ( isAlphaNum )
import Network.URI ( isURI )
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
anyHtmlInlineTag, anyHtmlTag,
anyHtmlEndTag, htmlEndTag, extractTagType,
htmlBlockElement )
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
import Text.ParserCombinators.Parsec
-- | Read markdown from an input string and return a Pandoc document.
readMarkdown :: ParserState -> String -> Pandoc
readMarkdown state str = (readWith parseMarkdown) state (str ++ "\n\n")
--
-- Constants and data structure definitions
--
spaceChars = " \t"
bulletListMarkers = "*+-"
hruleChars = "*-_"
setextHChars = "=-"
-- treat these as potentially non-text when parsing inline:
specialChars = "\\[]*_~`<>$!^-.&'\"\8216\8217\8220\8221"
--
-- auxiliary functions
--
indentSpaces = try $ do
state <- getState
let tabStop = stateTabStop state
try (count tabStop (char ' ')) <|>
(many (char ' ') >> string "\t") <?> "indentation"
nonindentSpaces = do
state <- getState
let tabStop = stateTabStop state
sps <- many (char ' ')
if length sps < tabStop
then return sps
else unexpected "indented line"
-- | Fail unless we're at beginning of a line.
failUnlessBeginningOfLine = do
pos <- getPosition
if sourceColumn pos == 1 then return () else fail "not beginning of line"
-- | Fail unless we're in "smart typography" mode.
failUnlessSmart = do
state <- getState
if stateSmart state then return () else fail "Smart typography feature"
-- | Parse an inline Str element with a given content.
inlineString str = try $ do
(Str res) <- inline
if res == str then return res else fail $ "unexpected Str content"
-- | Parse a sequence of inline elements between a string
-- @opener@ and a string @closer@, including inlines
-- between balanced pairs of @opener@ and a @closer@.
inlinesInBalanced :: String -> String -> GenParser Char ParserState [Inline]
inlinesInBalanced opener closer = try $ do
string opener
result <- manyTill ( (do lookAhead (inlineString opener)
-- because it might be a link...
bal <- inlinesInBalanced opener closer
return $ [Str opener] ++ bal ++ [Str closer])
<|> (count 1 inline))
(try (string closer))
return $ concat result
--
-- document structure
--
titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline
authorsLine = try $ do
char '%'
skipSpaces
authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
newline
return $ map (decodeCharacterReferences . removeLeadingTrailingSpace) authors
dateLine = try $ do
char '%'
skipSpaces
date <- many (noneOf "\n")
newline
return $ decodeCharacterReferences $ removeTrailingSpace date
titleBlock = try $ do
failIfStrict
title <- option [] titleLine
author <- option [] authorsLine
date <- option "" dateLine
optional blanklines
return (title, author, date)
parseMarkdown = do
-- markdown allows raw HTML
updateState (\state -> state { stateParseRaw = True })
startPos <- getPosition
-- go through once just to get list of reference keys
-- docMinusKeys is the raw document with blanks where the keys were...
docMinusKeys <- manyTill (referenceKey <|> headerReference <|>
lineClump) eof >>= return . concat
setInput docMinusKeys
setPosition startPos
st <- getState
-- get headers and construct implicit references unless strict
if stateStrict st
then return ()
else do let oldkeys = stateKeys st
let headers = reverse $ stateHeaders st
let idents = uniqueIdentifiers headers
let implicitRefs = zipWith (\hd ident -> (hd, ("#" ++ ident, "")))
headers idents
updateState $ \st -> st { stateKeys = oldkeys ++ implicitRefs }
-- go through again for notes unless strict...
if stateStrict st
then return ()
else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>=
return . concat
st <- getState
let reversedNotes = stateNotes st
updateState $ \st -> st { stateNotes = reverse reversedNotes }
setInput docMinusNotes
setPosition startPos
-- now parse it for real...
(title, author, date) <- option ([],[],"") titleBlock
blocks <- parseBlocks
return $ Pandoc (Meta title author date) $ filter (/= Null) blocks
--
-- initial pass for references and notes
--
referenceKey = try $ do
startPos <- getPosition
nonindentSpaces
label <- reference
char ':'
skipSpaces
optional (char '<')
src <- many (noneOf "> \n\t")
optional (char '>')
tit <- option "" referenceTitle
blanklines
endPos <- getPosition
let newkey = (label, (removeTrailingSpace src, tit))
st <- getState
let oldkeys = stateKeys st
updateState $ \st -> st { stateKeys = newkey : oldkeys }
-- return blanks so line count isn't affected
return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
headerReference = try $ do
failIfStrict
startPos <- getPosition
(Header level text) <- lookAhead $ atxHeader <|> setextHeader
st <- getState
let headers = stateHeaders st
updateState $ \st -> st { stateHeaders = text:headers }
endPos <- getPosition
lineClump -- return the raw header, because we need to parse it later
referenceTitle = try $ do
(many1 spaceChar >> option '\n' newline) <|> newline
skipSpaces
tit <- (charsInBalanced '(' ')' >>= return . unwords . words)
<|> do delim <- char '\'' <|> char '"'
manyTill anyChar (try (char delim >> skipSpaces >>
notFollowedBy (noneOf ")\n")))
return $ decodeCharacterReferences tit
noteMarker = string "[^" >> manyTill (noneOf " \t\n") (char ']')
rawLine = do
notFollowedBy blankline
notFollowedBy' noteMarker
contents <- many1 nonEndline
end <- option "" (newline >> optional indentSpaces >> return "\n")
return $ contents ++ end
rawLines = many1 rawLine >>= return . concat
noteBlock = try $ do
startPos <- getPosition
ref <- noteMarker
char ':'
optional blankline
optional indentSpaces
raw <- sepBy rawLines (try (blankline >> indentSpaces))
optional blanklines
endPos <- getPosition
-- parse the extracted text, which may contain various block elements:
contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n"
let newnote = (ref, contents)
st <- getState
let oldnotes = stateNotes st
updateState $ \st -> st { stateNotes = newnote : oldnotes }
-- return blanks so line count isn't affected
return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
--
-- parsing blocks
--
parseBlocks = manyTill block eof
block = choice [ header
, table
, codeBlock
, hrule
, list
, blockQuote
, htmlBlock
, rawLaTeXEnvironment'
, para
, plain
, nullBlock ] <?> "block"
--
-- header blocks
--
header = atxHeader <|> setextHeader <?> "header"
atxHeader = try $ do
level <- many1 (char '#') >>= return . length
notFollowedBy (char '.' <|> char ')') -- this would be a list
skipSpaces
text <- manyTill inline atxClosing >>= return . normalizeSpaces
return $ Header level text
atxClosing = try $ skipMany (char '#') >> blanklines
setextHeader = try $ do
-- first, see if this block has any chance of being a setextHeader:
lookAhead (anyLine >> oneOf setextHChars)
text <- many1Till inline newline >>= return . normalizeSpaces
level <- choice $ zipWith
(\ch lev -> try (many1 $ char ch) >> blanklines >> return lev)
setextHChars [1..(length setextHChars)]
return $ Header level text
--
-- hrule block
--
hrule = try $ do
skipSpaces
start <- oneOf hruleChars
count 2 (skipSpaces >> char start)
skipMany (skipSpaces >> char start)
newline
optional blanklines
return HorizontalRule
--
-- code blocks
--
indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n")
codeBlock = do
contents <- many1 (indentedLine <|>
try (do b <- blanklines
l <- indentedLine
return $ b ++ l))
optional blanklines
return $ CodeBlock $ stripTrailingNewlines $ concat contents
--
-- block quotes
--
emacsBoxQuote = try $ do
failIfStrict
string ",----"
manyTill anyChar newline
raw <- manyTill
(try (char '|' >> optional (char ' ') >> manyTill anyChar newline))
(try (string "`----"))
blanklines
return raw
emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ')
emailBlockQuote = try $ do
emailBlockQuoteStart
raw <- sepBy (many (nonEndline <|>
(try (endline >> notFollowedBy emailBlockQuoteStart >>
return '\n'))))
(try (newline >> emailBlockQuoteStart))
newline <|> (eof >> return '\n')
optional blanklines
return raw
blockQuote = do
raw <- emailBlockQuote <|> emacsBoxQuote
-- parse the extracted block, which may contain various block elements:
contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n"
return $ BlockQuote contents
--
-- list blocks
--
list = choice [ bulletList, orderedList, definitionList ] <?> "list"
bulletListStart = try $ do
optional newline -- if preceded by a Plain block in a list context
nonindentSpaces
notFollowedBy' hrule -- because hrules start out just like lists
oneOf bulletListMarkers
spaceChar
skipSpaces
anyOrderedListStart = try $ do
optional newline -- if preceded by a Plain block in a list context
nonindentSpaces
notFollowedBy $ string "p." >> spaceChar >> digit -- page number
state <- getState
if stateStrict state
then do many1 digit
char '.'
spaceChar
return (1, DefaultStyle, DefaultDelim)
else anyOrderedListMarker >>~ spaceChar
orderedListStart style delim = try $ do
optional newline -- if preceded by a Plain block in a list context
nonindentSpaces
state <- getState
num <- if stateStrict state
then do many1 digit
char '.'
return 1
else orderedListMarker style delim
if delim == Period && (style == UpperAlpha || (style == UpperRoman &&
num `elem` [1, 5, 10, 50, 100, 500, 1000]))
then char '\t' <|> (spaceChar >> spaceChar)
else spaceChar
skipSpaces
-- parse a line of a list item (start = parser for beginning of list item)
listLine start = try $ do
notFollowedBy' start
notFollowedBy blankline
notFollowedBy' (do indentSpaces
many (spaceChar)
bulletListStart <|> (anyOrderedListStart >> return ()))
line <- manyTill anyChar newline
return $ line ++ "\n"
-- parse raw text for one list item, excluding start marker and continuations
rawListItem start = try $ do
start
result <- many1 (listLine start)
blanks <- many blankline
return $ concat result ++ blanks
-- continuation of a list item - indented and separated by blankline
-- or (in compact lists) endline.
-- note: nested lists are parsed as continuations
listContinuation start = try $ do
lookAhead indentSpaces
result <- many1 (listContinuationLine start)
blanks <- many blankline
return $ concat result ++ blanks
listContinuationLine start = try $ do
notFollowedBy blankline
notFollowedBy' start
optional indentSpaces
result <- manyTill anyChar newline
return $ result ++ "\n"
listItem start = try $ do
first <- rawListItem start
continuations <- many (listContinuation start)
-- parsing with ListItemState forces markers at beginning of lines to
-- count as list item markers, even if not separated by blank space.
-- see definition of "endline"
state <- getState
let oldContext = stateParserContext state
setState $ state {stateParserContext = ListItemState}
-- parse the extracted block, which may contain various block elements:
let raw = concat (first:continuations)
contents <- parseFromString parseBlocks raw
updateState (\st -> st {stateParserContext = oldContext})
return contents
orderedList = try $ do
(start, style, delim) <- lookAhead anyOrderedListStart
items <- many1 (listItem (orderedListStart style delim))
return $ OrderedList (start, style, delim) $ compactify items
bulletList = many1 (listItem bulletListStart) >>=
return . BulletList . compactify
-- definition lists
definitionListItem = try $ do
notFollowedBy blankline
notFollowedBy' indentSpaces
-- first, see if this has any chance of being a definition list:
lookAhead (anyLine >> char ':')
term <- manyTill inline newline
raw <- many1 defRawBlock
state <- getState
let oldContext = stateParserContext state
-- parse the extracted block, which may contain various block elements:
contents <- parseFromString parseBlocks $ concat raw
updateState (\st -> st {stateParserContext = oldContext})
return ((normalizeSpaces term), contents)
defRawBlock = try $ do
char ':'
state <- getState
let tabStop = stateTabStop state
try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t")
firstline <- anyLine
rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine)
trailing <- option "" blanklines
return $ firstline ++ "\n" ++ unlines rawlines ++ trailing
definitionList = do
failIfStrict
items <- many1 definitionListItem
let (terms, defs) = unzip items
let defs' = compactify defs
let items' = zip terms defs'
return $ DefinitionList items'
--
-- paragraph block
--
para = try $ do
result <- many1 inline
newline
blanklines <|> do st <- getState
if stateStrict st
then lookAhead (blockQuote <|> header) >> return ""
else lookAhead emacsBoxQuote >> return ""
return $ Para $ normalizeSpaces result
plain = many1 inline >>= return . Plain . normalizeSpaces
--
-- raw html
--
htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
htmlBlock = do
st <- getState
if stateStrict st
then try $ do failUnlessBeginningOfLine
first <- htmlElement
finalSpace <- many (oneOf spaceChars)
finalNewlines <- many newline
return $ RawHtml $ first ++ finalSpace ++ finalNewlines
else rawHtmlBlocks
-- True if tag is self-closing
isSelfClosing tag =
isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag
strictHtmlBlock = try $ do
tag <- anyHtmlBlockTag
let tag' = extractTagType tag
if isSelfClosing tag || tag' == "hr"
then return tag
else do contents <- many (notFollowedBy' (htmlEndTag tag') >>
(htmlElement <|> (count 1 anyChar)))
end <- htmlEndTag tag'
return $ tag ++ concat contents ++ end
rawHtmlBlocks = do
htmlBlocks <- many1 rawHtmlBlock
let combined = concatMap (\(RawHtml str) -> str) htmlBlocks
let combined' = if not (null combined) && last combined == '\n'
then init combined -- strip extra newline
else combined
return $ RawHtml combined'
--
-- LaTeX
--
rawLaTeXEnvironment' = failIfStrict >> rawLaTeXEnvironment
--
-- Tables
--
-- Parse a dashed line with optional trailing spaces; return its length
-- and the length including trailing space.
dashedLine ch = do
dashes <- many1 (char ch)
sp <- many spaceChar
return $ (length dashes, length $ dashes ++ sp)
-- Parse a table header with dashed lines of '-' preceded by
-- one line of text.
simpleTableHeader = try $ do
rawContent <- anyLine
initSp <- nonindentSpaces
dashes <- many1 (dashedLine '-')
newline
let (lengths, lines) = unzip dashes
let indices = scanl (+) (length initSp) lines
let rawHeads = tail $ splitByIndices (init indices) rawContent
let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths
return (rawHeads, aligns, indices)
-- Parse a table footer - dashed lines followed by blank line.
tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines
-- Parse a table separator - dashed line.
tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n"
-- Parse a raw line and split it into chunks by indices.
rawTableLine indices = do
notFollowedBy' (blanklines <|> tableFooter)
line <- many1Till anyChar newline
return $ map removeLeadingTrailingSpace $ tail $
splitByIndices (init indices) line
-- Parse a table line and return a list of lists of blocks (columns).
tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain))
-- Parse a multiline table row and return a list of blocks (columns).
multilineRow indices = do
colLines <- many1 (rawTableLine indices)
optional blanklines
let cols = map unlines $ transpose colLines
mapM (parseFromString (many plain)) cols
-- Calculate relative widths of table columns, based on indices
widthsFromIndices :: Int -- Number of columns on terminal
-> [Int] -- Indices
-> [Float] -- Fractional relative sizes of columns
widthsFromIndices _ [] = []
widthsFromIndices numColumns indices =
let lengths = zipWith (-) indices (0:indices)
totLength = sum lengths
quotient = if totLength > numColumns
then fromIntegral totLength
else fromIntegral numColumns
fracs = map (\l -> (fromIntegral l) / quotient) lengths in
tail fracs
-- Parses a table caption: inlines beginning with 'Table:'
-- and followed by blank lines.
tableCaption = try $ do
nonindentSpaces
string "Table:"
result <- many1 inline
blanklines
return $ normalizeSpaces result
-- Parse a table using 'headerParser', 'lineParser', and 'footerParser'.
tableWith headerParser lineParser footerParser = try $ do
(rawHeads, aligns, indices) <- headerParser
lines <- many1Till (lineParser indices) footerParser
caption <- option [] tableCaption
heads <- mapM (parseFromString (many plain)) rawHeads
state <- getState
let numColumns = stateColumns state
let widths = widthsFromIndices numColumns indices
return $ Table caption aligns widths heads lines
-- Parse a simple table with '---' header and one line per row.
simpleTable = tableWith simpleTableHeader tableLine blanklines
-- Parse a multiline table: starts with row of '-' on top, then header
-- (which may be multiline), then the rows,
-- which may be multiline, separated by blank lines, and
-- ending with a footer (dashed line followed by blank line).
multilineTable = tableWith multilineTableHeader multilineRow tableFooter
multilineTableHeader = try $ do
tableSep
rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline)
initSp <- nonindentSpaces
dashes <- many1 (dashedLine '-')
newline
let (lengths, lines) = unzip dashes
let indices = scanl (+) (length initSp) lines
let rawHeadsList = transpose $ map
(\ln -> tail $ splitByIndices (init indices) ln)
rawContent
let rawHeads = map (joinWithSep " ") rawHeadsList
let aligns = zipWith alignType rawHeadsList lengths
return ((map removeLeadingTrailingSpace rawHeads), aligns, indices)
-- Returns an alignment type for a table, based on a list of strings
-- (the rows of the column header) and a number (the length of the
-- dashed line under the rows.
alignType :: [String] -> Int -> Alignment
alignType [] len = AlignDefault
alignType strLst len =
let str = head $ sortBy (comparing length) $
map removeTrailingSpace strLst
leftSpace = if null str then False else (str !! 0) `elem` " \t"
rightSpace = length str < len || (str !! (len - 1)) `elem` " \t"
in case (leftSpace, rightSpace) of
(True, False) -> AlignRight
(False, True) -> AlignLeft
(True, True) -> AlignCenter
(False, False) -> AlignDefault
table = failIfStrict >> (simpleTable <|> multilineTable) <?> "table"
--
-- inline
--
inline = choice [ str
, smartPunctuation
, whitespace
, endline
, code
, charRef
, strong
, emph
, note
, inlineNote
, link
, image
, math
, strikeout
, superscript
, subscript
, autoLink
, rawHtmlInline'
, rawLaTeXInline'
, escapedChar
, symbol
, ltSign ] <?> "inline"
escapedChar = do
char '\\'
state <- getState
result <- option '\\' $ if stateStrict state
then oneOf "\\`*_{}[]()>#+-.!~"
else satisfy (not . isAlphaNum)
return $ Str [result]
ltSign = do
st <- getState
if stateStrict st
then char '<'
else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html
return $ Str ['<']
specialCharsMinusLt = filter (/= '<') specialChars
symbol = do
result <- oneOf specialCharsMinusLt
return $ Str [result]
-- parses inline code, between n `s and n `s
code = try $ do
starts <- many1 (char '`')
skipSpaces
result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|>
(char '\n' >> return " "))
(try (skipSpaces >> count (length starts) (char '`') >>
notFollowedBy (char '`')))
return $ Code $ removeLeadingTrailingSpace $ concat result
mathWord = many1 ((noneOf " \t\n\\$") <|>
(try (char '\\') >>~ notFollowedBy (char '$')))
math = try $ do
failIfStrict
char '$'
notFollowedBy space
words <- sepBy1 mathWord (many1 space)
char '$'
return $ TeX ("$" ++ (joinWithSep " " words) ++ "$")
emph = ((enclosed (char '*') (char '*') inline) <|>
(enclosed (char '_') (char '_' >> notFollowedBy alphaNum) inline)) >>=
return . Emph . normalizeSpaces
strong = ((enclosed (string "**") (try $ string "**") inline) <|>
(enclosed (string "__") (try $ string "__") inline)) >>=
return . Strong . normalizeSpaces
strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>=
return . Strikeout . normalizeSpaces
superscript = failIfStrict >> enclosed (char '^') (char '^')
(notFollowedBy' whitespace >> inline) >>= -- may not contain Space
return . Superscript
subscript = failIfStrict >> enclosed (char '~') (char '~')
(notFollowedBy' whitespace >> inline) >>= -- may not contain Space
return . Subscript
smartPunctuation = failUnlessSmart >>
choice [ quoted, apostrophe, dash, ellipses ]
apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe
quoted = doubleQuoted <|> singleQuoted
withQuoteContext context parser = do
oldState <- getState
let oldQuoteContext = stateQuoteContext oldState
setState oldState { stateQuoteContext = context }
result <- parser
newState <- getState
setState newState { stateQuoteContext = oldQuoteContext }
return result
singleQuoted = try $ do
singleQuoteStart
withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>=
return . Quoted SingleQuote . normalizeSpaces
doubleQuoted = try $ do
doubleQuoteStart
withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>=
return . Quoted DoubleQuote . normalizeSpaces
failIfInQuoteContext context = do
st <- getState
if stateQuoteContext st == context
then fail "already inside quotes"
else return ()
singleQuoteStart = do
failIfInQuoteContext InSingleQuote
char '\8216' <|>
(try $ do char '\''
notFollowedBy (oneOf ")!],.;:-? \t\n")
notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
satisfy (not . isAlphaNum)))
-- possess/contraction
return '\'')
singleQuoteEnd = try $ do
char '\8217' <|> char '\''
notFollowedBy alphaNum
return '\''
doubleQuoteStart = do
failIfInQuoteContext InDoubleQuote
char '\8220' <|>
(try $ do char '"'
notFollowedBy (oneOf " \t\n")
return '"')
doubleQuoteEnd = char '\8221' <|> char '"'
ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses
dash = enDash <|> emDash
enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash
emDash = try $ skipSpaces >> oneOfStrings ["---", "--"] >>
skipSpaces >> return EmDash
whitespace = do
sps <- many1 (oneOf spaceChars)
if length sps >= 2
then option Space (endline >> return LineBreak)
else return Space <?> "whitespace"
nonEndline = satisfy (/='\n')
strChar = noneOf (specialChars ++ spaceChars ++ "\n")
str = many1 strChar >>= return . Str
-- an endline character that can be treated as a space, not a structural break
endline = try $ do
newline
notFollowedBy blankline
st <- getState
if stateStrict st
then do notFollowedBy emailBlockQuoteStart
notFollowedBy (char '#') -- atx header
else return ()
-- parse potential list-starts differently if in a list:
if stateParserContext st == ListItemState
then notFollowedBy' (bulletListStart <|>
(anyOrderedListStart >> return ()))
else return ()
return Space
--
-- links
--
-- a reference label for a link
reference = notFollowedBy' (string "[^") >> -- footnote reference
inlinesInBalanced "[" "]" >>= (return . normalizeSpaces)
-- source for a link, with optional title
source = try $ do
char '('
optional (char '<')
src <- many (noneOf ")> \t\n")
optional (char '>')
tit <- option "" linkTitle
skipSpaces
char ')'
return (removeTrailingSpace src, tit)
linkTitle = try $ do
(many1 spaceChar >> option '\n' newline) <|> newline
skipSpaces
delim <- char '\'' <|> char '"'
tit <- manyTill anyChar (try (char delim >> skipSpaces >>
notFollowedBy (noneOf ")\n")))
return $ decodeCharacterReferences tit
link = try $ do
label <- reference
src <- source <|> referenceLink label
return $ Link label src
-- a link like [this][ref] or [this][] or [this]
referenceLink label = do
ref <- option [] (try (optional (char ' ') >>
optional (newline >> skipSpaces) >> reference))
let ref' = if null ref then label else ref
state <- getState
case lookupKeySrc (stateKeys state) ref' of
Nothing -> fail "no corresponding key"
Just target -> return target
emailAddress = try $ do
name <- many1 (alphaNum <|> char '+')
char '@'
first <- many1 alphaNum
rest <- many1 (char '.' >> many1 alphaNum)
return $ "mailto:" ++ name ++ "@" ++ joinWithSep "." (first:rest)
uri = try $ do
str <- many1 (noneOf "\n\t >")
if isURI str
then return str
else fail "not a URI"
autoLink = try $ do
char '<'
src <- uri <|> emailAddress
char '>'
let src' = if "mailto:" `isPrefixOf` src
then drop 7 src
else src
st <- getState
return $ if stateStrict st
then Link [Str src'] (src, "")
else Link [Code src'] (src, "")
image = try $ do
char '!'
(Link label src) <- link
return $ Image label src
note = try $ do
failIfStrict
ref <- noteMarker
state <- getState
let notes = stateNotes state
case lookup ref notes of
Nothing -> fail "note not found"
Just contents -> return $ Note contents
inlineNote = try $ do
failIfStrict
char '^'
contents <- inlinesInBalanced "[" "]"
return $ Note [Para contents]
rawLaTeXInline' = failIfStrict >> rawLaTeXInline
rawHtmlInline' = do
st <- getState
result <- choice $ if stateStrict st
then [htmlBlockElement, anyHtmlTag, anyHtmlEndTag]
else [htmlBlockElement, anyHtmlInlineTag]
return $ HtmlInline result