f7b705b44c
For example, if you have a header '# Supported architectures', you can link to it with '[Supported architectures]'. If there are multiple headers with this label, the link will point to the first of them. Implicit references are always overridden by explicitly specified references. Addresses Issue #20. + Moved isPunctuation, uniqueIdentifiers, and inlineListToIdentifier from Text.Pandoc.Writers.HTML to Text.Pandoc.Shared. + Added stHeaders to ParserState. This holds a list of header texts used in the document, and is used to construct implicit header references. + In Text.Pandoc.Readers.Markdown, added call to headerReference parser in initial parsing pass, constructing a list of section header labels. This is then passed to uniqueIdentifiers to produce identifiers, and a list of implicit references is constructed. This is added to the end of the explicitly specified references, so it will be overridden by explicitly specified references. All of this processing is skipped if --strict was specified. + Modified documentation in README. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1086 788f1e2b-df1e-0410-8736-df70ead52e1b
935 lines
29 KiB
Haskell
935 lines
29 KiB
Haskell
{-
|
|
Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
-}
|
|
|
|
{- |
|
|
Module : Text.Pandoc.Readers.Markdown
|
|
Copyright : Copyright (C) 2006-7 John MacFarlane
|
|
License : GNU GPL, version 2 or above
|
|
|
|
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
|
Stability : alpha
|
|
Portability : portable
|
|
|
|
Conversion of markdown-formatted plain text to 'Pandoc' document.
|
|
-}
|
|
module Text.Pandoc.Readers.Markdown (
|
|
readMarkdown
|
|
) where
|
|
|
|
import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy )
|
|
import Data.Ord ( comparing )
|
|
import Data.Char ( isAlphaNum )
|
|
import Network.URI ( isURI )
|
|
import Text.Pandoc.Definition
|
|
import Text.Pandoc.Shared
|
|
import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
|
|
import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
|
|
anyHtmlInlineTag, anyHtmlTag,
|
|
anyHtmlEndTag, htmlEndTag, extractTagType,
|
|
htmlBlockElement )
|
|
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
|
|
import Text.ParserCombinators.Parsec
|
|
|
|
-- | Read markdown from an input string and return a Pandoc document.
|
|
readMarkdown :: ParserState -> String -> Pandoc
|
|
readMarkdown state str = (readWith parseMarkdown) state (str ++ "\n\n")
|
|
|
|
--
|
|
-- Constants and data structure definitions
|
|
--
|
|
|
|
spaceChars = " \t"
|
|
bulletListMarkers = "*+-"
|
|
hruleChars = "*-_"
|
|
setextHChars = "=-"
|
|
|
|
-- treat these as potentially non-text when parsing inline:
|
|
specialChars = "\\[]*_~`<>$!^-.&'\"\8216\8217\8220\8221"
|
|
|
|
--
|
|
-- auxiliary functions
|
|
--
|
|
|
|
indentSpaces = try $ do
|
|
state <- getState
|
|
let tabStop = stateTabStop state
|
|
try (count tabStop (char ' ')) <|>
|
|
(many (char ' ') >> string "\t") <?> "indentation"
|
|
|
|
nonindentSpaces = do
|
|
state <- getState
|
|
let tabStop = stateTabStop state
|
|
sps <- many (char ' ')
|
|
if length sps < tabStop
|
|
then return sps
|
|
else unexpected "indented line"
|
|
|
|
-- | Fail unless we're at beginning of a line.
|
|
failUnlessBeginningOfLine = do
|
|
pos <- getPosition
|
|
if sourceColumn pos == 1 then return () else fail "not beginning of line"
|
|
|
|
-- | Fail unless we're in "smart typography" mode.
|
|
failUnlessSmart = do
|
|
state <- getState
|
|
if stateSmart state then return () else fail "Smart typography feature"
|
|
|
|
-- | Parse an inline Str element with a given content.
|
|
inlineString str = try $ do
|
|
(Str res) <- inline
|
|
if res == str then return res else fail $ "unexpected Str content"
|
|
|
|
-- | Parse a sequence of inline elements between a string
|
|
-- @opener@ and a string @closer@, including inlines
|
|
-- between balanced pairs of @opener@ and a @closer@.
|
|
inlinesInBalanced :: String -> String -> GenParser Char ParserState [Inline]
|
|
inlinesInBalanced opener closer = try $ do
|
|
string opener
|
|
result <- manyTill ( (do lookAhead (inlineString opener)
|
|
-- because it might be a link...
|
|
bal <- inlinesInBalanced opener closer
|
|
return $ [Str opener] ++ bal ++ [Str closer])
|
|
<|> (count 1 inline))
|
|
(try (string closer))
|
|
return $ concat result
|
|
|
|
--
|
|
-- document structure
|
|
--
|
|
|
|
titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline
|
|
|
|
authorsLine = try $ do
|
|
char '%'
|
|
skipSpaces
|
|
authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
|
|
newline
|
|
return $ map (decodeCharacterReferences . removeLeadingTrailingSpace) authors
|
|
|
|
dateLine = try $ do
|
|
char '%'
|
|
skipSpaces
|
|
date <- many (noneOf "\n")
|
|
newline
|
|
return $ decodeCharacterReferences $ removeTrailingSpace date
|
|
|
|
titleBlock = try $ do
|
|
failIfStrict
|
|
title <- option [] titleLine
|
|
author <- option [] authorsLine
|
|
date <- option "" dateLine
|
|
optional blanklines
|
|
return (title, author, date)
|
|
|
|
parseMarkdown = do
|
|
-- markdown allows raw HTML
|
|
updateState (\state -> state { stateParseRaw = True })
|
|
startPos <- getPosition
|
|
-- go through once just to get list of reference keys
|
|
-- docMinusKeys is the raw document with blanks where the keys were...
|
|
docMinusKeys <- manyTill (referenceKey <|> headerReference <|>
|
|
lineClump) eof >>= return . concat
|
|
setInput docMinusKeys
|
|
setPosition startPos
|
|
st <- getState
|
|
-- get headers and construct implicit references unless strict
|
|
if stateStrict st
|
|
then return ()
|
|
else do let oldkeys = stateKeys st
|
|
let headers = reverse $ stateHeaders st
|
|
let idents = uniqueIdentifiers headers
|
|
let implicitRefs = zipWith (\hd ident -> (hd, ("#" ++ ident, "")))
|
|
headers idents
|
|
updateState $ \st -> st { stateKeys = oldkeys ++ implicitRefs }
|
|
-- go through again for notes unless strict...
|
|
if stateStrict st
|
|
then return ()
|
|
else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>=
|
|
return . concat
|
|
st <- getState
|
|
let reversedNotes = stateNotes st
|
|
updateState $ \st -> st { stateNotes = reverse reversedNotes }
|
|
setInput docMinusNotes
|
|
setPosition startPos
|
|
-- now parse it for real...
|
|
(title, author, date) <- option ([],[],"") titleBlock
|
|
blocks <- parseBlocks
|
|
return $ Pandoc (Meta title author date) $ filter (/= Null) blocks
|
|
|
|
--
|
|
-- initial pass for references and notes
|
|
--
|
|
|
|
referenceKey = try $ do
|
|
startPos <- getPosition
|
|
nonindentSpaces
|
|
label <- reference
|
|
char ':'
|
|
skipSpaces
|
|
optional (char '<')
|
|
src <- many (noneOf "> \n\t")
|
|
optional (char '>')
|
|
tit <- option "" referenceTitle
|
|
blanklines
|
|
endPos <- getPosition
|
|
let newkey = (label, (removeTrailingSpace src, tit))
|
|
st <- getState
|
|
let oldkeys = stateKeys st
|
|
updateState $ \st -> st { stateKeys = newkey : oldkeys }
|
|
-- return blanks so line count isn't affected
|
|
return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
|
|
|
|
headerReference = try $ do
|
|
failIfStrict
|
|
startPos <- getPosition
|
|
(Header level text) <- lookAhead $ atxHeader <|> setextHeader
|
|
st <- getState
|
|
let headers = stateHeaders st
|
|
updateState $ \st -> st { stateHeaders = text:headers }
|
|
endPos <- getPosition
|
|
lineClump -- return the raw header, because we need to parse it later
|
|
|
|
referenceTitle = try $ do
|
|
(many1 spaceChar >> option '\n' newline) <|> newline
|
|
skipSpaces
|
|
tit <- (charsInBalanced '(' ')' >>= return . unwords . words)
|
|
<|> do delim <- char '\'' <|> char '"'
|
|
manyTill anyChar (try (char delim >> skipSpaces >>
|
|
notFollowedBy (noneOf ")\n")))
|
|
return $ decodeCharacterReferences tit
|
|
|
|
noteMarker = string "[^" >> manyTill (noneOf " \t\n") (char ']')
|
|
|
|
rawLine = do
|
|
notFollowedBy blankline
|
|
notFollowedBy' noteMarker
|
|
contents <- many1 nonEndline
|
|
end <- option "" (newline >> optional indentSpaces >> return "\n")
|
|
return $ contents ++ end
|
|
|
|
rawLines = many1 rawLine >>= return . concat
|
|
|
|
noteBlock = try $ do
|
|
startPos <- getPosition
|
|
ref <- noteMarker
|
|
char ':'
|
|
optional blankline
|
|
optional indentSpaces
|
|
raw <- sepBy rawLines (try (blankline >> indentSpaces))
|
|
optional blanklines
|
|
endPos <- getPosition
|
|
-- parse the extracted text, which may contain various block elements:
|
|
contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n"
|
|
let newnote = (ref, contents)
|
|
st <- getState
|
|
let oldnotes = stateNotes st
|
|
updateState $ \st -> st { stateNotes = newnote : oldnotes }
|
|
-- return blanks so line count isn't affected
|
|
return $ replicate (sourceLine endPos - sourceLine startPos) '\n'
|
|
|
|
--
|
|
-- parsing blocks
|
|
--
|
|
|
|
parseBlocks = manyTill block eof
|
|
|
|
block = choice [ header
|
|
, table
|
|
, codeBlock
|
|
, hrule
|
|
, list
|
|
, blockQuote
|
|
, htmlBlock
|
|
, rawLaTeXEnvironment'
|
|
, para
|
|
, plain
|
|
, nullBlock ] <?> "block"
|
|
|
|
--
|
|
-- header blocks
|
|
--
|
|
|
|
header = atxHeader <|> setextHeader <?> "header"
|
|
|
|
atxHeader = try $ do
|
|
level <- many1 (char '#') >>= return . length
|
|
notFollowedBy (char '.' <|> char ')') -- this would be a list
|
|
skipSpaces
|
|
text <- manyTill inline atxClosing >>= return . normalizeSpaces
|
|
return $ Header level text
|
|
|
|
atxClosing = try $ skipMany (char '#') >> blanklines
|
|
|
|
setextHeader = try $ do
|
|
-- first, see if this block has any chance of being a setextHeader:
|
|
lookAhead (anyLine >> oneOf setextHChars)
|
|
text <- many1Till inline newline >>= return . normalizeSpaces
|
|
level <- choice $ zipWith
|
|
(\ch lev -> try (many1 $ char ch) >> blanklines >> return lev)
|
|
setextHChars [1..(length setextHChars)]
|
|
return $ Header level text
|
|
|
|
--
|
|
-- hrule block
|
|
--
|
|
|
|
hrule = try $ do
|
|
skipSpaces
|
|
start <- oneOf hruleChars
|
|
count 2 (skipSpaces >> char start)
|
|
skipMany (skipSpaces >> char start)
|
|
newline
|
|
optional blanklines
|
|
return HorizontalRule
|
|
|
|
--
|
|
-- code blocks
|
|
--
|
|
|
|
indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n")
|
|
|
|
codeBlock = do
|
|
contents <- many1 (indentedLine <|>
|
|
try (do b <- blanklines
|
|
l <- indentedLine
|
|
return $ b ++ l))
|
|
optional blanklines
|
|
return $ CodeBlock $ stripTrailingNewlines $ concat contents
|
|
|
|
--
|
|
-- block quotes
|
|
--
|
|
|
|
emacsBoxQuote = try $ do
|
|
failIfStrict
|
|
string ",----"
|
|
manyTill anyChar newline
|
|
raw <- manyTill
|
|
(try (char '|' >> optional (char ' ') >> manyTill anyChar newline))
|
|
(try (string "`----"))
|
|
blanklines
|
|
return raw
|
|
|
|
emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ')
|
|
|
|
emailBlockQuote = try $ do
|
|
emailBlockQuoteStart
|
|
raw <- sepBy (many (nonEndline <|>
|
|
(try (endline >> notFollowedBy emailBlockQuoteStart >>
|
|
return '\n'))))
|
|
(try (newline >> emailBlockQuoteStart))
|
|
newline <|> (eof >> return '\n')
|
|
optional blanklines
|
|
return raw
|
|
|
|
blockQuote = do
|
|
raw <- emailBlockQuote <|> emacsBoxQuote
|
|
-- parse the extracted block, which may contain various block elements:
|
|
contents <- parseFromString parseBlocks $ (joinWithSep "\n" raw) ++ "\n\n"
|
|
return $ BlockQuote contents
|
|
|
|
--
|
|
-- list blocks
|
|
--
|
|
|
|
list = choice [ bulletList, orderedList, definitionList ] <?> "list"
|
|
|
|
bulletListStart = try $ do
|
|
optional newline -- if preceded by a Plain block in a list context
|
|
nonindentSpaces
|
|
notFollowedBy' hrule -- because hrules start out just like lists
|
|
oneOf bulletListMarkers
|
|
spaceChar
|
|
skipSpaces
|
|
|
|
anyOrderedListStart = try $ do
|
|
optional newline -- if preceded by a Plain block in a list context
|
|
nonindentSpaces
|
|
notFollowedBy $ string "p." >> spaceChar >> digit -- page number
|
|
state <- getState
|
|
if stateStrict state
|
|
then do many1 digit
|
|
char '.'
|
|
spaceChar
|
|
return (1, DefaultStyle, DefaultDelim)
|
|
else anyOrderedListMarker >>~ spaceChar
|
|
|
|
orderedListStart style delim = try $ do
|
|
optional newline -- if preceded by a Plain block in a list context
|
|
nonindentSpaces
|
|
state <- getState
|
|
num <- if stateStrict state
|
|
then do many1 digit
|
|
char '.'
|
|
return 1
|
|
else orderedListMarker style delim
|
|
if delim == Period && (style == UpperAlpha || (style == UpperRoman &&
|
|
num `elem` [1, 5, 10, 50, 100, 500, 1000]))
|
|
then char '\t' <|> (spaceChar >> spaceChar)
|
|
else spaceChar
|
|
skipSpaces
|
|
|
|
-- parse a line of a list item (start = parser for beginning of list item)
|
|
listLine start = try $ do
|
|
notFollowedBy' start
|
|
notFollowedBy blankline
|
|
notFollowedBy' (do indentSpaces
|
|
many (spaceChar)
|
|
bulletListStart <|> (anyOrderedListStart >> return ()))
|
|
line <- manyTill anyChar newline
|
|
return $ line ++ "\n"
|
|
|
|
-- parse raw text for one list item, excluding start marker and continuations
|
|
rawListItem start = try $ do
|
|
start
|
|
result <- many1 (listLine start)
|
|
blanks <- many blankline
|
|
return $ concat result ++ blanks
|
|
|
|
-- continuation of a list item - indented and separated by blankline
|
|
-- or (in compact lists) endline.
|
|
-- note: nested lists are parsed as continuations
|
|
listContinuation start = try $ do
|
|
lookAhead indentSpaces
|
|
result <- many1 (listContinuationLine start)
|
|
blanks <- many blankline
|
|
return $ concat result ++ blanks
|
|
|
|
listContinuationLine start = try $ do
|
|
notFollowedBy blankline
|
|
notFollowedBy' start
|
|
optional indentSpaces
|
|
result <- manyTill anyChar newline
|
|
return $ result ++ "\n"
|
|
|
|
listItem start = try $ do
|
|
first <- rawListItem start
|
|
continuations <- many (listContinuation start)
|
|
-- parsing with ListItemState forces markers at beginning of lines to
|
|
-- count as list item markers, even if not separated by blank space.
|
|
-- see definition of "endline"
|
|
state <- getState
|
|
let oldContext = stateParserContext state
|
|
setState $ state {stateParserContext = ListItemState}
|
|
-- parse the extracted block, which may contain various block elements:
|
|
let raw = concat (first:continuations)
|
|
contents <- parseFromString parseBlocks raw
|
|
updateState (\st -> st {stateParserContext = oldContext})
|
|
return contents
|
|
|
|
orderedList = try $ do
|
|
(start, style, delim) <- lookAhead anyOrderedListStart
|
|
items <- many1 (listItem (orderedListStart style delim))
|
|
return $ OrderedList (start, style, delim) $ compactify items
|
|
|
|
bulletList = many1 (listItem bulletListStart) >>=
|
|
return . BulletList . compactify
|
|
|
|
-- definition lists
|
|
|
|
definitionListItem = try $ do
|
|
notFollowedBy blankline
|
|
notFollowedBy' indentSpaces
|
|
-- first, see if this has any chance of being a definition list:
|
|
lookAhead (anyLine >> char ':')
|
|
term <- manyTill inline newline
|
|
raw <- many1 defRawBlock
|
|
state <- getState
|
|
let oldContext = stateParserContext state
|
|
-- parse the extracted block, which may contain various block elements:
|
|
contents <- parseFromString parseBlocks $ concat raw
|
|
updateState (\st -> st {stateParserContext = oldContext})
|
|
return ((normalizeSpaces term), contents)
|
|
|
|
defRawBlock = try $ do
|
|
char ':'
|
|
state <- getState
|
|
let tabStop = stateTabStop state
|
|
try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t")
|
|
firstline <- anyLine
|
|
rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine)
|
|
trailing <- option "" blanklines
|
|
return $ firstline ++ "\n" ++ unlines rawlines ++ trailing
|
|
|
|
definitionList = do
|
|
failIfStrict
|
|
items <- many1 definitionListItem
|
|
let (terms, defs) = unzip items
|
|
let defs' = compactify defs
|
|
let items' = zip terms defs'
|
|
return $ DefinitionList items'
|
|
|
|
--
|
|
-- paragraph block
|
|
--
|
|
|
|
para = try $ do
|
|
result <- many1 inline
|
|
newline
|
|
blanklines <|> do st <- getState
|
|
if stateStrict st
|
|
then lookAhead (blockQuote <|> header) >> return ""
|
|
else lookAhead emacsBoxQuote >> return ""
|
|
return $ Para $ normalizeSpaces result
|
|
|
|
plain = many1 inline >>= return . Plain . normalizeSpaces
|
|
|
|
--
|
|
-- raw html
|
|
--
|
|
|
|
htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
|
|
|
|
htmlBlock = do
|
|
st <- getState
|
|
if stateStrict st
|
|
then try $ do failUnlessBeginningOfLine
|
|
first <- htmlElement
|
|
finalSpace <- many (oneOf spaceChars)
|
|
finalNewlines <- many newline
|
|
return $ RawHtml $ first ++ finalSpace ++ finalNewlines
|
|
else rawHtmlBlocks
|
|
|
|
-- True if tag is self-closing
|
|
isSelfClosing tag =
|
|
isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag
|
|
|
|
strictHtmlBlock = try $ do
|
|
tag <- anyHtmlBlockTag
|
|
let tag' = extractTagType tag
|
|
if isSelfClosing tag || tag' == "hr"
|
|
then return tag
|
|
else do contents <- many (notFollowedBy' (htmlEndTag tag') >>
|
|
(htmlElement <|> (count 1 anyChar)))
|
|
end <- htmlEndTag tag'
|
|
return $ tag ++ concat contents ++ end
|
|
|
|
rawHtmlBlocks = do
|
|
htmlBlocks <- many1 rawHtmlBlock
|
|
let combined = concatMap (\(RawHtml str) -> str) htmlBlocks
|
|
let combined' = if not (null combined) && last combined == '\n'
|
|
then init combined -- strip extra newline
|
|
else combined
|
|
return $ RawHtml combined'
|
|
|
|
--
|
|
-- LaTeX
|
|
--
|
|
|
|
rawLaTeXEnvironment' = failIfStrict >> rawLaTeXEnvironment
|
|
|
|
--
|
|
-- Tables
|
|
--
|
|
|
|
-- Parse a dashed line with optional trailing spaces; return its length
|
|
-- and the length including trailing space.
|
|
dashedLine ch = do
|
|
dashes <- many1 (char ch)
|
|
sp <- many spaceChar
|
|
return $ (length dashes, length $ dashes ++ sp)
|
|
|
|
-- Parse a table header with dashed lines of '-' preceded by
|
|
-- one line of text.
|
|
simpleTableHeader = try $ do
|
|
rawContent <- anyLine
|
|
initSp <- nonindentSpaces
|
|
dashes <- many1 (dashedLine '-')
|
|
newline
|
|
let (lengths, lines) = unzip dashes
|
|
let indices = scanl (+) (length initSp) lines
|
|
let rawHeads = tail $ splitByIndices (init indices) rawContent
|
|
let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths
|
|
return (rawHeads, aligns, indices)
|
|
|
|
-- Parse a table footer - dashed lines followed by blank line.
|
|
tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines
|
|
|
|
-- Parse a table separator - dashed line.
|
|
tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n"
|
|
|
|
-- Parse a raw line and split it into chunks by indices.
|
|
rawTableLine indices = do
|
|
notFollowedBy' (blanklines <|> tableFooter)
|
|
line <- many1Till anyChar newline
|
|
return $ map removeLeadingTrailingSpace $ tail $
|
|
splitByIndices (init indices) line
|
|
|
|
-- Parse a table line and return a list of lists of blocks (columns).
|
|
tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain))
|
|
|
|
-- Parse a multiline table row and return a list of blocks (columns).
|
|
multilineRow indices = do
|
|
colLines <- many1 (rawTableLine indices)
|
|
optional blanklines
|
|
let cols = map unlines $ transpose colLines
|
|
mapM (parseFromString (many plain)) cols
|
|
|
|
-- Calculate relative widths of table columns, based on indices
|
|
widthsFromIndices :: Int -- Number of columns on terminal
|
|
-> [Int] -- Indices
|
|
-> [Float] -- Fractional relative sizes of columns
|
|
widthsFromIndices _ [] = []
|
|
widthsFromIndices numColumns indices =
|
|
let lengths = zipWith (-) indices (0:indices)
|
|
totLength = sum lengths
|
|
quotient = if totLength > numColumns
|
|
then fromIntegral totLength
|
|
else fromIntegral numColumns
|
|
fracs = map (\l -> (fromIntegral l) / quotient) lengths in
|
|
tail fracs
|
|
|
|
-- Parses a table caption: inlines beginning with 'Table:'
|
|
-- and followed by blank lines.
|
|
tableCaption = try $ do
|
|
nonindentSpaces
|
|
string "Table:"
|
|
result <- many1 inline
|
|
blanklines
|
|
return $ normalizeSpaces result
|
|
|
|
-- Parse a table using 'headerParser', 'lineParser', and 'footerParser'.
|
|
tableWith headerParser lineParser footerParser = try $ do
|
|
(rawHeads, aligns, indices) <- headerParser
|
|
lines <- many1Till (lineParser indices) footerParser
|
|
caption <- option [] tableCaption
|
|
heads <- mapM (parseFromString (many plain)) rawHeads
|
|
state <- getState
|
|
let numColumns = stateColumns state
|
|
let widths = widthsFromIndices numColumns indices
|
|
return $ Table caption aligns widths heads lines
|
|
|
|
-- Parse a simple table with '---' header and one line per row.
|
|
simpleTable = tableWith simpleTableHeader tableLine blanklines
|
|
|
|
-- Parse a multiline table: starts with row of '-' on top, then header
|
|
-- (which may be multiline), then the rows,
|
|
-- which may be multiline, separated by blank lines, and
|
|
-- ending with a footer (dashed line followed by blank line).
|
|
multilineTable = tableWith multilineTableHeader multilineRow tableFooter
|
|
|
|
multilineTableHeader = try $ do
|
|
tableSep
|
|
rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline)
|
|
initSp <- nonindentSpaces
|
|
dashes <- many1 (dashedLine '-')
|
|
newline
|
|
let (lengths, lines) = unzip dashes
|
|
let indices = scanl (+) (length initSp) lines
|
|
let rawHeadsList = transpose $ map
|
|
(\ln -> tail $ splitByIndices (init indices) ln)
|
|
rawContent
|
|
let rawHeads = map (joinWithSep " ") rawHeadsList
|
|
let aligns = zipWith alignType rawHeadsList lengths
|
|
return ((map removeLeadingTrailingSpace rawHeads), aligns, indices)
|
|
|
|
-- Returns an alignment type for a table, based on a list of strings
|
|
-- (the rows of the column header) and a number (the length of the
|
|
-- dashed line under the rows.
|
|
alignType :: [String] -> Int -> Alignment
|
|
alignType [] len = AlignDefault
|
|
alignType strLst len =
|
|
let str = head $ sortBy (comparing length) $
|
|
map removeTrailingSpace strLst
|
|
leftSpace = if null str then False else (str !! 0) `elem` " \t"
|
|
rightSpace = length str < len || (str !! (len - 1)) `elem` " \t"
|
|
in case (leftSpace, rightSpace) of
|
|
(True, False) -> AlignRight
|
|
(False, True) -> AlignLeft
|
|
(True, True) -> AlignCenter
|
|
(False, False) -> AlignDefault
|
|
|
|
table = failIfStrict >> (simpleTable <|> multilineTable) <?> "table"
|
|
|
|
--
|
|
-- inline
|
|
--
|
|
|
|
inline = choice [ str
|
|
, smartPunctuation
|
|
, whitespace
|
|
, endline
|
|
, code
|
|
, charRef
|
|
, strong
|
|
, emph
|
|
, note
|
|
, inlineNote
|
|
, link
|
|
, image
|
|
, math
|
|
, strikeout
|
|
, superscript
|
|
, subscript
|
|
, autoLink
|
|
, rawHtmlInline'
|
|
, rawLaTeXInline'
|
|
, escapedChar
|
|
, symbol
|
|
, ltSign ] <?> "inline"
|
|
|
|
escapedChar = do
|
|
char '\\'
|
|
state <- getState
|
|
result <- option '\\' $ if stateStrict state
|
|
then oneOf "\\`*_{}[]()>#+-.!~"
|
|
else satisfy (not . isAlphaNum)
|
|
return $ Str [result]
|
|
|
|
ltSign = do
|
|
st <- getState
|
|
if stateStrict st
|
|
then char '<'
|
|
else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html
|
|
return $ Str ['<']
|
|
|
|
specialCharsMinusLt = filter (/= '<') specialChars
|
|
|
|
symbol = do
|
|
result <- oneOf specialCharsMinusLt
|
|
return $ Str [result]
|
|
|
|
-- parses inline code, between n `s and n `s
|
|
code = try $ do
|
|
starts <- many1 (char '`')
|
|
skipSpaces
|
|
result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|>
|
|
(char '\n' >> return " "))
|
|
(try (skipSpaces >> count (length starts) (char '`') >>
|
|
notFollowedBy (char '`')))
|
|
return $ Code $ removeLeadingTrailingSpace $ concat result
|
|
|
|
mathWord = many1 ((noneOf " \t\n\\$") <|>
|
|
(try (char '\\') >>~ notFollowedBy (char '$')))
|
|
|
|
math = try $ do
|
|
failIfStrict
|
|
char '$'
|
|
notFollowedBy space
|
|
words <- sepBy1 mathWord (many1 space)
|
|
char '$'
|
|
return $ TeX ("$" ++ (joinWithSep " " words) ++ "$")
|
|
|
|
emph = ((enclosed (char '*') (char '*') inline) <|>
|
|
(enclosed (char '_') (char '_' >> notFollowedBy alphaNum) inline)) >>=
|
|
return . Emph . normalizeSpaces
|
|
|
|
strong = ((enclosed (string "**") (try $ string "**") inline) <|>
|
|
(enclosed (string "__") (try $ string "__") inline)) >>=
|
|
return . Strong . normalizeSpaces
|
|
|
|
strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>=
|
|
return . Strikeout . normalizeSpaces
|
|
|
|
superscript = failIfStrict >> enclosed (char '^') (char '^')
|
|
(notFollowedBy' whitespace >> inline) >>= -- may not contain Space
|
|
return . Superscript
|
|
|
|
subscript = failIfStrict >> enclosed (char '~') (char '~')
|
|
(notFollowedBy' whitespace >> inline) >>= -- may not contain Space
|
|
return . Subscript
|
|
|
|
smartPunctuation = failUnlessSmart >>
|
|
choice [ quoted, apostrophe, dash, ellipses ]
|
|
|
|
apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe
|
|
|
|
quoted = doubleQuoted <|> singleQuoted
|
|
|
|
withQuoteContext context parser = do
|
|
oldState <- getState
|
|
let oldQuoteContext = stateQuoteContext oldState
|
|
setState oldState { stateQuoteContext = context }
|
|
result <- parser
|
|
newState <- getState
|
|
setState newState { stateQuoteContext = oldQuoteContext }
|
|
return result
|
|
|
|
singleQuoted = try $ do
|
|
singleQuoteStart
|
|
withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>=
|
|
return . Quoted SingleQuote . normalizeSpaces
|
|
|
|
doubleQuoted = try $ do
|
|
doubleQuoteStart
|
|
withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>=
|
|
return . Quoted DoubleQuote . normalizeSpaces
|
|
|
|
failIfInQuoteContext context = do
|
|
st <- getState
|
|
if stateQuoteContext st == context
|
|
then fail "already inside quotes"
|
|
else return ()
|
|
|
|
singleQuoteStart = do
|
|
failIfInQuoteContext InSingleQuote
|
|
char '\8216' <|>
|
|
(try $ do char '\''
|
|
notFollowedBy (oneOf ")!],.;:-? \t\n")
|
|
notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
|
|
satisfy (not . isAlphaNum)))
|
|
-- possess/contraction
|
|
return '\'')
|
|
|
|
singleQuoteEnd = try $ do
|
|
char '\8217' <|> char '\''
|
|
notFollowedBy alphaNum
|
|
return '\''
|
|
|
|
doubleQuoteStart = do
|
|
failIfInQuoteContext InDoubleQuote
|
|
char '\8220' <|>
|
|
(try $ do char '"'
|
|
notFollowedBy (oneOf " \t\n")
|
|
return '"')
|
|
|
|
doubleQuoteEnd = char '\8221' <|> char '"'
|
|
|
|
ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses
|
|
|
|
dash = enDash <|> emDash
|
|
|
|
enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash
|
|
|
|
emDash = try $ skipSpaces >> oneOfStrings ["---", "--"] >>
|
|
skipSpaces >> return EmDash
|
|
|
|
whitespace = do
|
|
sps <- many1 (oneOf spaceChars)
|
|
if length sps >= 2
|
|
then option Space (endline >> return LineBreak)
|
|
else return Space <?> "whitespace"
|
|
|
|
nonEndline = satisfy (/='\n')
|
|
|
|
strChar = noneOf (specialChars ++ spaceChars ++ "\n")
|
|
|
|
str = many1 strChar >>= return . Str
|
|
|
|
-- an endline character that can be treated as a space, not a structural break
|
|
endline = try $ do
|
|
newline
|
|
notFollowedBy blankline
|
|
st <- getState
|
|
if stateStrict st
|
|
then do notFollowedBy emailBlockQuoteStart
|
|
notFollowedBy (char '#') -- atx header
|
|
else return ()
|
|
-- parse potential list-starts differently if in a list:
|
|
if stateParserContext st == ListItemState
|
|
then notFollowedBy' (bulletListStart <|>
|
|
(anyOrderedListStart >> return ()))
|
|
else return ()
|
|
return Space
|
|
|
|
--
|
|
-- links
|
|
--
|
|
|
|
-- a reference label for a link
|
|
reference = notFollowedBy' (string "[^") >> -- footnote reference
|
|
inlinesInBalanced "[" "]" >>= (return . normalizeSpaces)
|
|
|
|
-- source for a link, with optional title
|
|
source = try $ do
|
|
char '('
|
|
optional (char '<')
|
|
src <- many (noneOf ")> \t\n")
|
|
optional (char '>')
|
|
tit <- option "" linkTitle
|
|
skipSpaces
|
|
char ')'
|
|
return (removeTrailingSpace src, tit)
|
|
|
|
linkTitle = try $ do
|
|
(many1 spaceChar >> option '\n' newline) <|> newline
|
|
skipSpaces
|
|
delim <- char '\'' <|> char '"'
|
|
tit <- manyTill anyChar (try (char delim >> skipSpaces >>
|
|
notFollowedBy (noneOf ")\n")))
|
|
return $ decodeCharacterReferences tit
|
|
|
|
link = try $ do
|
|
label <- reference
|
|
src <- source <|> referenceLink label
|
|
return $ Link label src
|
|
|
|
-- a link like [this][ref] or [this][] or [this]
|
|
referenceLink label = do
|
|
ref <- option [] (try (optional (char ' ') >>
|
|
optional (newline >> skipSpaces) >> reference))
|
|
let ref' = if null ref then label else ref
|
|
state <- getState
|
|
case lookupKeySrc (stateKeys state) ref' of
|
|
Nothing -> fail "no corresponding key"
|
|
Just target -> return target
|
|
|
|
emailAddress = try $ do
|
|
name <- many1 (alphaNum <|> char '+')
|
|
char '@'
|
|
first <- many1 alphaNum
|
|
rest <- many1 (char '.' >> many1 alphaNum)
|
|
return $ "mailto:" ++ name ++ "@" ++ joinWithSep "." (first:rest)
|
|
|
|
uri = try $ do
|
|
str <- many1 (noneOf "\n\t >")
|
|
if isURI str
|
|
then return str
|
|
else fail "not a URI"
|
|
|
|
autoLink = try $ do
|
|
char '<'
|
|
src <- uri <|> emailAddress
|
|
char '>'
|
|
let src' = if "mailto:" `isPrefixOf` src
|
|
then drop 7 src
|
|
else src
|
|
st <- getState
|
|
return $ if stateStrict st
|
|
then Link [Str src'] (src, "")
|
|
else Link [Code src'] (src, "")
|
|
|
|
image = try $ do
|
|
char '!'
|
|
(Link label src) <- link
|
|
return $ Image label src
|
|
|
|
note = try $ do
|
|
failIfStrict
|
|
ref <- noteMarker
|
|
state <- getState
|
|
let notes = stateNotes state
|
|
case lookup ref notes of
|
|
Nothing -> fail "note not found"
|
|
Just contents -> return $ Note contents
|
|
|
|
inlineNote = try $ do
|
|
failIfStrict
|
|
char '^'
|
|
contents <- inlinesInBalanced "[" "]"
|
|
return $ Note [Para contents]
|
|
|
|
rawLaTeXInline' = failIfStrict >> rawLaTeXInline
|
|
|
|
rawHtmlInline' = do
|
|
st <- getState
|
|
result <- choice $ if stateStrict st
|
|
then [htmlBlockElement, anyHtmlTag, anyHtmlEndTag]
|
|
else [htmlBlockElement, anyHtmlInlineTag]
|
|
return $ HtmlInline result
|
|
|