2010-07-04 13:43:45 -07:00
|
|
|
|
{-
|
|
|
|
|
Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
|
|
|
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
|
the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
|
along with this program; if not, write to the Free Software
|
|
|
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
|
-}
|
|
|
|
|
|
|
|
|
|
{- |
|
|
|
|
|
Module : Text.Pandoc.Parsing
|
|
|
|
|
Copyright : Copyright (C) 2006-2010 John MacFarlane
|
2011-04-29 11:34:36 -07:00
|
|
|
|
License : GNU GPL, version 2 or above
|
2010-07-04 13:43:45 -07:00
|
|
|
|
|
|
|
|
|
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
|
|
|
|
Stability : alpha
|
|
|
|
|
Portability : portable
|
|
|
|
|
|
|
|
|
|
A utility library with parsers used in pandoc readers.
|
|
|
|
|
-}
|
2011-04-29 11:34:36 -07:00
|
|
|
|
module Text.Pandoc.Parsing ( (>>~),
|
2010-07-04 13:43:45 -07:00
|
|
|
|
anyLine,
|
|
|
|
|
many1Till,
|
|
|
|
|
notFollowedBy',
|
|
|
|
|
oneOfStrings,
|
|
|
|
|
spaceChar,
|
2011-07-30 18:08:02 -07:00
|
|
|
|
nonspaceChar,
|
2010-07-04 13:43:45 -07:00
|
|
|
|
skipSpaces,
|
|
|
|
|
blankline,
|
|
|
|
|
blanklines,
|
|
|
|
|
enclosed,
|
|
|
|
|
stringAnyCase,
|
|
|
|
|
parseFromString,
|
|
|
|
|
lineClump,
|
|
|
|
|
charsInBalanced,
|
|
|
|
|
romanNumeral,
|
|
|
|
|
emailAddress,
|
|
|
|
|
uri,
|
|
|
|
|
withHorizDisplacement,
|
2012-01-29 23:54:00 -08:00
|
|
|
|
withRaw,
|
2010-07-04 13:43:45 -07:00
|
|
|
|
nullBlock,
|
|
|
|
|
failIfStrict,
|
|
|
|
|
failUnlessLHS,
|
|
|
|
|
escaped,
|
2012-02-05 22:52:00 -08:00
|
|
|
|
characterReference,
|
2012-04-24 15:56:59 +02:00
|
|
|
|
updateLastStrPos,
|
2010-07-04 13:43:45 -07:00
|
|
|
|
anyOrderedListMarker,
|
|
|
|
|
orderedListMarker,
|
|
|
|
|
charRef,
|
2010-07-05 23:43:07 -07:00
|
|
|
|
tableWith,
|
|
|
|
|
gridTableWith,
|
2010-07-04 13:43:45 -07:00
|
|
|
|
readWith,
|
|
|
|
|
testStringWith,
|
|
|
|
|
ParserState (..),
|
|
|
|
|
defaultParserState,
|
|
|
|
|
HeaderType (..),
|
|
|
|
|
ParserContext (..),
|
|
|
|
|
QuoteContext (..),
|
|
|
|
|
NoteTable,
|
|
|
|
|
KeyTable,
|
2010-12-05 19:27:00 -08:00
|
|
|
|
Key,
|
|
|
|
|
toKey,
|
|
|
|
|
fromKey,
|
2010-12-07 19:03:08 -08:00
|
|
|
|
lookupKeySrc,
|
2011-01-04 19:12:33 -08:00
|
|
|
|
smartPunctuation,
|
|
|
|
|
macro,
|
2012-07-20 14:41:44 -07:00
|
|
|
|
applyMacros',
|
|
|
|
|
-- * Re-exports from Text.Pandoc.Parsec
|
2012-07-20 15:54:57 -07:00
|
|
|
|
Parser,
|
2012-07-20 14:41:44 -07:00
|
|
|
|
runParser,
|
|
|
|
|
parse,
|
|
|
|
|
anyToken,
|
|
|
|
|
getInput,
|
|
|
|
|
setInput,
|
|
|
|
|
unexpected,
|
|
|
|
|
char,
|
|
|
|
|
letter,
|
|
|
|
|
digit,
|
|
|
|
|
alphaNum,
|
|
|
|
|
skipMany,
|
|
|
|
|
skipMany1,
|
|
|
|
|
spaces,
|
|
|
|
|
space,
|
|
|
|
|
anyChar,
|
|
|
|
|
satisfy,
|
|
|
|
|
newline,
|
|
|
|
|
string,
|
|
|
|
|
count,
|
|
|
|
|
eof,
|
|
|
|
|
noneOf,
|
|
|
|
|
oneOf,
|
|
|
|
|
lookAhead,
|
|
|
|
|
notFollowedBy,
|
|
|
|
|
many,
|
|
|
|
|
many1,
|
|
|
|
|
manyTill,
|
|
|
|
|
(<|>),
|
|
|
|
|
(<?>),
|
|
|
|
|
choice,
|
|
|
|
|
try,
|
|
|
|
|
sepBy,
|
2012-07-22 22:09:15 -07:00
|
|
|
|
sepBy1,
|
2012-07-20 14:41:44 -07:00
|
|
|
|
sepEndBy,
|
2012-07-22 22:09:15 -07:00
|
|
|
|
sepEndBy1,
|
|
|
|
|
endBy,
|
2012-07-20 14:41:44 -07:00
|
|
|
|
endBy1,
|
|
|
|
|
option,
|
|
|
|
|
optional,
|
|
|
|
|
optionMaybe,
|
|
|
|
|
getState,
|
|
|
|
|
setState,
|
|
|
|
|
updateState,
|
|
|
|
|
getPosition,
|
|
|
|
|
setPosition,
|
|
|
|
|
sourceColumn,
|
|
|
|
|
sourceLine,
|
|
|
|
|
newPos,
|
|
|
|
|
token
|
|
|
|
|
)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
where
|
|
|
|
|
|
|
|
|
|
import Text.Pandoc.Definition
|
2010-12-24 13:39:27 -08:00
|
|
|
|
import Text.Pandoc.Generic
|
2010-07-04 13:43:45 -07:00
|
|
|
|
import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn)
|
2012-07-20 14:19:06 -07:00
|
|
|
|
import Text.Parsec
|
2012-07-20 14:41:44 -07:00
|
|
|
|
import Text.Parsec.Pos (newPos)
|
2011-03-18 11:27:42 -07:00
|
|
|
|
import Data.Char ( toLower, toUpper, ord, isAscii, isAlphaNum, isDigit, isPunctuation )
|
2010-07-05 14:34:48 -07:00
|
|
|
|
import Data.List ( intercalate, transpose )
|
2010-07-04 13:43:45 -07:00
|
|
|
|
import Network.URI ( parseURI, URI (..), isAllowedInURI )
|
2012-07-20 14:19:06 -07:00
|
|
|
|
import Control.Monad ( join, liftM, guard, mzero )
|
2010-07-05 14:34:48 -07:00
|
|
|
|
import Text.Pandoc.Shared
|
2010-07-04 13:43:45 -07:00
|
|
|
|
import qualified Data.Map as M
|
2011-01-05 14:42:47 -08:00
|
|
|
|
import Text.TeXMath.Macros (applyMacros, Macro, parseMacroDefinitions)
|
2012-02-05 22:52:00 -08:00
|
|
|
|
import Text.HTML.TagSoup.Entity ( lookupEntity )
|
2012-07-19 12:38:54 -07:00
|
|
|
|
import Data.Default
|
2010-07-04 13:43:45 -07:00
|
|
|
|
|
2012-07-20 15:54:57 -07:00
|
|
|
|
type Parser t s = Parsec t s
|
|
|
|
|
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-- | Like >>, but returns the operation on the left.
|
|
|
|
|
-- (Suggested by Tillmann Rendel on Haskell-cafe list.)
|
|
|
|
|
(>>~) :: (Monad m) => m a -> m b -> m a
|
|
|
|
|
a >>~ b = a >>= \x -> b >> return x
|
|
|
|
|
|
|
|
|
|
-- | Parse any line of text
|
2012-07-20 14:19:06 -07:00
|
|
|
|
anyLine :: Parsec [Char] st [Char]
|
2010-07-04 13:43:45 -07:00
|
|
|
|
anyLine = manyTill anyChar newline
|
|
|
|
|
|
|
|
|
|
-- | Like @manyTill@, but reads at least one item.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
many1Till :: Parsec [tok] st a
|
|
|
|
|
-> Parsec [tok] st end
|
|
|
|
|
-> Parsec [tok] st [a]
|
2010-07-04 13:43:45 -07:00
|
|
|
|
many1Till p end = do
|
|
|
|
|
first <- p
|
|
|
|
|
rest <- manyTill p end
|
|
|
|
|
return (first:rest)
|
|
|
|
|
|
2011-04-29 11:34:36 -07:00
|
|
|
|
-- | A more general form of @notFollowedBy@. This one allows any
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-- type of parser to be specified, and succeeds only if that parser fails.
|
|
|
|
|
-- It does not consume any input.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
notFollowedBy' :: Show b => Parsec [a] st b -> Parsec [a] st ()
|
2010-07-04 13:43:45 -07:00
|
|
|
|
notFollowedBy' p = try $ join $ do a <- try p
|
|
|
|
|
return (unexpected (show a))
|
|
|
|
|
<|>
|
|
|
|
|
return (return ())
|
|
|
|
|
-- (This version due to Andrew Pimlott on the Haskell mailing list.)
|
|
|
|
|
|
2011-04-29 11:34:36 -07:00
|
|
|
|
-- | Parses one of a list of strings (tried in order).
|
2012-07-20 14:19:06 -07:00
|
|
|
|
oneOfStrings :: [String] -> Parsec [Char] st String
|
2010-07-04 13:43:45 -07:00
|
|
|
|
oneOfStrings listOfStrings = choice $ map (try . string) listOfStrings
|
|
|
|
|
|
|
|
|
|
-- | Parses a space or tab.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
spaceChar :: Parsec [Char] st Char
|
2011-01-19 14:45:15 -08:00
|
|
|
|
spaceChar = satisfy $ \c -> c == ' ' || c == '\t'
|
2010-07-04 13:43:45 -07:00
|
|
|
|
|
2011-07-30 18:08:02 -07:00
|
|
|
|
-- | Parses a nonspace, nonnewline character.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
nonspaceChar :: Parsec [Char] st Char
|
2011-07-30 18:08:02 -07:00
|
|
|
|
nonspaceChar = satisfy $ \x -> x /= '\t' && x /= '\n' && x /= ' ' && x /= '\r'
|
|
|
|
|
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-- | Skips zero or more spaces or tabs.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
skipSpaces :: Parsec [Char] st ()
|
2010-07-04 13:43:45 -07:00
|
|
|
|
skipSpaces = skipMany spaceChar
|
|
|
|
|
|
|
|
|
|
-- | Skips zero or more spaces or tabs, then reads a newline.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
blankline :: Parsec [Char] st Char
|
2010-07-04 13:43:45 -07:00
|
|
|
|
blankline = try $ skipSpaces >> newline
|
|
|
|
|
|
|
|
|
|
-- | Parses one or more blank lines and returns a string of newlines.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
blanklines :: Parsec [Char] st [Char]
|
2010-07-04 13:43:45 -07:00
|
|
|
|
blanklines = many1 blankline
|
|
|
|
|
|
|
|
|
|
-- | Parses material enclosed between start and end parsers.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
enclosed :: Parsec [Char] st t -- ^ start parser
|
|
|
|
|
-> Parsec [Char] st end -- ^ end parser
|
|
|
|
|
-> Parsec [Char] st a -- ^ content parser (to be used repeatedly)
|
|
|
|
|
-> Parsec [Char] st [a]
|
2011-04-29 11:34:36 -07:00
|
|
|
|
enclosed start end parser = try $
|
2010-07-04 13:43:45 -07:00
|
|
|
|
start >> notFollowedBy space >> many1Till parser end
|
|
|
|
|
|
|
|
|
|
-- | Parse string, case insensitive.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
stringAnyCase :: [Char] -> Parsec [Char] st String
|
2010-07-04 13:43:45 -07:00
|
|
|
|
stringAnyCase [] = string ""
|
|
|
|
|
stringAnyCase (x:xs) = do
|
|
|
|
|
firstChar <- char (toUpper x) <|> char (toLower x)
|
|
|
|
|
rest <- stringAnyCase xs
|
|
|
|
|
return (firstChar:rest)
|
|
|
|
|
|
|
|
|
|
-- | Parse contents of 'str' using 'parser' and return result.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
parseFromString :: Parsec [tok] st a -> [tok] -> Parsec [tok] st a
|
2010-07-04 13:43:45 -07:00
|
|
|
|
parseFromString parser str = do
|
|
|
|
|
oldPos <- getPosition
|
|
|
|
|
oldInput <- getInput
|
|
|
|
|
setInput str
|
|
|
|
|
result <- parser
|
|
|
|
|
setInput oldInput
|
|
|
|
|
setPosition oldPos
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
-- | Parse raw line block up to and including blank lines.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
lineClump :: Parsec [Char] st String
|
2011-04-29 11:34:36 -07:00
|
|
|
|
lineClump = blanklines
|
2010-07-04 13:43:45 -07:00
|
|
|
|
<|> (many1 (notFollowedBy blankline >> anyLine) >>= return . unlines)
|
|
|
|
|
|
|
|
|
|
-- | Parse a string of characters between an open character
|
|
|
|
|
-- and a close character, including text between balanced
|
|
|
|
|
-- pairs of open and close, which must be different. For example,
|
2011-12-05 20:54:46 -08:00
|
|
|
|
-- @charsInBalanced '(' ')' anyChar@ will parse "(hello (there))"
|
|
|
|
|
-- and return "hello (there)".
|
2012-07-20 14:19:06 -07:00
|
|
|
|
charsInBalanced :: Char -> Char -> Parsec [Char] st Char
|
|
|
|
|
-> Parsec [Char] st String
|
2011-12-05 20:54:46 -08:00
|
|
|
|
charsInBalanced open close parser = try $ do
|
2010-07-04 13:43:45 -07:00
|
|
|
|
char open
|
2011-12-05 20:54:46 -08:00
|
|
|
|
let isDelim c = c == open || c == close
|
|
|
|
|
raw <- many $ many1 (notFollowedBy (satisfy isDelim) >> parser)
|
|
|
|
|
<|> (do res <- charsInBalanced open close parser
|
|
|
|
|
return $ [open] ++ res ++ [close])
|
2010-07-04 13:43:45 -07:00
|
|
|
|
char close
|
|
|
|
|
return $ concat raw
|
|
|
|
|
|
2011-12-05 20:54:46 -08:00
|
|
|
|
-- old charsInBalanced would be:
|
|
|
|
|
-- charsInBalanced open close (noneOf "\n" <|> char '\n' >> notFollowedBy blankline)
|
|
|
|
|
-- old charsInBalanced' would be:
|
|
|
|
|
-- charsInBalanced open close anyChar
|
2010-07-04 13:43:45 -07:00
|
|
|
|
|
|
|
|
|
-- Auxiliary functions for romanNumeral:
|
|
|
|
|
|
|
|
|
|
lowercaseRomanDigits :: [Char]
|
|
|
|
|
lowercaseRomanDigits = ['i','v','x','l','c','d','m']
|
|
|
|
|
|
|
|
|
|
uppercaseRomanDigits :: [Char]
|
|
|
|
|
uppercaseRomanDigits = map toUpper lowercaseRomanDigits
|
|
|
|
|
|
|
|
|
|
-- | Parses a roman numeral (uppercase or lowercase), returns number.
|
|
|
|
|
romanNumeral :: Bool -- ^ Uppercase if true
|
2012-07-20 14:19:06 -07:00
|
|
|
|
-> Parsec [Char] st Int
|
2010-07-04 13:43:45 -07:00
|
|
|
|
romanNumeral upperCase = do
|
2011-04-29 11:34:36 -07:00
|
|
|
|
let romanDigits = if upperCase
|
|
|
|
|
then uppercaseRomanDigits
|
2010-07-04 13:43:45 -07:00
|
|
|
|
else lowercaseRomanDigits
|
2011-01-19 14:59:59 -08:00
|
|
|
|
lookAhead $ oneOf romanDigits
|
2011-04-29 11:34:36 -07:00
|
|
|
|
let [one, five, ten, fifty, hundred, fivehundred, thousand] =
|
2010-07-04 13:43:45 -07:00
|
|
|
|
map char romanDigits
|
|
|
|
|
thousands <- many thousand >>= (return . (1000 *) . length)
|
|
|
|
|
ninehundreds <- option 0 $ try $ hundred >> thousand >> return 900
|
|
|
|
|
fivehundreds <- many fivehundred >>= (return . (500 *) . length)
|
|
|
|
|
fourhundreds <- option 0 $ try $ hundred >> fivehundred >> return 400
|
|
|
|
|
hundreds <- many hundred >>= (return . (100 *) . length)
|
|
|
|
|
nineties <- option 0 $ try $ ten >> hundred >> return 90
|
|
|
|
|
fifties <- many fifty >>= (return . (50 *) . length)
|
|
|
|
|
forties <- option 0 $ try $ ten >> fifty >> return 40
|
|
|
|
|
tens <- many ten >>= (return . (10 *) . length)
|
|
|
|
|
nines <- option 0 $ try $ one >> ten >> return 9
|
|
|
|
|
fives <- many five >>= (return . (5 *) . length)
|
|
|
|
|
fours <- option 0 $ try $ one >> five >> return 4
|
|
|
|
|
ones <- many one >>= (return . length)
|
|
|
|
|
let total = thousands + ninehundreds + fivehundreds + fourhundreds +
|
|
|
|
|
hundreds + nineties + fifties + forties + tens + nines +
|
|
|
|
|
fives + fours + ones
|
|
|
|
|
if total == 0
|
|
|
|
|
then fail "not a roman numeral"
|
|
|
|
|
else return total
|
|
|
|
|
|
|
|
|
|
-- Parsers for email addresses and URIs
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
emailChar :: Parsec [Char] st Char
|
2011-01-19 14:59:59 -08:00
|
|
|
|
emailChar = alphaNum <|>
|
|
|
|
|
satisfy (\c -> c == '-' || c == '+' || c == '_' || c == '.')
|
2010-07-04 13:43:45 -07:00
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
domainChar :: Parsec [Char] st Char
|
2010-07-04 13:43:45 -07:00
|
|
|
|
domainChar = alphaNum <|> char '-'
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
domain :: Parsec [Char] st [Char]
|
2010-07-04 13:43:45 -07:00
|
|
|
|
domain = do
|
|
|
|
|
first <- many1 domainChar
|
|
|
|
|
dom <- many1 $ try (char '.' >> many1 domainChar )
|
|
|
|
|
return $ intercalate "." (first:dom)
|
|
|
|
|
|
|
|
|
|
-- | Parses an email address; returns original and corresponding
|
|
|
|
|
-- escaped mailto: URI.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
emailAddress :: Parsec [Char] st (String, String)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
emailAddress = try $ do
|
|
|
|
|
firstLetter <- alphaNum
|
|
|
|
|
restAddr <- many emailChar
|
|
|
|
|
let addr = firstLetter:restAddr
|
|
|
|
|
char '@'
|
|
|
|
|
dom <- domain
|
|
|
|
|
let full = addr ++ '@':dom
|
|
|
|
|
return (full, escapeURI $ "mailto:" ++ full)
|
|
|
|
|
|
|
|
|
|
-- | Parses a URI. Returns pair of original and URI-escaped version.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
uri :: Parsec [Char] st (String, String)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
uri = try $ do
|
|
|
|
|
let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:",
|
|
|
|
|
"news:", "telnet:" ]
|
|
|
|
|
lookAhead $ oneOfStrings protocols
|
2011-03-18 11:27:42 -07:00
|
|
|
|
-- Scan non-ascii characters and ascii characters allowed in a URI.
|
|
|
|
|
-- We allow punctuation except when followed by a space, since
|
|
|
|
|
-- we don't want the trailing '.' in 'http://google.com.'
|
|
|
|
|
let innerPunct = try $ satisfy isPunctuation >>~
|
|
|
|
|
notFollowedBy (newline <|> spaceChar)
|
|
|
|
|
let uriChar = innerPunct <|>
|
|
|
|
|
satisfy (\c -> not (isPunctuation c) &&
|
|
|
|
|
(not (isAscii c) || isAllowedInURI c))
|
|
|
|
|
-- We want to allow
|
|
|
|
|
-- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation)
|
|
|
|
|
-- as a URL, while NOT picking up the closing paren in
|
|
|
|
|
-- (http://wikipedia.org)
|
|
|
|
|
-- So we include balanced parens in the URL.
|
|
|
|
|
let inParens = try $ do char '('
|
|
|
|
|
res <- many uriChar
|
|
|
|
|
char ')'
|
|
|
|
|
return $ '(' : res ++ ")"
|
|
|
|
|
str <- liftM concat $ many1 $ inParens <|> count 1 (innerPunct <|> uriChar)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-- now see if they amount to an absolute URI
|
|
|
|
|
case parseURI (escapeURI str) of
|
|
|
|
|
Just uri' -> if uriScheme uri' `elem` protocols
|
|
|
|
|
then return (str, show uri')
|
|
|
|
|
else fail "not a URI"
|
|
|
|
|
Nothing -> fail "not a URI"
|
|
|
|
|
|
|
|
|
|
-- | Applies a parser, returns tuple of its results and its horizontal
|
|
|
|
|
-- displacement (the difference between the source column at the end
|
|
|
|
|
-- and the source column at the beginning). Vertical displacement
|
|
|
|
|
-- (source row) is ignored.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
withHorizDisplacement :: Parsec [Char] st a -- ^ Parser to apply
|
|
|
|
|
-> Parsec [Char] st (a, Int) -- ^ (result, displacement)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
withHorizDisplacement parser = do
|
|
|
|
|
pos1 <- getPosition
|
|
|
|
|
result <- parser
|
|
|
|
|
pos2 <- getPosition
|
|
|
|
|
return (result, sourceColumn pos2 - sourceColumn pos1)
|
|
|
|
|
|
2012-01-29 23:54:00 -08:00
|
|
|
|
-- | Applies a parser and returns the raw string that was parsed,
|
|
|
|
|
-- along with the value produced by the parser.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
withRaw :: Parsec [Char] st a -> Parsec [Char] st (a, [Char])
|
2012-01-29 23:54:00 -08:00
|
|
|
|
withRaw parser = do
|
|
|
|
|
pos1 <- getPosition
|
|
|
|
|
inp <- getInput
|
|
|
|
|
result <- parser
|
|
|
|
|
pos2 <- getPosition
|
|
|
|
|
let (l1,c1) = (sourceLine pos1, sourceColumn pos1)
|
|
|
|
|
let (l2,c2) = (sourceLine pos2, sourceColumn pos2)
|
|
|
|
|
let inplines = take ((l2 - l1) + 1) $ lines inp
|
|
|
|
|
let raw = case inplines of
|
|
|
|
|
[] -> error "raw: inplines is null" -- shouldn't happen
|
|
|
|
|
[l] -> take (c2 - c1) l
|
|
|
|
|
ls -> unlines (init ls) ++ take (c2 - 1) (last ls)
|
|
|
|
|
return (result, raw)
|
|
|
|
|
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-- | Parses a character and returns 'Null' (so that the parser can move on
|
|
|
|
|
-- if it gets stuck).
|
2012-07-20 14:19:06 -07:00
|
|
|
|
nullBlock :: Parsec [Char] st Block
|
2010-07-04 13:43:45 -07:00
|
|
|
|
nullBlock = anyChar >> return Null
|
|
|
|
|
|
|
|
|
|
-- | Fail if reader is in strict markdown syntax mode.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
failIfStrict :: Parsec [a] ParserState ()
|
2010-07-04 13:43:45 -07:00
|
|
|
|
failIfStrict = do
|
|
|
|
|
state <- getState
|
|
|
|
|
if stateStrict state then fail "strict mode" else return ()
|
|
|
|
|
|
|
|
|
|
-- | Fail unless we're in literate haskell mode.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
failUnlessLHS :: Parsec [tok] ParserState ()
|
2012-01-29 23:54:00 -08:00
|
|
|
|
failUnlessLHS = getState >>= guard . stateLiterateHaskell
|
2010-07-04 13:43:45 -07:00
|
|
|
|
|
|
|
|
|
-- | Parses backslash, then applies character parser.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
escaped :: Parsec [Char] st Char -- ^ Parser for character to escape
|
|
|
|
|
-> Parsec [Char] st Char
|
2011-12-05 20:22:27 -08:00
|
|
|
|
escaped parser = try $ char '\\' >> parser
|
2010-07-04 13:43:45 -07:00
|
|
|
|
|
2012-02-05 22:52:00 -08:00
|
|
|
|
-- | Parse character entity.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
characterReference :: Parsec [Char] st Char
|
2012-02-05 22:52:00 -08:00
|
|
|
|
characterReference = try $ do
|
|
|
|
|
char '&'
|
2012-02-05 23:01:17 -08:00
|
|
|
|
ent <- many1Till nonspaceChar (char ';')
|
2012-02-05 22:52:00 -08:00
|
|
|
|
case lookupEntity ent of
|
|
|
|
|
Just c -> return c
|
2012-02-05 23:01:17 -08:00
|
|
|
|
Nothing -> fail "entity not found"
|
2012-02-05 22:52:00 -08:00
|
|
|
|
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-- | Parses an uppercase roman numeral and returns (UpperRoman, number).
|
2012-07-20 14:19:06 -07:00
|
|
|
|
upperRoman :: Parsec [Char] st (ListNumberStyle, Int)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
upperRoman = do
|
|
|
|
|
num <- romanNumeral True
|
|
|
|
|
return (UpperRoman, num)
|
|
|
|
|
|
|
|
|
|
-- | Parses a lowercase roman numeral and returns (LowerRoman, number).
|
2012-07-20 14:19:06 -07:00
|
|
|
|
lowerRoman :: Parsec [Char] st (ListNumberStyle, Int)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
lowerRoman = do
|
|
|
|
|
num <- romanNumeral False
|
|
|
|
|
return (LowerRoman, num)
|
|
|
|
|
|
|
|
|
|
-- | Parses a decimal numeral and returns (Decimal, number).
|
2012-07-20 14:19:06 -07:00
|
|
|
|
decimal :: Parsec [Char] st (ListNumberStyle, Int)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
decimal = do
|
|
|
|
|
num <- many1 digit
|
|
|
|
|
return (Decimal, read num)
|
|
|
|
|
|
2010-07-11 22:47:52 -07:00
|
|
|
|
-- | Parses a '@' and optional label and
|
|
|
|
|
-- returns (DefaultStyle, [next example number]). The next
|
|
|
|
|
-- example number is incremented in parser state, and the label
|
|
|
|
|
-- (if present) is added to the label table.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
exampleNum :: Parsec [Char] ParserState (ListNumberStyle, Int)
|
2010-07-11 22:47:52 -07:00
|
|
|
|
exampleNum = do
|
|
|
|
|
char '@'
|
2011-01-19 14:59:59 -08:00
|
|
|
|
lab <- many (alphaNum <|> satisfy (\c -> c == '_' || c == '-'))
|
2010-07-11 22:47:52 -07:00
|
|
|
|
st <- getState
|
|
|
|
|
let num = stateNextExample st
|
|
|
|
|
let newlabels = if null lab
|
|
|
|
|
then stateExamples st
|
|
|
|
|
else M.insert lab num $ stateExamples st
|
|
|
|
|
updateState $ \s -> s{ stateNextExample = num + 1
|
|
|
|
|
, stateExamples = newlabels }
|
|
|
|
|
return (Example, num)
|
|
|
|
|
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-- | Parses a '#' returns (DefaultStyle, 1).
|
2012-07-20 14:19:06 -07:00
|
|
|
|
defaultNum :: Parsec [Char] st (ListNumberStyle, Int)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
defaultNum = do
|
|
|
|
|
char '#'
|
|
|
|
|
return (DefaultStyle, 1)
|
|
|
|
|
|
|
|
|
|
-- | Parses a lowercase letter and returns (LowerAlpha, number).
|
2012-07-20 14:19:06 -07:00
|
|
|
|
lowerAlpha :: Parsec [Char] st (ListNumberStyle, Int)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
lowerAlpha = do
|
|
|
|
|
ch <- oneOf ['a'..'z']
|
|
|
|
|
return (LowerAlpha, ord ch - ord 'a' + 1)
|
|
|
|
|
|
|
|
|
|
-- | Parses an uppercase letter and returns (UpperAlpha, number).
|
2012-07-20 14:19:06 -07:00
|
|
|
|
upperAlpha :: Parsec [Char] st (ListNumberStyle, Int)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
upperAlpha = do
|
|
|
|
|
ch <- oneOf ['A'..'Z']
|
|
|
|
|
return (UpperAlpha, ord ch - ord 'A' + 1)
|
|
|
|
|
|
|
|
|
|
-- | Parses a roman numeral i or I
|
2012-07-20 14:19:06 -07:00
|
|
|
|
romanOne :: Parsec [Char] st (ListNumberStyle, Int)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
romanOne = (char 'i' >> return (LowerRoman, 1)) <|>
|
|
|
|
|
(char 'I' >> return (UpperRoman, 1))
|
|
|
|
|
|
|
|
|
|
-- | Parses an ordered list marker and returns list attributes.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
anyOrderedListMarker :: Parsec [Char] ParserState ListAttributes
|
2011-04-29 11:34:36 -07:00
|
|
|
|
anyOrderedListMarker = choice $
|
2010-07-04 13:43:45 -07:00
|
|
|
|
[delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens],
|
2010-07-11 22:47:52 -07:00
|
|
|
|
numParser <- [decimal, exampleNum, defaultNum, romanOne,
|
2010-07-04 13:43:45 -07:00
|
|
|
|
lowerAlpha, lowerRoman, upperAlpha, upperRoman]]
|
|
|
|
|
|
|
|
|
|
-- | Parses a list number (num) followed by a period, returns list attributes.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
inPeriod :: Parsec [Char] st (ListNumberStyle, Int)
|
|
|
|
|
-> Parsec [Char] st ListAttributes
|
2010-07-04 13:43:45 -07:00
|
|
|
|
inPeriod num = try $ do
|
|
|
|
|
(style, start) <- num
|
|
|
|
|
char '.'
|
|
|
|
|
let delim = if style == DefaultStyle
|
|
|
|
|
then DefaultDelim
|
|
|
|
|
else Period
|
|
|
|
|
return (start, style, delim)
|
2011-04-29 11:34:36 -07:00
|
|
|
|
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-- | Parses a list number (num) followed by a paren, returns list attributes.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
inOneParen :: Parsec [Char] st (ListNumberStyle, Int)
|
|
|
|
|
-> Parsec [Char] st ListAttributes
|
2010-07-04 13:43:45 -07:00
|
|
|
|
inOneParen num = try $ do
|
|
|
|
|
(style, start) <- num
|
|
|
|
|
char ')'
|
|
|
|
|
return (start, style, OneParen)
|
|
|
|
|
|
|
|
|
|
-- | Parses a list number (num) enclosed in parens, returns list attributes.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
inTwoParens :: Parsec [Char] st (ListNumberStyle, Int)
|
|
|
|
|
-> Parsec [Char] st ListAttributes
|
2010-07-04 13:43:45 -07:00
|
|
|
|
inTwoParens num = try $ do
|
|
|
|
|
char '('
|
|
|
|
|
(style, start) <- num
|
|
|
|
|
char ')'
|
|
|
|
|
return (start, style, TwoParens)
|
|
|
|
|
|
|
|
|
|
-- | Parses an ordered list marker with a given style and delimiter,
|
|
|
|
|
-- returns number.
|
2011-04-29 11:34:36 -07:00
|
|
|
|
orderedListMarker :: ListNumberStyle
|
|
|
|
|
-> ListNumberDelim
|
2012-07-20 14:19:06 -07:00
|
|
|
|
-> Parsec [Char] ParserState Int
|
2010-07-04 13:43:45 -07:00
|
|
|
|
orderedListMarker style delim = do
|
|
|
|
|
let num = defaultNum <|> -- # can continue any kind of list
|
|
|
|
|
case style of
|
|
|
|
|
DefaultStyle -> decimal
|
2010-07-11 22:47:52 -07:00
|
|
|
|
Example -> exampleNum
|
2010-07-04 13:43:45 -07:00
|
|
|
|
Decimal -> decimal
|
|
|
|
|
UpperRoman -> upperRoman
|
|
|
|
|
LowerRoman -> lowerRoman
|
|
|
|
|
UpperAlpha -> upperAlpha
|
|
|
|
|
LowerAlpha -> lowerAlpha
|
|
|
|
|
let context = case delim of
|
|
|
|
|
DefaultDelim -> inPeriod
|
|
|
|
|
Period -> inPeriod
|
|
|
|
|
OneParen -> inOneParen
|
|
|
|
|
TwoParens -> inTwoParens
|
|
|
|
|
(start, _, _) <- context num
|
|
|
|
|
return start
|
|
|
|
|
|
|
|
|
|
-- | Parses a character reference and returns a Str element.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
charRef :: Parsec [Char] st Inline
|
2010-07-04 13:43:45 -07:00
|
|
|
|
charRef = do
|
|
|
|
|
c <- characterReference
|
|
|
|
|
return $ Str [c]
|
|
|
|
|
|
2010-07-05 23:43:07 -07:00
|
|
|
|
-- | Parse a table using 'headerParser', 'rowParser',
|
|
|
|
|
-- 'lineParser', and 'footerParser'.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
tableWith :: Parsec [Char] ParserState ([[Block]], [Alignment], [Int])
|
|
|
|
|
-> ([Int] -> Parsec [Char] ParserState [[Block]])
|
|
|
|
|
-> Parsec [Char] ParserState sep
|
|
|
|
|
-> Parsec [Char] ParserState end
|
|
|
|
|
-> Parsec [Char] ParserState Block
|
2012-07-24 09:06:13 -07:00
|
|
|
|
tableWith headerParser rowParser lineParser footerParser = try $ do
|
2010-07-05 23:43:07 -07:00
|
|
|
|
(heads, aligns, indices) <- headerParser
|
2012-07-22 22:09:15 -07:00
|
|
|
|
lines' <- rowParser indices `sepEndBy1` lineParser
|
2010-07-05 23:43:07 -07:00
|
|
|
|
footerParser
|
|
|
|
|
state <- getState
|
|
|
|
|
let numColumns = stateColumns state
|
2012-02-21 22:00:10 +01:00
|
|
|
|
let widths = if (indices == [])
|
|
|
|
|
then replicate (length aligns) 0.0
|
|
|
|
|
else widthsFromIndices numColumns indices
|
2012-07-24 09:06:13 -07:00
|
|
|
|
return $ Table [] aligns widths heads lines'
|
2010-07-05 23:43:07 -07:00
|
|
|
|
|
|
|
|
|
-- Calculate relative widths of table columns, based on indices
|
|
|
|
|
widthsFromIndices :: Int -- Number of columns on terminal
|
|
|
|
|
-> [Int] -- Indices
|
|
|
|
|
-> [Double] -- Fractional relative sizes of columns
|
2011-04-29 11:34:36 -07:00
|
|
|
|
widthsFromIndices _ [] = []
|
|
|
|
|
widthsFromIndices numColumns' indices =
|
2010-12-12 20:09:14 -08:00
|
|
|
|
let numColumns = max numColumns' (if null indices then 0 else last indices)
|
|
|
|
|
lengths' = zipWith (-) indices (0:indices)
|
2010-07-05 23:43:07 -07:00
|
|
|
|
lengths = reverse $
|
|
|
|
|
case reverse lengths' of
|
|
|
|
|
[] -> []
|
|
|
|
|
[x] -> [x]
|
|
|
|
|
-- compensate for the fact that intercolumn
|
|
|
|
|
-- spaces are counted in widths of all columns
|
|
|
|
|
-- but the last...
|
|
|
|
|
(x:y:zs) -> if x < y && y - x <= 2
|
|
|
|
|
then y:y:zs
|
|
|
|
|
else x:y:zs
|
|
|
|
|
totLength = sum lengths
|
|
|
|
|
quotient = if totLength > numColumns
|
|
|
|
|
then fromIntegral totLength
|
|
|
|
|
else fromIntegral numColumns
|
|
|
|
|
fracs = map (\l -> (fromIntegral l) / quotient) lengths in
|
|
|
|
|
tail fracs
|
|
|
|
|
|
2012-02-21 22:00:10 +01:00
|
|
|
|
---
|
|
|
|
|
|
2010-07-05 23:43:07 -07:00
|
|
|
|
-- Parse a grid table: starts with row of '-' on top, then header
|
|
|
|
|
-- (which may be grid), then the rows,
|
|
|
|
|
-- which may be grid, separated by blank lines, and
|
|
|
|
|
-- ending with a footer (dashed line followed by blank line).
|
2012-07-20 14:19:06 -07:00
|
|
|
|
gridTableWith :: Parsec [Char] ParserState Block -- ^ Block parser
|
2010-07-05 23:43:07 -07:00
|
|
|
|
-> Bool -- ^ Headerless table
|
2012-07-20 14:19:06 -07:00
|
|
|
|
-> Parsec [Char] ParserState Block
|
2012-07-24 09:06:13 -07:00
|
|
|
|
gridTableWith block headless =
|
|
|
|
|
tableWith (gridTableHeader headless block) (gridTableRow block) (gridTableSep '-') gridTableFooter
|
2010-07-05 14:34:48 -07:00
|
|
|
|
|
|
|
|
|
gridTableSplitLine :: [Int] -> String -> [String]
|
2011-01-14 14:16:27 -08:00
|
|
|
|
gridTableSplitLine indices line = map removeFinalBar $ tail $
|
2012-01-27 00:39:00 -08:00
|
|
|
|
splitStringByIndices (init indices) $ removeTrailingSpace line
|
2010-07-05 14:34:48 -07:00
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
gridPart :: Char -> Parsec [Char] st (Int, Int)
|
2010-07-05 14:34:48 -07:00
|
|
|
|
gridPart ch = do
|
|
|
|
|
dashes <- many1 (char ch)
|
|
|
|
|
char '+'
|
|
|
|
|
return (length dashes, length dashes + 1)
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
gridDashedLines :: Char -> Parsec [Char] st [(Int,Int)]
|
2010-07-05 14:34:48 -07:00
|
|
|
|
gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) >>~ blankline
|
|
|
|
|
|
|
|
|
|
removeFinalBar :: String -> String
|
2011-01-14 14:16:27 -08:00
|
|
|
|
removeFinalBar =
|
|
|
|
|
reverse . dropWhile (`elem` " \t") . dropWhile (=='|') . reverse
|
2010-07-05 14:34:48 -07:00
|
|
|
|
|
|
|
|
|
-- | Separator between rows of grid table.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
gridTableSep :: Char -> Parsec [Char] ParserState Char
|
2010-07-05 14:34:48 -07:00
|
|
|
|
gridTableSep ch = try $ gridDashedLines ch >> return '\n'
|
|
|
|
|
|
|
|
|
|
-- | Parse header for a grid table.
|
|
|
|
|
gridTableHeader :: Bool -- ^ Headerless table
|
2012-07-20 14:19:06 -07:00
|
|
|
|
-> Parsec [Char] ParserState Block
|
|
|
|
|
-> Parsec [Char] ParserState ([[Block]], [Alignment], [Int])
|
2010-07-05 23:43:07 -07:00
|
|
|
|
gridTableHeader headless block = try $ do
|
2010-07-05 14:34:48 -07:00
|
|
|
|
optional blanklines
|
|
|
|
|
dashes <- gridDashedLines '-'
|
|
|
|
|
rawContent <- if headless
|
2011-04-29 11:34:36 -07:00
|
|
|
|
then return $ repeat ""
|
2010-07-05 14:34:48 -07:00
|
|
|
|
else many1
|
2010-07-05 20:41:42 -07:00
|
|
|
|
(notFollowedBy (gridTableSep '=') >> char '|' >>
|
|
|
|
|
many1Till anyChar newline)
|
2010-07-05 14:34:48 -07:00
|
|
|
|
if headless
|
|
|
|
|
then return ()
|
|
|
|
|
else gridTableSep '=' >> return ()
|
|
|
|
|
let lines' = map snd dashes
|
|
|
|
|
let indices = scanl (+) 0 lines'
|
2010-07-05 20:41:42 -07:00
|
|
|
|
let aligns = replicate (length lines') AlignDefault
|
|
|
|
|
-- RST does not have a notion of alignments
|
2010-07-05 14:34:48 -07:00
|
|
|
|
let rawHeads = if headless
|
|
|
|
|
then replicate (length dashes) ""
|
|
|
|
|
else map (intercalate " ") $ transpose
|
|
|
|
|
$ map (gridTableSplitLine indices) rawContent
|
2010-07-05 23:43:07 -07:00
|
|
|
|
heads <- mapM (parseFromString $ many block) $
|
|
|
|
|
map removeLeadingTrailingSpace rawHeads
|
|
|
|
|
return (heads, aligns, indices)
|
2010-07-05 14:34:48 -07:00
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
gridTableRawLine :: [Int] -> Parsec [Char] ParserState [String]
|
2010-07-05 14:34:48 -07:00
|
|
|
|
gridTableRawLine indices = do
|
|
|
|
|
char '|'
|
|
|
|
|
line <- many1Till anyChar newline
|
2011-01-14 14:16:27 -08:00
|
|
|
|
return (gridTableSplitLine indices line)
|
2010-07-05 14:34:48 -07:00
|
|
|
|
|
|
|
|
|
-- | Parse row of grid table.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
gridTableRow :: Parsec [Char] ParserState Block
|
2010-07-05 14:34:48 -07:00
|
|
|
|
-> [Int]
|
2012-07-20 14:19:06 -07:00
|
|
|
|
-> Parsec [Char] ParserState [[Block]]
|
2010-07-05 14:34:48 -07:00
|
|
|
|
gridTableRow block indices = do
|
|
|
|
|
colLines <- many1 (gridTableRawLine indices)
|
|
|
|
|
let cols = map ((++ "\n") . unlines . removeOneLeadingSpace) $
|
|
|
|
|
transpose colLines
|
|
|
|
|
mapM (liftM compactifyCell . parseFromString (many block)) cols
|
|
|
|
|
|
|
|
|
|
removeOneLeadingSpace :: [String] -> [String]
|
|
|
|
|
removeOneLeadingSpace xs =
|
|
|
|
|
if all startsWithSpace xs
|
|
|
|
|
then map (drop 1) xs
|
|
|
|
|
else xs
|
|
|
|
|
where startsWithSpace "" = True
|
|
|
|
|
startsWithSpace (y:_) = y == ' '
|
|
|
|
|
|
|
|
|
|
compactifyCell :: [Block] -> [Block]
|
|
|
|
|
compactifyCell bs = head $ compactify [bs]
|
|
|
|
|
|
|
|
|
|
-- | Parse footer for a grid table.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
gridTableFooter :: Parsec [Char] ParserState [Char]
|
2010-07-05 14:34:48 -07:00
|
|
|
|
gridTableFooter = blanklines
|
|
|
|
|
|
|
|
|
|
---
|
|
|
|
|
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-- | Parse a string with a given parser and state.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
readWith :: Parsec [t] ParserState a -- ^ parser
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-> ParserState -- ^ initial state
|
2011-04-29 11:34:36 -07:00
|
|
|
|
-> [t] -- ^ input
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-> a
|
2011-04-29 11:34:36 -07:00
|
|
|
|
readWith parser state input =
|
2010-07-04 13:43:45 -07:00
|
|
|
|
case runParser parser state "source" input of
|
2012-01-29 23:54:00 -08:00
|
|
|
|
Left err' -> error $ "\nError:\n" ++ show err'
|
2010-07-04 13:43:45 -07:00
|
|
|
|
Right result -> result
|
|
|
|
|
|
|
|
|
|
-- | Parse a string with @parser@ (for testing).
|
2012-07-20 14:19:06 -07:00
|
|
|
|
testStringWith :: (Show a) => Parsec [Char] ParserState a
|
2010-07-04 13:43:45 -07:00
|
|
|
|
-> String
|
|
|
|
|
-> IO ()
|
|
|
|
|
testStringWith parser str = UTF8.putStrLn $ show $
|
|
|
|
|
readWith parser defaultParserState str
|
|
|
|
|
|
|
|
|
|
-- | Parsing options.
|
|
|
|
|
data ParserState = ParserState
|
|
|
|
|
{ stateParseRaw :: Bool, -- ^ Parse raw HTML and LaTeX?
|
|
|
|
|
stateParserContext :: ParserContext, -- ^ Inside list?
|
|
|
|
|
stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment?
|
2012-02-07 21:50:55 -08:00
|
|
|
|
stateMaxNestingLevel :: Int, -- ^ Max # of nested Strong/Emph
|
2011-12-29 23:44:12 -08:00
|
|
|
|
stateLastStrPos :: Maybe SourcePos, -- ^ Position after last str parsed
|
2010-07-04 13:43:45 -07:00
|
|
|
|
stateKeys :: KeyTable, -- ^ List of reference keys
|
|
|
|
|
stateCitations :: [String], -- ^ List of available citations
|
|
|
|
|
stateNotes :: NoteTable, -- ^ List of notes
|
|
|
|
|
stateTabStop :: Int, -- ^ Tab stop
|
|
|
|
|
stateStandalone :: Bool, -- ^ Parse bibliographic info?
|
|
|
|
|
stateTitle :: [Inline], -- ^ Title of document
|
|
|
|
|
stateAuthors :: [[Inline]], -- ^ Authors of document
|
|
|
|
|
stateDate :: [Inline], -- ^ Date of document
|
|
|
|
|
stateStrict :: Bool, -- ^ Use strict markdown syntax?
|
|
|
|
|
stateSmart :: Bool, -- ^ Use smart typography?
|
2012-01-01 13:48:28 -08:00
|
|
|
|
stateOldDashes :: Bool, -- ^ Use pandoc <= 1.8.2.1 behavior
|
|
|
|
|
-- in parsing dashes; -- is em-dash;
|
|
|
|
|
-- before numeral is en-dash
|
2010-07-04 13:43:45 -07:00
|
|
|
|
stateLiterateHaskell :: Bool, -- ^ Treat input as literate haskell
|
|
|
|
|
stateColumns :: Int, -- ^ Number of columns in terminal
|
|
|
|
|
stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used
|
2010-07-11 22:47:52 -07:00
|
|
|
|
stateIndentedCodeClasses :: [String], -- ^ Classes to use for indented code blocks
|
|
|
|
|
stateNextExample :: Int, -- ^ Number of next example
|
2011-04-29 11:34:36 -07:00
|
|
|
|
stateExamples :: M.Map String Int, -- ^ Map from example labels to numbers
|
2010-10-26 09:03:03 -07:00
|
|
|
|
stateHasChapters :: Bool, -- ^ True if \chapter encountered
|
|
|
|
|
stateApplyMacros :: Bool, -- ^ Apply LaTeX macros?
|
2012-03-24 21:30:10 -04:00
|
|
|
|
stateMacros :: [Macro], -- ^ List of macros defined so far
|
|
|
|
|
stateRstDefaultRole :: String -- ^ Current rST default interpreted text role
|
2010-07-04 13:43:45 -07:00
|
|
|
|
}
|
|
|
|
|
deriving Show
|
|
|
|
|
|
2012-07-19 12:38:54 -07:00
|
|
|
|
instance Default ParserState where
|
|
|
|
|
def = defaultParserState
|
|
|
|
|
|
2010-07-04 13:43:45 -07:00
|
|
|
|
defaultParserState :: ParserState
|
2011-04-29 11:34:36 -07:00
|
|
|
|
defaultParserState =
|
2010-07-04 13:43:45 -07:00
|
|
|
|
ParserState { stateParseRaw = False,
|
|
|
|
|
stateParserContext = NullState,
|
|
|
|
|
stateQuoteContext = NoQuote,
|
2012-02-07 21:50:55 -08:00
|
|
|
|
stateMaxNestingLevel = 6,
|
2011-12-29 23:44:12 -08:00
|
|
|
|
stateLastStrPos = Nothing,
|
2010-07-04 13:43:45 -07:00
|
|
|
|
stateKeys = M.empty,
|
|
|
|
|
stateCitations = [],
|
|
|
|
|
stateNotes = [],
|
|
|
|
|
stateTabStop = 4,
|
|
|
|
|
stateStandalone = False,
|
|
|
|
|
stateTitle = [],
|
|
|
|
|
stateAuthors = [],
|
|
|
|
|
stateDate = [],
|
|
|
|
|
stateStrict = False,
|
|
|
|
|
stateSmart = False,
|
2012-01-01 13:48:28 -08:00
|
|
|
|
stateOldDashes = False,
|
2010-07-04 13:43:45 -07:00
|
|
|
|
stateLiterateHaskell = False,
|
|
|
|
|
stateColumns = 80,
|
|
|
|
|
stateHeaderTable = [],
|
2010-07-11 22:47:52 -07:00
|
|
|
|
stateIndentedCodeClasses = [],
|
|
|
|
|
stateNextExample = 1,
|
2010-07-13 19:18:58 -07:00
|
|
|
|
stateExamples = M.empty,
|
2010-10-26 09:03:03 -07:00
|
|
|
|
stateHasChapters = False,
|
|
|
|
|
stateApplyMacros = True,
|
2012-03-24 21:30:10 -04:00
|
|
|
|
stateMacros = [],
|
|
|
|
|
stateRstDefaultRole = "title-reference"}
|
2010-07-04 13:43:45 -07:00
|
|
|
|
|
2011-04-29 11:34:36 -07:00
|
|
|
|
data HeaderType
|
2010-07-04 13:43:45 -07:00
|
|
|
|
= SingleHeader Char -- ^ Single line of characters underneath
|
|
|
|
|
| DoubleHeader Char -- ^ Lines of characters above and below
|
|
|
|
|
deriving (Eq, Show)
|
|
|
|
|
|
2011-04-29 11:34:36 -07:00
|
|
|
|
data ParserContext
|
2010-07-04 13:43:45 -07:00
|
|
|
|
= ListItemState -- ^ Used when running parser on list item contents
|
|
|
|
|
| NullState -- ^ Default state
|
|
|
|
|
deriving (Eq, Show)
|
|
|
|
|
|
|
|
|
|
data QuoteContext
|
|
|
|
|
= InSingleQuote -- ^ Used when parsing inside single quotes
|
|
|
|
|
| InDoubleQuote -- ^ Used when parsing inside double quotes
|
|
|
|
|
| NoQuote -- ^ Used when not parsing inside quotes
|
|
|
|
|
deriving (Eq, Show)
|
|
|
|
|
|
|
|
|
|
type NoteTable = [(String, String)]
|
|
|
|
|
|
2010-12-05 19:27:00 -08:00
|
|
|
|
newtype Key = Key [Inline] deriving (Show, Read, Eq, Ord)
|
2010-07-04 13:43:45 -07:00
|
|
|
|
|
2010-12-05 19:27:00 -08:00
|
|
|
|
toKey :: [Inline] -> Key
|
2010-12-24 13:39:27 -08:00
|
|
|
|
toKey = Key . bottomUp lowercase
|
2010-12-05 19:27:00 -08:00
|
|
|
|
where lowercase :: Inline -> Inline
|
2011-01-23 10:55:56 -08:00
|
|
|
|
lowercase (Str xs) = Str (map toLower xs)
|
|
|
|
|
lowercase (Math t xs) = Math t (map toLower xs)
|
2011-01-26 20:44:25 -08:00
|
|
|
|
lowercase (Code attr xs) = Code attr (map toLower xs)
|
2011-01-23 10:55:56 -08:00
|
|
|
|
lowercase (RawInline f xs) = RawInline f (map toLower xs)
|
|
|
|
|
lowercase LineBreak = Space
|
|
|
|
|
lowercase x = x
|
2010-07-04 13:43:45 -07:00
|
|
|
|
|
2010-12-05 19:27:00 -08:00
|
|
|
|
fromKey :: Key -> [Inline]
|
|
|
|
|
fromKey (Key xs) = xs
|
2010-07-04 13:43:45 -07:00
|
|
|
|
|
|
|
|
|
type KeyTable = M.Map Key Target
|
|
|
|
|
|
|
|
|
|
-- | Look up key in key table and return target object.
|
|
|
|
|
lookupKeySrc :: KeyTable -- ^ Key table
|
|
|
|
|
-> Key -- ^ Key
|
|
|
|
|
-> Maybe Target
|
|
|
|
|
lookupKeySrc table key = case M.lookup key table of
|
|
|
|
|
Nothing -> Nothing
|
|
|
|
|
Just src -> Just src
|
|
|
|
|
|
2010-12-07 19:03:08 -08:00
|
|
|
|
-- | Fail unless we're in "smart typography" mode.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
failUnlessSmart :: Parsec [tok] ParserState ()
|
2010-12-07 19:03:08 -08:00
|
|
|
|
failUnlessSmart = getState >>= guard . stateSmart
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
smartPunctuation :: Parsec [Char] ParserState Inline
|
|
|
|
|
-> Parsec [Char] ParserState Inline
|
2010-12-07 19:03:08 -08:00
|
|
|
|
smartPunctuation inlineParser = do
|
|
|
|
|
failUnlessSmart
|
|
|
|
|
choice [ quoted inlineParser, apostrophe, dash, ellipses ]
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
apostrophe :: Parsec [Char] ParserState Inline
|
2011-12-27 11:01:10 -08:00
|
|
|
|
apostrophe = (char '\'' <|> char '\8217') >> return (Str "\x2019")
|
2010-12-07 19:03:08 -08:00
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
quoted :: Parsec [Char] ParserState Inline
|
|
|
|
|
-> Parsec [Char] ParserState Inline
|
2010-12-07 19:03:08 -08:00
|
|
|
|
quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser
|
|
|
|
|
|
|
|
|
|
withQuoteContext :: QuoteContext
|
2012-07-20 14:19:06 -07:00
|
|
|
|
-> (Parsec [Char] ParserState Inline)
|
|
|
|
|
-> Parsec [Char] ParserState Inline
|
2010-12-07 19:03:08 -08:00
|
|
|
|
withQuoteContext context parser = do
|
|
|
|
|
oldState <- getState
|
|
|
|
|
let oldQuoteContext = stateQuoteContext oldState
|
|
|
|
|
setState oldState { stateQuoteContext = context }
|
|
|
|
|
result <- parser
|
|
|
|
|
newState <- getState
|
|
|
|
|
setState newState { stateQuoteContext = oldQuoteContext }
|
|
|
|
|
return result
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
singleQuoted :: Parsec [Char] ParserState Inline
|
|
|
|
|
-> Parsec [Char] ParserState Inline
|
2010-12-07 19:03:08 -08:00
|
|
|
|
singleQuoted inlineParser = try $ do
|
|
|
|
|
singleQuoteStart
|
|
|
|
|
withQuoteContext InSingleQuote $ many1Till inlineParser singleQuoteEnd >>=
|
|
|
|
|
return . Quoted SingleQuote . normalizeSpaces
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
doubleQuoted :: Parsec [Char] ParserState Inline
|
|
|
|
|
-> Parsec [Char] ParserState Inline
|
2010-12-07 19:03:08 -08:00
|
|
|
|
doubleQuoted inlineParser = try $ do
|
|
|
|
|
doubleQuoteStart
|
|
|
|
|
withQuoteContext InDoubleQuote $ do
|
|
|
|
|
contents <- manyTill inlineParser doubleQuoteEnd
|
|
|
|
|
return . Quoted DoubleQuote . normalizeSpaces $ contents
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
failIfInQuoteContext :: QuoteContext -> Parsec [tok] ParserState ()
|
2010-12-07 19:03:08 -08:00
|
|
|
|
failIfInQuoteContext context = do
|
|
|
|
|
st <- getState
|
|
|
|
|
if stateQuoteContext st == context
|
|
|
|
|
then fail "already inside quotes"
|
|
|
|
|
else return ()
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
charOrRef :: [Char] -> Parsec [Char] st Char
|
2010-12-07 20:44:43 -08:00
|
|
|
|
charOrRef cs =
|
|
|
|
|
oneOf cs <|> try (do c <- characterReference
|
|
|
|
|
guard (c `elem` cs)
|
|
|
|
|
return c)
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
updateLastStrPos :: Parsec [Char] ParserState ()
|
2012-04-24 15:56:59 +02:00
|
|
|
|
updateLastStrPos = getPosition >>= \p ->
|
|
|
|
|
updateState $ \s -> s{ stateLastStrPos = Just p }
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
singleQuoteStart :: Parsec [Char] ParserState ()
|
2011-12-29 23:44:12 -08:00
|
|
|
|
singleQuoteStart = do
|
2010-12-07 19:03:08 -08:00
|
|
|
|
failIfInQuoteContext InSingleQuote
|
2011-12-29 23:44:12 -08:00
|
|
|
|
pos <- getPosition
|
|
|
|
|
st <- getState
|
|
|
|
|
-- single quote start can't be right after str
|
|
|
|
|
guard $ stateLastStrPos st /= Just pos
|
2011-07-23 12:35:01 -07:00
|
|
|
|
try $ do charOrRef "'\8216\145"
|
2011-07-25 23:49:45 -07:00
|
|
|
|
notFollowedBy (oneOf ")!],;:-? \t\n")
|
|
|
|
|
notFollowedBy (char '.') <|> lookAhead (string "..." >> return ())
|
2010-12-07 19:03:08 -08:00
|
|
|
|
notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >>
|
2011-04-29 11:34:36 -07:00
|
|
|
|
satisfy (not . isAlphaNum)))
|
2010-12-07 19:03:08 -08:00
|
|
|
|
-- possess/contraction
|
|
|
|
|
return ()
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
singleQuoteEnd :: Parsec [Char] st ()
|
2010-12-07 19:03:08 -08:00
|
|
|
|
singleQuoteEnd = try $ do
|
2011-07-23 12:35:01 -07:00
|
|
|
|
charOrRef "'\8217\146"
|
2010-12-07 19:03:08 -08:00
|
|
|
|
notFollowedBy alphaNum
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
doubleQuoteStart :: Parsec [Char] ParserState ()
|
2010-12-07 19:03:08 -08:00
|
|
|
|
doubleQuoteStart = do
|
|
|
|
|
failIfInQuoteContext InDoubleQuote
|
2011-07-23 12:35:01 -07:00
|
|
|
|
try $ do charOrRef "\"\8220\147"
|
2011-01-19 14:59:59 -08:00
|
|
|
|
notFollowedBy (satisfy (\c -> c == ' ' || c == '\t' || c == '\n'))
|
2010-12-07 19:03:08 -08:00
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
doubleQuoteEnd :: Parsec [Char] st ()
|
2010-12-07 20:44:43 -08:00
|
|
|
|
doubleQuoteEnd = do
|
2011-07-23 12:35:01 -07:00
|
|
|
|
charOrRef "\"\8221\148"
|
2010-12-07 20:44:43 -08:00
|
|
|
|
return ()
|
2010-12-07 19:03:08 -08:00
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
ellipses :: Parsec [Char] st Inline
|
2010-12-07 20:44:43 -08:00
|
|
|
|
ellipses = do
|
2011-12-27 15:45:34 -08:00
|
|
|
|
try (charOrRef "\8230\133") <|> try (string "..." >> return '…')
|
|
|
|
|
return (Str "\8230")
|
2010-12-07 19:03:08 -08:00
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
dash :: Parsec [Char] ParserState Inline
|
2012-01-01 13:48:28 -08:00
|
|
|
|
dash = do
|
|
|
|
|
oldDashes <- stateOldDashes `fmap` getState
|
|
|
|
|
if oldDashes
|
|
|
|
|
then emDashOld <|> enDashOld
|
|
|
|
|
else Str `fmap` (hyphenDash <|> emDash <|> enDash)
|
2010-12-07 19:03:08 -08:00
|
|
|
|
|
2012-01-01 13:48:28 -08:00
|
|
|
|
-- Two hyphens = en-dash, three = em-dash
|
2012-07-20 14:19:06 -07:00
|
|
|
|
hyphenDash :: Parsec [Char] st String
|
2012-01-01 13:48:28 -08:00
|
|
|
|
hyphenDash = do
|
|
|
|
|
try $ string "--"
|
|
|
|
|
option "\8211" (char '-' >> return "\8212")
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
emDash :: Parsec [Char] st String
|
2012-01-01 13:48:28 -08:00
|
|
|
|
emDash = do
|
|
|
|
|
try (charOrRef "\8212\151")
|
|
|
|
|
return "\8212"
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
enDash :: Parsec [Char] st String
|
2010-12-07 20:44:43 -08:00
|
|
|
|
enDash = do
|
2012-01-01 13:48:28 -08:00
|
|
|
|
try (charOrRef "\8212\151")
|
|
|
|
|
return "\8211"
|
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
enDashOld :: Parsec [Char] st Inline
|
2012-01-01 13:48:28 -08:00
|
|
|
|
enDashOld = do
|
2011-12-27 15:45:34 -08:00
|
|
|
|
try (charOrRef "\8211\150") <|>
|
2011-01-19 14:59:59 -08:00
|
|
|
|
try (char '-' >> lookAhead (satisfy isDigit) >> return '–')
|
2011-12-27 15:45:34 -08:00
|
|
|
|
return (Str "\8211")
|
2010-12-07 19:03:08 -08:00
|
|
|
|
|
2012-07-20 14:19:06 -07:00
|
|
|
|
emDashOld :: Parsec [Char] st Inline
|
2012-01-01 13:48:28 -08:00
|
|
|
|
emDashOld = do
|
2011-12-27 15:45:34 -08:00
|
|
|
|
try (charOrRef "\8212\151") <|> (try $ string "--" >> optional (char '-') >> return '-')
|
|
|
|
|
return (Str "\8212")
|
2010-12-07 19:03:08 -08:00
|
|
|
|
|
2011-01-04 19:12:33 -08:00
|
|
|
|
--
|
|
|
|
|
-- Macros
|
|
|
|
|
--
|
|
|
|
|
|
|
|
|
|
-- | Parse a \newcommand or \renewcommand macro definition.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
macro :: Parsec [Char] ParserState Block
|
2011-01-05 14:42:47 -08:00
|
|
|
|
macro = do
|
2012-06-29 18:30:22 -07:00
|
|
|
|
apply <- stateApplyMacros `fmap` getState
|
2011-01-05 14:42:47 -08:00
|
|
|
|
inp <- getInput
|
|
|
|
|
case parseMacroDefinitions inp of
|
2012-07-20 14:19:06 -07:00
|
|
|
|
([], _) -> mzero
|
2012-07-19 12:38:54 -07:00
|
|
|
|
(ms, rest) -> do def' <- count (length inp - length rest) anyChar
|
2012-06-29 18:30:22 -07:00
|
|
|
|
if apply
|
|
|
|
|
then do
|
|
|
|
|
updateState $ \st ->
|
|
|
|
|
st { stateMacros = ms ++ stateMacros st }
|
|
|
|
|
return Null
|
2012-07-19 12:38:54 -07:00
|
|
|
|
else return $ RawBlock "latex" def'
|
2011-01-04 19:12:33 -08:00
|
|
|
|
|
|
|
|
|
-- | Apply current macros to string.
|
2012-07-20 14:19:06 -07:00
|
|
|
|
applyMacros' :: String -> Parsec [Char] ParserState String
|
2011-01-04 19:12:33 -08:00
|
|
|
|
applyMacros' target = do
|
|
|
|
|
apply <- liftM stateApplyMacros getState
|
|
|
|
|
if apply
|
|
|
|
|
then do macros <- liftM stateMacros getState
|
|
|
|
|
return $ applyMacros macros target
|
|
|
|
|
else return target
|
2011-04-29 11:34:36 -07:00
|
|
|
|
|