Parsing: partition module into (internal) submodules (#7962)

parent 168529f0a4
commit 517bceeba8

11 changed files with 1885 additions and 1472 deletions
@@ -649,6 +649,15 @@ library
      Text.Pandoc.Filter.JSON,
      Text.Pandoc.Filter.Lua,
      Text.Pandoc.Filter.Path,
      Text.Pandoc.Parsing.Capabilities,
      Text.Pandoc.Parsing.Citations,
      Text.Pandoc.Parsing.Combinators,
      Text.Pandoc.Parsing.GridTable,
      Text.Pandoc.Parsing.Lists,
      Text.Pandoc.Parsing.Math,
      Text.Pandoc.Parsing.Smart,
      Text.Pandoc.Parsing.State,
      Text.Pandoc.Parsing.Types,
      Text.Pandoc.Readers.Docx.Lists,
      Text.Pandoc.Readers.Docx.Combine,
      Text.Pandoc.Readers.Docx.Parse,
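A minimal sketch of what the new layout allows (not part of the diff; the downstream module and function are hypothetical): a reader can import a capability class directly from its submodule rather than the umbrella Text.Pandoc.Parsing module.

-- Hypothetical downstream usage of the new submodules.
import Text.Pandoc.Options (Extension (Ext_smart))
import Text.Pandoc.Parsing.Capabilities (HasReaderOptions, guardEnabled)
import Text.Pandoc.Parsing.Types (ParserT)
import Text.Parsec (Stream)

-- Succeed only when the `smart` extension is enabled in the reader options
-- carried by the parser state.
requireSmart :: (Stream s m t, HasReaderOptions st) => ParserT s st m ()
requireSmart = guardEnabled Ext_smart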
File diff suppressed because it is too large

137  src/Text/Pandoc/Parsing/Capabilities.hs  Normal file
@@ -0,0 +1,137 @@
{-# LANGUAGE MultiParamTypeClasses #-}
{- |
Module      : Text.Pandoc.Parsing.Capabilities
Copyright   : © 2006-2022 John MacFarlane
License     : GPL-2.0-or-later
Maintainer  : John MacFarlane <jgm@berkeley.edu>

Parser state capabilities.
-}
module Text.Pandoc.Parsing.Capabilities
  ( -- * Capabilities

    -- ** Element identifiers
    HasIdentifierList (..)

    -- ** Include files
  , HasIncludeFiles (..)

    -- ** String/Word boundaries
  , HasLastStrPosition (..)
  , updateLastStrPos
  , notAfterString

    -- ** Logging
  , HasLogMessages (..)
  , logMessage
  , reportLogMessages

    -- ** Macros
  , HasMacros (..)

    -- ** Quote context
  , QuoteContext (..)
  , HasQuoteContext (..)
  , failIfInQuoteContext

    -- ** Reader options
  , HasReaderOptions (..)
  , guardEnabled
  , guardDisabled
  )
where

import Control.Monad (guard, when)
import Data.Text (Text)
import Text.Parsec (ParsecT, SourcePos, Stream, getPosition, getState, updateState)
import Text.Pandoc.Class.PandocMonad (PandocMonad, report)
import Text.Pandoc.Logging (LogMessage)
import Text.Pandoc.Options
  ( Extension
  , ReaderOptions(readerExtensions)
  , extensionEnabled
  )
import Text.Pandoc.Parsing.Types
import Text.Pandoc.Readers.LaTeX.Types (Macro)

import qualified Data.Map as M
import qualified Data.Set as Set

class HasReaderOptions st where
  extractReaderOptions :: st -> ReaderOptions
  getOption            :: (Stream s m t) => (ReaderOptions -> b) -> ParserT s st m b
  -- default
  getOption f          = f . extractReaderOptions <$> getState

class HasQuoteContext st m where
  getQuoteContext :: (Stream s m t) => ParsecT s st m QuoteContext
  withQuoteContext :: QuoteContext -> ParsecT s st m a -> ParsecT s st m a

failIfInQuoteContext :: (HasQuoteContext st m, Stream s m t)
                     => QuoteContext
                     -> ParserT s st m ()
failIfInQuoteContext context = do
  context' <- getQuoteContext
  when (context' == context) $ Prelude.fail "already inside quotes"

class HasIdentifierList st where
  extractIdentifierList :: st -> Set.Set Text
  updateIdentifierList  :: (Set.Set Text -> Set.Set Text) -> st -> st

class HasMacros st where
  extractMacros :: st -> M.Map Text Macro
  updateMacros  :: (M.Map Text Macro -> M.Map Text Macro) -> st -> st

class HasLastStrPosition st where
  setLastStrPos :: Maybe SourcePos -> st -> st
  getLastStrPos :: st -> Maybe SourcePos

class HasLogMessages st where
  addLogMessage  :: LogMessage -> st -> st
  getLogMessages :: st -> [LogMessage]

class HasIncludeFiles st where
  getIncludeFiles :: st -> [Text]
  addIncludeFile :: Text -> st -> st
  dropLatestIncludeFile :: st -> st

-- | Add a log message.
logMessage :: (Stream s m a, HasLogMessages st)
           => LogMessage -> ParserT s st m ()
logMessage msg = updateState (addLogMessage msg)

-- | Report all the accumulated log messages, according to verbosity level.
reportLogMessages :: (PandocMonad m, HasLogMessages st) => ParserT s st m ()
reportLogMessages = do
  msgs <- getLogMessages <$> getState
  mapM_ report msgs

-- | Succeed only if the extension is enabled.
guardEnabled :: (Stream s m a, HasReaderOptions st)
             => Extension -> ParserT s st m ()
guardEnabled ext =
  getOption readerExtensions >>= guard . extensionEnabled ext

-- | Succeed only if the extension is disabled.
guardDisabled :: (Stream s m a, HasReaderOptions st)
              => Extension -> ParserT s st m ()
guardDisabled ext =
  getOption readerExtensions >>= guard . not . extensionEnabled ext

-- | Update the position on which the last string ended.
updateLastStrPos :: (Stream s m a, HasLastStrPosition st)
                 => ParserT s st m ()
updateLastStrPos = getPosition >>= updateState . setLastStrPos . Just

-- | Whether we are right after the end of a string.
notAfterString :: (Stream s m a, HasLastStrPosition st) => ParserT s st m Bool
notAfterString = do
  pos <- getPosition
  st  <- getState
  return $ getLastStrPos st /= Just pos

data QuoteContext
  = InSingleQuote   -- ^ Used when parsing inside single quotes
  | InDoubleQuote   -- ^ Used when parsing inside double quotes
  | NoQuote         -- ^ Used when not parsing inside quotes
  deriving (Eq, Show)
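A small sketch (not part of the diff) of how a reader-specific state type opts into these capabilities; the MyState type is hypothetical, the class methods are the ones defined above.

import qualified Data.Set as Set
import Data.Text (Text)
import Text.Pandoc.Options (ReaderOptions)
import Text.Pandoc.Parsing.Capabilities

-- Hypothetical reader state supporting options and identifier tracking.
data MyState = MyState
  { myOptions     :: ReaderOptions
  , myIdentifiers :: Set.Set Text
  }

instance HasReaderOptions MyState where
  extractReaderOptions = myOptions      -- getOption then comes from the default

instance HasIdentifierList MyState where
  extractIdentifierList = myIdentifiers
  updateIdentifierList f st = st { myIdentifiers = f (myIdentifiers st) }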
57  src/Text/Pandoc/Parsing/Citations.hs  Normal file
@@ -0,0 +1,57 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module      : Text.Pandoc.Parsing.Citations
Copyright   : © 2006-2022 John MacFarlane
License     : GPL-2.0-or-later
Maintainer  : John MacFarlane <jgm@berkeley.edu>

Citation parsing.
-}

module Text.Pandoc.Parsing.Citations
  ( citeKey
  )
where

import Control.Monad (guard, MonadPlus(mzero))
import Data.Char (isAlphaNum, isSpace)
import Data.Text (Text)
import Text.Pandoc.Sources
import Text.Parsec ((<|>), Stream(..), lookAhead, many, option, try)
import Text.Pandoc.Parsing.Capabilities (HasLastStrPosition, notAfterString)
import Text.Pandoc.Parsing.Combinators
import Text.Pandoc.Parsing.Types (ParserT)

import qualified Data.Text as T

citeKey :: (Stream s m Char, UpdateSourcePos s Char, HasLastStrPosition st)
        => Bool -- ^ If True, allow expanded @{..} syntax.
        -> ParserT s st m (Bool, Text)
citeKey allowBraced = try $ do
  guard =<< notAfterString
  suppress_author <- option False (True <$ char '-')
  char '@'
  key <- simpleCiteIdentifier
         <|> if allowBraced
                then charsInBalanced '{' '}' (satisfy (not . isSpace))
                else mzero
  return (suppress_author, key)

simpleCiteIdentifier :: (Stream s m Char, UpdateSourcePos s Char)
                     => ParserT s st m Text
simpleCiteIdentifier = do
  firstChar <- alphaNum <|> char '_' <|> char '*' -- @* for wildcard in nocite
  let regchar = satisfy (\c -> isAlphaNum c || c == '_')
  let internal p = try $ p <* lookAhead regchar
  rest <- many $ regchar <|> internal (oneOf ":.#$%&-+?<>~/") <|>
                 try (oneOf ":/" <* lookAhead (char '/'))
  return $ T.pack $ firstChar:rest
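A rough usage sketch (not part of the diff), assuming readWith and defaultParserState from the sibling Combinators and State modules behave as in the old monolithic Text.Pandoc.Parsing:

import Data.Text (Text)
import Text.Pandoc.Error (PandocError)
import Text.Pandoc.Parsing.Citations (citeKey)
import Text.Pandoc.Parsing.Combinators (readWith)
import Text.Pandoc.Parsing.State (defaultParserState)

-- "@item1"  ~> Right (False, "item1")
-- "-@item1" ~> Right (True,  "item1")   -- the '-' suppresses the author
parseCiteKey :: Text -> Either PandocError (Bool, Text)
parseCiteKey = readWith (citeKey False) defaultParserState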
751  src/Text/Pandoc/Parsing/Combinators.hs  Normal file
@@ -0,0 +1,751 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module      : Text.Pandoc.Parsing.Combinators
Copyright   : © 2006-2022 John MacFarlane
License     : GPL-2.0-or-later
Maintainer  : John MacFarlane <jgm@berkeley.edu>

Parser combinators for pandoc format readers.
-}

module Text.Pandoc.Parsing.Combinators
  ( (<+?>)
  , anyLine
  , anyLineNewline
  , blankline
  , blanklines
  , charRef
  , characterReference
  , charsInBalanced
  , countChar
  , emailAddress
  , enclosed
  , escaped
  , extractIdClass
  , gobbleAtMostSpaces
  , gobbleSpaces
  , indentWith
  , insertIncludedFile
  , isSpaceChar -- not re-exported from T.P.Parsing
  , lineBlockLines
  , lineClump
  , many1Char
  , many1Till
  , many1TillChar
  , manyChar
  , manyTillChar
  , manyUntil
  , manyUntilChar
  , nested
  , nonspaceChar
  , notFollowedBy'
  , oneOfStrings
  , oneOfStringsCI
  , parseFromString
  , parseFromString'
  , readWith
  , readWithM
  , registerHeader
  , sepBy1'
  , skipSpaces
  , spaceChar
  , stringAnyCase
  , testStringWith
  , textStr
  , token
  , trimInlinesF
  , uri
  , withHorizDisplacement
  , withRaw
  )
where

import Control.Monad (guard, join, liftM, unless, void, when)
import Control.Monad.Except (MonadError(throwError))
import Control.Monad.Identity (Identity(..), MonadPlus(mzero))
import Data.Char
  (chr, isAlphaNum, isAscii, isAsciiUpper, isSpace, ord, toLower, toUpper)
import Data.Functor (($>))
import Data.List (intercalate)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Asciify (toAsciiText)
import Text.Pandoc.Builder (Attr, Inline(Str), Inlines, trimInlines)
import Text.Pandoc.Class.PandocMonad (PandocMonad, readFileFromDirs, report)
import Text.Pandoc.Logging
  ( LogMessage(CouldNotLoadIncludeFile, DuplicateIdentifier) )
import Text.Pandoc.Options
  ( extensionEnabled
  , Extension(Ext_auto_identifiers, Ext_ascii_identifiers)
  , ReaderOptions(readerTabStop, readerExtensions) )
import Text.Pandoc.Shared (escapeURI, mapLeft, schemes, tshow, uniqueIdent)
import Text.Pandoc.Sources
import Text.Pandoc.XML (fromEntities)
import Text.Parsec
  ( (<|>), ParsecT, SourcePos, Stream(..), between, choice, count, getInput
  , getPosition, getState, lookAhead, many, many1, manyTill, notFollowedBy
  , option, runParserT, setInput, setPosition, skipMany, sourceColumn
  , sourceName, tokenPrim, try, unexpected, updateState )
import Text.Parsec.Pos (initialPos, newPos)
import Text.Pandoc.Error
  ( PandocError(PandocParseError, PandocParsecError) )
import Text.Pandoc.Parsing.Capabilities
import Text.Pandoc.Parsing.State
import Text.Pandoc.Parsing.Types (Parser, ParserT, Future (..))

import qualified Data.Set as Set
import qualified Data.Text as T
import qualified Text.Pandoc.Builder as B
import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn)
-- | Remove whitespace from start and end; just like @'trimInlines'@,
-- but lifted into the 'Future' type.
trimInlinesF :: Future s Inlines -> Future s Inlines
trimInlinesF = liftM trimInlines

-- | Like @count@, but packs its result
countChar :: (Stream s m Char, UpdateSourcePos s Char, Monad m)
          => Int
          -> ParsecT s st m Char
          -> ParsecT s st m Text
countChar n = fmap T.pack . count n

-- | Like @string@, but uses @Text@.
textStr :: (Stream s m Char, UpdateSourcePos s Char)
        => Text -> ParsecT s u m Text
textStr t = string (T.unpack t) $> t

-- | Parse any line of text, returning the contents without the
-- final newline.
anyLine :: Monad m => ParserT Sources st m Text
anyLine = do
  -- This is much faster than:
  -- manyTill anyChar newline
  inp <- getInput
  case inp of
    Sources [] -> mzero
    Sources ((fp,t):inps) ->
      -- we assume that lines don't span different input files
      case T.break (=='\n') t of
        (this, rest)
          | T.null rest
          , not (null inps) ->
             -- line may span different input files, so do it
             -- character by character
             T.pack <$> manyTill anyChar newline
          | otherwise -> do -- either end of inputs or newline in rest
             setInput $ Sources ((fp, rest):inps)
             char '\n' -- needed so parsec knows we won't match empty string
                       -- and so source pos is updated
             return this

-- | Parse any line, include the final newline in the output
anyLineNewline :: Monad m => ParserT Sources st m Text
anyLineNewline = (<> "\n") <$> anyLine

-- | Parse indent by specified number of spaces (or equiv. tabs)
indentWith :: (Stream s m Char, UpdateSourcePos s Char)
           => HasReaderOptions st
           => Int -> ParserT s st m Text
indentWith num = do
  tabStop <- getOption readerTabStop
  if num < tabStop
     then countChar num (char ' ')
     else choice [ try (countChar num (char ' '))
                 , try (char '\t' >> indentWith (num - tabStop)) ]

-- | Like @many@, but packs its result.
manyChar :: Stream s m t
         => ParserT s st m Char
         -> ParserT s st m Text
manyChar = fmap T.pack . many

-- | Like @many1@, but packs its result.
many1Char :: Stream s m t
          => ParserT s st m Char
          -> ParserT s st m Text
many1Char = fmap T.pack . many1

-- | Like @manyTill@, but packs its result.
manyTillChar :: Stream s m t
             => ParserT s st m Char
             -> ParserT s st m a
             -> ParserT s st m Text
manyTillChar p = fmap T.pack . manyTill p

-- | Like @manyTill@, but reads at least one item.
many1Till :: (Show end, Stream s m t)
          => ParserT s st m a
          -> ParserT s st m end
          -> ParserT s st m [a]
many1Till p end = do
  notFollowedBy' end
  first <- p
  rest <- manyTill p end
  return (first:rest)

-- | Like @many1Till@, but packs its result
many1TillChar :: (Show end, Stream s m t)
              => ParserT s st m Char
              -> ParserT s st m end
              -> ParserT s st m Text
many1TillChar p = fmap T.pack . many1Till p

-- | Like @manyTill@, but also returns the result of end parser.
manyUntil :: ParserT s u m a
          -> ParserT s u m b
          -> ParserT s u m ([a], b)
manyUntil p end = scan
  where scan =
          (do e <- end
              return ([], e)
          ) <|>
          (do x <- p
              (xs, e) <- scan
              return (x:xs, e))

-- | Like @manyUntil@, but also packs its result.
manyUntilChar :: ParserT s u m Char
              -> ParserT s u m b
              -> ParserT s u m (Text, b)
manyUntilChar p = fmap go . manyUntil p
  where
    go (x, y) = (T.pack x, y)

-- | Like @sepBy1@ from Parsec,
-- but does not fail if it @sep@ succeeds and @p@ fails.
sepBy1' :: ParsecT s u m a
        -> ParsecT s u m sep
        -> ParsecT s u m [a]
sepBy1' p sep = (:) <$> p <*> many (try $ sep >> p)

-- | A more general form of @notFollowedBy@.  This one allows any
-- type of parser to be specified, and succeeds only if that parser fails.
-- It does not consume any input.
notFollowedBy' :: (Show b, Stream s m a) => ParserT s st m b -> ParserT s st m ()
notFollowedBy' p  = try $ join $  do  a <- try p
                                      return (unexpected (show a))
                                  <|>
                                  return (return ())
-- (This version due to Andrew Pimlott on the Haskell mailing list.)
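A short sketch (not part of the diff) of notFollowedBy' guarding a repeated parser; the '---' rule marker and the helper name are made up for this example.

import Data.Text (Text)
import Text.Pandoc.Parsing.Combinators (anyLine, notFollowedBy')
import Text.Pandoc.Parsing.Types (ParserT)
import Text.Pandoc.Sources (Sources, char)
import Text.Parsec (count, many)

-- Collect whole lines until a '---' line is ahead; the rule itself is
-- not consumed, because notFollowedBy' never consumes input.
linesBeforeRule :: Monad m => ParserT Sources st m [Text]
linesBeforeRule = many (notFollowedBy' dashedLine *> anyLine)
  where
    dashedLine = count 3 (char '-')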
oneOfStrings' :: (Stream s m Char, UpdateSourcePos s Char)
              => (Char -> Char -> Bool) -> [Text] -> ParserT s st m Text
oneOfStrings' f = fmap T.pack . oneOfStrings'' f . fmap T.unpack

-- TODO: This should be re-implemented in a Text-aware way
oneOfStrings'' :: (Stream s m Char, UpdateSourcePos s Char)
               => (Char -> Char -> Bool) -> [String] -> ParserT s st m String
oneOfStrings'' _ [] = Prelude.fail "no strings"
oneOfStrings'' matches strs = try $ do
  c <- anyChar
  let strs' = [xs | (x:xs) <- strs, x `matches` c]
  case strs' of
    []  -> Prelude.fail "not found"
    _   -> (c:) <$> oneOfStrings'' matches strs'
               <|> if "" `elem` strs'
                      then return [c]
                      else Prelude.fail "not found"

-- | Parses one of a list of strings.  If the list contains
-- two strings one of which is a prefix of the other, the longer
-- string will be matched if possible.
oneOfStrings :: (Stream s m Char, UpdateSourcePos s Char)
             => [Text] -> ParserT s st m Text
oneOfStrings = oneOfStrings' (==)

-- | Parses one of a list of strings (tried in order), case insensitive.

-- TODO: This will not be accurate with general Unicode (neither
-- Text.toLower nor Text.toCaseFold can be implemented with a map)
oneOfStringsCI :: (Stream s m Char, UpdateSourcePos s Char)
               => [Text] -> ParserT s st m Text
oneOfStringsCI = oneOfStrings' ciMatch
  where ciMatch x y = toLower' x == toLower' y
        -- this optimizes toLower by checking common ASCII case
        -- first, before calling the expensive unicode-aware
        -- function:
        toLower' c | isAsciiUpper c = chr (ord c + 32)
                   | isAscii c = c
                   | otherwise = toLower c

-- | Parses a space or tab.
spaceChar :: (Stream s m Char, UpdateSourcePos s Char)
          => ParserT s st m Char
spaceChar = satisfy $ \c -> c == ' ' || c == '\t'

-- | Parses a nonspace, nonnewline character.
nonspaceChar :: (Stream s m Char, UpdateSourcePos s Char)
             => ParserT s st m Char
nonspaceChar = satisfy (not . isSpaceChar)

isSpaceChar :: Char -> Bool
isSpaceChar ' '  = True
isSpaceChar '\t' = True
isSpaceChar '\n' = True
isSpaceChar '\r' = True
isSpaceChar _    = False

-- | Skips zero or more spaces or tabs.
skipSpaces :: (Stream s m Char, UpdateSourcePos s Char)
           => ParserT s st m ()
skipSpaces = skipMany spaceChar

-- | Skips zero or more spaces or tabs, then reads a newline.
blankline :: (Stream s m Char, UpdateSourcePos s Char)
          => ParserT s st m Char
blankline = try $ skipSpaces >> newline

-- | Parses one or more blank lines and returns a string of newlines.
blanklines :: (Stream s m Char, UpdateSourcePos s Char)
           => ParserT s st m Text
blanklines = T.pack <$> many1 blankline

-- | Gobble n spaces; if tabs are encountered, expand them
-- and gobble some or all of their spaces, leaving the rest.
gobbleSpaces :: (HasReaderOptions st, Monad m)
             => Int -> ParserT Sources st m ()
gobbleSpaces 0 = return ()
gobbleSpaces n
  | n < 0     = error "gobbleSpaces called with negative number"
  | otherwise = try $ do
      char ' ' <|> eatOneSpaceOfTab
      gobbleSpaces (n - 1)

eatOneSpaceOfTab :: (HasReaderOptions st, Monad m) => ParserT Sources st m Char
eatOneSpaceOfTab = do
  lookAhead (char '\t')
  pos <- getPosition
  tabstop <- getOption readerTabStop
  -- replace the tab on the input stream with spaces
  let numSpaces = tabstop - ((sourceColumn pos - 1) `mod` tabstop)
  inp <- getInput
  setInput $
    case inp of
      Sources [] -> error "eatOneSpaceOfTab - empty Sources list"
      Sources ((fp,t):rest) ->
        -- drop the tab and add spaces
        Sources ((fp, T.replicate numSpaces " " <> T.drop 1 t):rest)
  char ' '

-- | Gobble up to n spaces; if tabs are encountered, expand them
-- and gobble some or all of their spaces, leaving the rest.
gobbleAtMostSpaces :: (HasReaderOptions st, Monad m)
                   => Int -> ParserT Sources st m Int
gobbleAtMostSpaces 0 = return 0
gobbleAtMostSpaces n
  | n < 0     = error "gobbleAtMostSpaces called with negative number"
  | otherwise = option 0 $ do
      char ' ' <|> eatOneSpaceOfTab
      (+ 1) <$> gobbleAtMostSpaces (n - 1)

-- | Parses material enclosed between start and end parsers.
enclosed :: (Show end, Stream s m Char, UpdateSourcePos s Char)
         => ParserT s st m t   -- ^ start parser
         -> ParserT s st m end -- ^ end parser
         -> ParserT s st m a   -- ^ content parser (to be used repeatedly)
         -> ParserT s st m [a]
enclosed start end parser = try $
  start >> notFollowedBy space >> many1Till parser end

-- | Parse string, case insensitive.
stringAnyCase :: (Stream s m Char, UpdateSourcePos s Char)
              => Text -> ParserT s st m Text
stringAnyCase = fmap T.pack . stringAnyCase' . T.unpack

stringAnyCase' :: (Stream s m Char, UpdateSourcePos s Char)
               => String -> ParserT s st m String
stringAnyCase' [] = string ""
stringAnyCase' (x:xs) = do
  firstChar <- char (toUpper x) <|> char (toLower x)
  rest <- stringAnyCase' xs
  return (firstChar:rest)
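A sketch (not part of the diff) of the case-insensitive, longest-match behaviour, using readWith (defined further down in this file) and defaultParserState from Text.Pandoc.Parsing.State:

import Data.Text (Text)
import Text.Pandoc.Error (PandocError)
import Text.Pandoc.Parsing.Combinators (oneOfStringsCI, readWith)
import Text.Pandoc.Parsing.State (defaultParserState)

-- Longest match wins and case is ignored; the characters actually
-- consumed are returned, e.g. "HTTPS://x" ~> Right "HTTPS".
parseScheme :: Text -> Either PandocError Text
parseScheme = readWith (oneOfStringsCI ["http", "https"]) defaultParserState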
-- TODO rewrite by just adding to Sources stream?
-- | Parse contents of 'str' using 'parser' and return result.
parseFromString :: Monad m
                => ParserT Sources st m r
                -> Text
                -> ParserT Sources st m r
parseFromString parser str = do
  oldPos <- getPosition
  oldInput <- getInput
  setInput $ toSources str
  setPosition $ initialPos $ sourceName oldPos <> "_chunk"
  result <- parser
  spaces
  setInput oldInput
  setPosition oldPos
  return result

-- | Like 'parseFromString' but specialized for 'ParserState'.
-- This resets 'stateLastStrPos', which is almost always what we want.
parseFromString' :: (Monad m, HasLastStrPosition u)
                 => ParserT Sources u m a
                 -> Text
                 -> ParserT Sources u m a
parseFromString' parser str = do
  oldLastStrPos <- getLastStrPos <$> getState
  updateState $ setLastStrPos Nothing
  res <- parseFromString parser str
  updateState $ setLastStrPos oldLastStrPos
  return res

-- | Parse raw line block up to and including blank lines.
lineClump :: Monad m => ParserT Sources st m Text
lineClump = blanklines
          <|> (T.unlines <$> many1 (notFollowedBy blankline >> anyLine))

-- | Parse a string of characters between an open character
-- and a close character, including text between balanced
-- pairs of open and close, which must be different.  For example,
-- @charsInBalanced '(' ')' anyChar@ will parse "(hello (there))"
-- and return "hello (there)".
charsInBalanced :: (Stream s m Char, UpdateSourcePos s Char) => Char -> Char -> ParserT s st m Char
                -> ParserT s st m Text
charsInBalanced open close parser = try $ do
  char open
  let isDelim c = c == open || c == close
  raw <- many $  T.pack <$> many1 (notFollowedBy (satisfy isDelim) >> parser)
             <|> (do res <- charsInBalanced open close parser
                     return $ T.singleton open <> res <> T.singleton close)
  char close
  return $ T.concat raw

-- old charsInBalanced would be:
-- charsInBalanced open close (noneOf "\n" <|> char '\n' >> notFollowedBy blankline)
-- old charsInBalanced' would be:
-- charsInBalanced open close anyChar

-- Parsers for email addresses and URIs

-- | Parses an email address; returns original and corresponding
-- escaped mailto: URI.
emailAddress :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (Text, Text)
emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain)
 where toResult mbox dom = let full = fromEntities $ T.pack $ mbox ++ '@':dom
                           in  (full, escapeURI $ "mailto:" <> full)
       mailbox           = intercalate "." <$> (emailWord `sepBy1'` dot)
       domain            = intercalate "." <$> (subdomain `sepBy1'` dot)
       dot               = char '.'
       subdomain         = many1 $ alphaNum <|> innerPunct (=='-')
       -- this excludes some valid email addresses, since an
       -- email could contain e.g. '__', but gives better results
       -- for our purposes, when combined with markdown parsing:
       innerPunct f      = try (satisfy f
                                 <* notFollowedBy (satisfy (not . isAlphaNum)))
       -- technically an email address could begin with a symbol,
       -- but allowing this creates too many problems.
       -- See e.g. https://github.com/jgm/pandoc/issues/2940
       emailWord         = do x <- satisfy isAlphaNum
                              xs <- many (satisfy isEmailChar)
                              return (x:xs)
       isEmailChar c     = isAlphaNum c || isEmailPunct c
       isEmailPunct c    = T.any (== c) "!\"#$%&'*+-/=?^_{|}~;"


uriScheme :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Text
uriScheme = oneOfStringsCI (Set.toList schemes)

-- | Parses a URI. Returns pair of original and URI-escaped version.
uri :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (Text, Text)
uri = try $ do
  scheme <- uriScheme
  char ':'
  -- Avoid parsing e.g. "**Notes:**" as a raw URI:
  notFollowedBy $ satisfy (\c -> c == '*' || c == '_' || c == ']')
  -- We allow sentence punctuation except at the end, since
  -- we don't want the trailing '.' in 'http://google.com.'  We want to allow
  -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation)
  -- as a URL, while NOT picking up the closing paren in
  -- (http://wikipedia.org).  So we include balanced parens in the URL.
  str <- T.concat <$> many1 (uriChunkBetween '(' ')'
                         <|> uriChunkBetween '{' '}'
                         <|> uriChunkBetween '[' ']'
                         <|> T.pack <$> uriChunk)
  str' <- option str $ char '/' >> return (str <> "/")
  let uri' = scheme <> ":" <> fromEntities str'
  return (uri', escapeURI uri')
 where
  isWordChar '#' = True
  isWordChar '$' = True
  isWordChar '%' = True
  isWordChar '+' = True
  isWordChar '/' = True
  isWordChar '@' = True
  isWordChar '\\' = True
  isWordChar '_' = True
  isWordChar '-' = True
  isWordChar '&' = True
  isWordChar '=' = True
  isWordChar c   = isAlphaNum c

  wordChar = satisfy isWordChar
  percentEscaped = try $ (:) <$> char '%' <*> many1 hexDigit
  entity = try $ pure <$> characterReference
  punct = try $ many1 (char ',') <|> fmap pure (satisfy (\c -> not (isSpace c) && c /= '<' && c /= '>'))
  uriChunk = many1 wordChar
         <|> percentEscaped
         <|> entity
         <|> try (punct <* lookAhead (void wordChar <|> void percentEscaped))
  uriChunkBetween l r = try $ do chunk <- between (char l) (char r) uriChunk
                                 return (T.pack $ [l] ++ chunk ++ [r])
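A sketch (not part of the diff) of the pairs these parsers return, again via readWith and defaultParserState:

import Data.Text (Text)
import Text.Pandoc.Error (PandocError)
import Text.Pandoc.Parsing.Combinators (emailAddress, readWith, uri)
import Text.Pandoc.Parsing.State (defaultParserState)

-- uri keeps balanced parentheses but leaves a trailing '.' unconsumed,
-- so "http://google.com." parses as "http://google.com".
parseUri :: Text -> Either PandocError (Text, Text)
parseUri = readWith uri defaultParserState

-- emailAddress returns the original text and an escaped mailto: URI.
parseEmail :: Text -> Either PandocError (Text, Text)
parseEmail = readWith emailAddress defaultParserState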
-- | Applies a parser, returns tuple of its results and its horizontal
-- displacement (the difference between the source column at the end
-- and the source column at the beginning). Vertical displacement
-- (source row) is ignored.
withHorizDisplacement :: (Stream s m Char, UpdateSourcePos s Char)
                      => ParserT s st m a        -- ^ Parser to apply
                      -> ParserT s st m (a, Int) -- ^ (result, displacement)
withHorizDisplacement parser = do
  pos1 <- getPosition
  result <- parser
  pos2 <- getPosition
  return (result, sourceColumn pos2 - sourceColumn pos1)

-- | Applies a parser and returns the raw string that was parsed,
-- along with the value produced by the parser.
withRaw :: Monad m
        => ParsecT Sources st m a
        -> ParsecT Sources st m (a, Text)
withRaw parser = do
  inps1 <- getInput
  result <- parser
  inps2 <- getInput
  -- 'raw' is the difference between inps1 and inps2
  return (result, sourcesDifference inps1 inps2)

sourcesDifference :: Sources -> Sources -> Text
sourcesDifference (Sources is1) (Sources is2) = go is1 is2
 where
   go inps1 inps2 =
    case (inps1, inps2) of
      ([], _) -> mempty
      (_, []) -> mconcat $ map snd inps1
      ((p1,t1):rest1, (p2, t2):rest2)
        | p1 == p2
        , t1 == t2  -> go rest1 rest2
        | p1 == p2
        , t1 /= t2  -> fromMaybe mempty $ T.stripSuffix t2 t1
        | otherwise -> t1 <> go rest1 inps2

-- | Parses backslash, then applies character parser.
escaped :: (Stream s m Char, UpdateSourcePos s Char)
        => ParserT s st m Char  -- ^ Parser for character to escape
        -> ParserT s st m Char
escaped parser = try $ char '\\' >> parser

-- | Parse character entity.
characterReference :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Char
characterReference = try $ do
  char '&'
  ent <- many1Till nonspaceChar (char ';')
  let ent' = case ent of
        '#':'X':xs -> '#':'x':xs  -- workaround tagsoup bug
        '#':_      -> ent
        _          -> ent ++ ";"
  case lookupEntity ent' of
    Just (c : _) -> return c
    _            -> Prelude.fail "entity not found"

-- | Parses a character reference and returns a Str element.
charRef :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inline
charRef = Str . T.singleton <$> characterReference

lineBlockLine :: Monad m => ParserT Sources st m Text
lineBlockLine = try $ do
  char '|'
  char ' '
  white <- T.pack <$> many (spaceChar >> return '\160')
  notFollowedBy newline
  line <- anyLine
  continuations <- many (try $ char ' ' >> anyLine)
  return $ white <> T.unwords (line : continuations)

blankLineBlockLine :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Char
blankLineBlockLine = try (char '|' >> blankline)

-- | Parses an RST-style line block and returns a list of strings.
lineBlockLines :: Monad m => ParserT Sources st m [Text]
lineBlockLines = try $ do
  lines' <- many1 (lineBlockLine <|> (T.singleton <$> blankLineBlockLine))
  skipMany blankline
  return lines'


-- | Removes the ParsecT layer from the monad transformer stack
readWithM :: (Monad m, ToSources t)
          => ParserT Sources st m a  -- ^ parser
          -> st                      -- ^ initial state
          -> t                       -- ^ input
          -> m (Either PandocError a)
readWithM parser state input =
    mapLeft (PandocParsecError sources)
      <$> runParserT parser state (initialSourceName sources) sources
 where
   sources = toSources input

-- | Parse a string with a given parser and state
readWith :: ToSources t
         => Parser Sources st a
         -> st
         -> t
         -> Either PandocError a
readWith p t inp = runIdentity $ readWithM p t inp

-- | Parse a string with @parser@ (for testing).
testStringWith :: Show a
               => ParserT Sources ParserState Identity a
               -> Text
               -> IO ()
testStringWith parser str = UTF8.putStrLn $ tshow $
                            readWith parser defaultParserState (toSources str)

-- | Add header to the list of headers in state, together
-- with its associated identifier. If the identifier is null
-- and the auto_identifiers extension is set, generate a new
-- unique identifier, and update the list of identifiers
-- in state. Issue a warning if an explicit identifier
-- is encountered that duplicates an earlier identifier
-- (explicit or automatically generated).
registerHeader :: (Stream s m a, HasReaderOptions st,
                   HasLogMessages st, HasIdentifierList st)
               => Attr -> Inlines -> ParserT s st m Attr
registerHeader (ident,classes,kvs) header' = do
  ids <- extractIdentifierList <$> getState
  exts <- getOption readerExtensions
  if T.null ident && Ext_auto_identifiers `extensionEnabled` exts
     then do
       let id' = uniqueIdent exts (B.toList header') ids
       let id'' = if Ext_ascii_identifiers `extensionEnabled` exts
                     then toAsciiText id'
                     else id'
       updateState $ updateIdentifierList $ Set.insert id'
       updateState $ updateIdentifierList $ Set.insert id''
       return (id'',classes,kvs)
     else do
       unless (T.null ident) $ do
         when (ident `Set.member` ids) $ do
           pos <- getPosition
           logMessage $ DuplicateIdentifier ident pos
         updateState $ updateIdentifierList $ Set.insert ident
       return (ident,classes,kvs)
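A sketch (not part of the diff) of characterReference at work, via readWith and defaultParserState:

import Data.Text (Text)
import Text.Pandoc.Error (PandocError)
import Text.Pandoc.Parsing.Combinators (characterReference, readWith)
import Text.Pandoc.Parsing.State (defaultParserState)

-- "&amp;" ~> Right '&'
-- "&#65;" ~> Right 'A'
parseEntity :: Text -> Either PandocError Char
parseEntity = readWith characterReference defaultParserState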
-- This is used to prevent exponential blowups for things like:
-- a**a*a**a*a**a*a**a*a**a*a**a*a**
nested :: Stream s m a
       => ParserT s ParserState m a
       -> ParserT s ParserState m a
nested p = do
  nestlevel <- stateMaxNestingLevel <$> getState
  guard $ nestlevel > 0
  updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 }
  res <- p
  updateState $ \st -> st{ stateMaxNestingLevel = nestlevel }
  return res

token :: (Stream s m t)
      => (t -> Text)
      -> (t -> SourcePos)
      -> (t -> Maybe a)
      -> ParsecT s st m a
token pp pos match = tokenPrim (T.unpack . pp) (\_ t _ -> pos t) match

infixr 5 <+?>
(<+?>) :: (Monoid a) => ParserT s st m a -> ParserT s st m a -> ParserT s st m a
a <+?> b = a >>= flip fmap (try b <|> return mempty) . mappend

extractIdClass :: Attr -> Attr
extractIdClass (ident, cls, kvs) = (ident', cls', kvs')
  where
    ident' = fromMaybe ident (lookup "id" kvs)
    cls'   = maybe cls T.words $ lookup "class" kvs
    kvs'   = filter (\(k,_) -> k /= "id" || k /= "class") kvs

insertIncludedFile :: (PandocMonad m, HasIncludeFiles st)
                   => ParserT a st m b -- ^ parser to apply
                   -> (Text -> a)      -- ^ convert Text to stream type
                   -> [FilePath]       -- ^ search path (directories)
                   -> FilePath         -- ^ path of file to include
                   -> Maybe Int        -- ^ start line (negative counts from end)
                   -> Maybe Int        -- ^ end line (negative counts from end)
                   -> ParserT a st m b
insertIncludedFile parser toStream dirs f mbstartline mbendline = do
  oldPos <- getPosition
  oldInput <- getInput
  containers <- getIncludeFiles <$> getState
  when (T.pack f `elem` containers) $
    throwError $ PandocParseError $ T.pack $ "Include file loop at " ++ show oldPos
  updateState $ addIncludeFile $ T.pack f
  mbcontents <- readFileFromDirs dirs f
  contents <- case mbcontents of
                   Just s -> return $ exciseLines mbstartline mbendline s
                   Nothing -> do
                     report $ CouldNotLoadIncludeFile (T.pack f) oldPos
                     return ""
  setInput $ toStream contents
  setPosition $ newPos f (fromMaybe 1 mbstartline) 1
  result <- parser
  setInput oldInput
  setPosition oldPos
  updateState dropLatestIncludeFile
  return result

exciseLines :: Maybe Int -> Maybe Int -> Text -> Text
exciseLines Nothing Nothing t = t
exciseLines mbstartline mbendline t =
  T.unlines $ take (endline' - (startline' - 1))
            $ drop (startline' - 1) contentLines
 where
  contentLines = T.lines t
  numLines = length contentLines
  startline' = case mbstartline of
                 Nothing -> 1
                 Just x | x >= 0 -> x
                        | otherwise -> numLines + x -- negative from end
  endline' = case mbendline of
                 Nothing -> numLines
                 Just x | x >= 0 -> x
                        | otherwise -> numLines + x -- negative from end
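Worked examples (not part of the diff) of the line-excision arithmetic, computed by hand from the definition above; they would have to live in this module, since exciseLines is not exported.

-- Negative indices count from the end of the included file.
exciseLinesChecks :: [Bool]
exciseLinesChecks =
  [ exciseLines (Just 2) (Just 3)    "a\nb\nc\nd\n" == "b\nc\n"
  , exciseLines Nothing  (Just (-2)) "a\nb\nc\nd\n" == "a\nb\n"
  ]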
217  src/Text/Pandoc/Parsing/GridTable.hs  Normal file
@@ -0,0 +1,217 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module      : Text.Pandoc.Parsing.GridTable
Copyright   : Copyright (C) 2006-2022 John MacFarlane
License     : GPL-2.0-or-later
Maintainer  : John MacFarlane <jgm@berkeley.edu>

Shared parsers for plaintext tables.
-}
module Text.Pandoc.Parsing.GridTable
  ( gridTableWith
  , gridTableWith'
  , tableWith
  , tableWith'
  , widthsFromIndices
  )
where

import Control.Monad (guard)
import Data.List (transpose)
import Data.Text (Text)
import Text.Pandoc.Options (ReaderOptions (readerColumns))
import Text.Pandoc.Builder (Blocks)
import Text.Pandoc.Definition
import Text.Pandoc.Parsing.Capabilities
import Text.Pandoc.Parsing.Combinators
import Text.Pandoc.Parsing.Types
import Text.Pandoc.Shared (compactify, splitTextByIndices, trim, trimr)
import Text.Pandoc.Sources
import Text.Parsec
  ( Stream (..), many1, notFollowedBy, option, optional, sepEndBy1, try )

import qualified Data.Text as T
import qualified Text.Pandoc.Builder as B

-- | Parse a grid table: starts with row of '-' on top, then header
-- (which may be grid), then the rows, which may be grid, separated by
-- blank lines, and ending with a footer (dashed line followed by blank
-- line).
gridTableWith :: (Monad m, Monad mf, HasLastStrPosition st, HasReaderOptions st)
              => ParserT Sources st m (mf Blocks) -- ^ Block list parser
              -> Bool                             -- ^ Headerless table
              -> ParserT Sources st m (mf Blocks)
gridTableWith blocks headless =
  tableWith (gridTableHeader headless blocks) (gridTableRow blocks)
            (gridTableSep '-') gridTableFooter

gridTableWith' :: (Monad m, Monad mf,
                   HasReaderOptions st, HasLastStrPosition st)
               => ParserT Sources st m (mf Blocks) -- ^ Block list parser
               -> Bool                             -- ^ Headerless table
               -> ParserT Sources st m (TableComponents mf)
gridTableWith' blocks headless =
  tableWith' (gridTableHeader headless blocks) (gridTableRow blocks)
             (gridTableSep '-') gridTableFooter

gridTableSplitLine :: [Int] -> Text -> [Text]
gridTableSplitLine indices line = map removeFinalBar $ tail $
  splitTextByIndices (init indices) $ trimr line

gridPart :: Monad m => Char -> ParserT Sources st m ((Int, Int), Alignment)
gridPart ch = do
  leftColon <- option False (True <$ char ':')
  dashes <- many1 (char ch)
  rightColon <- option False (True <$ char ':')
  char '+'
  let lengthDashes = length dashes + (if leftColon then 1 else 0) +
                       (if rightColon then 1 else 0)
  let alignment = case (leftColon, rightColon) of
                       (True, True)   -> AlignCenter
                       (True, False)  -> AlignLeft
                       (False, True)  -> AlignRight
                       (False, False) -> AlignDefault
  return ((lengthDashes, lengthDashes + 1), alignment)

gridDashedLines :: Monad m => Char -> ParserT Sources st m [((Int, Int), Alignment)]
gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline

removeFinalBar :: Text -> Text
removeFinalBar = T.dropWhileEnd go . T.dropWhileEnd (=='|')
  where
    go c = T.any (== c) " \t"

-- | Separator between rows of grid table.
gridTableSep :: Monad m => Char -> ParserT Sources st m Char
gridTableSep ch = try $ gridDashedLines ch >> return '\n'

-- | Parse header for a grid table.
gridTableHeader :: (Monad m, Monad mf, HasLastStrPosition st)
                => Bool -- ^ Headerless table
                -> ParserT Sources st m (mf Blocks)
                -> ParserT Sources st m (mf [Blocks], [Alignment], [Int])
gridTableHeader True _ = do
  optional blanklines
  dashes <- gridDashedLines '-'
  let aligns  = map snd dashes
  let lines'  = map (snd . fst) dashes
  let indices = scanl (+) 0 lines'
  return (return [], aligns, indices)
gridTableHeader False blocks = try $ do
  optional blanklines
  dashes <- gridDashedLines '-'
  rawContent <- many1 (notFollowedBy (gridTableSep '=') >> char '|' >>
                          T.pack <$> many1Till anyChar newline)
  underDashes <- gridDashedLines '='
  guard $ length dashes == length underDashes
  let lines'   = map (snd . fst) underDashes
  let indices  = scanl (+) 0 lines'
  let aligns   = map snd underDashes
  let rawHeads = map (T.unlines . map trim) $ transpose
                   $ map (gridTableSplitLine indices) rawContent
  heads <- sequence <$> mapM (parseFromString' blocks . trim) rawHeads
  return (heads, aligns, indices)

gridTableRawLine :: (Stream s m Char, UpdateSourcePos s Char) => [Int] -> ParserT s st m [Text]
gridTableRawLine indices = do
  char '|'
  line <- many1Till anyChar newline
  return (gridTableSplitLine indices $ T.pack line)

-- | Parse row of grid table.
gridTableRow :: (Monad m, Monad mf, HasLastStrPosition st)
             => ParserT Sources st m (mf Blocks)
             -> [Int]
             -> ParserT Sources st m (mf [Blocks])
gridTableRow blocks indices = do
  colLines <- many1 (gridTableRawLine indices)
  let cols = map ((<> "\n") . T.unlines . removeOneLeadingSpace) $
               transpose colLines
      compactifyCell bs = case compactify [bs] of
                            []  -> mempty
                            x:_ -> x
  cells <- sequence <$> mapM (parseFromString' blocks) cols
  return $ fmap (map compactifyCell) cells

removeOneLeadingSpace :: [Text] -> [Text]
removeOneLeadingSpace xs =
  if all startsWithSpace xs
     then map (T.drop 1) xs
     else xs
  where startsWithSpace t = case T.uncons t of
                              Nothing     -> True
                              Just (c, _) -> c == ' '

-- | Parse footer for a grid table.
gridTableFooter :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m ()
gridTableFooter = optional blanklines

---

-- | Parse a table using 'headerParser', 'rowParser',
-- 'lineParser', and 'footerParser'.
tableWith :: (Stream s m Char, UpdateSourcePos s Char,
              HasReaderOptions st, Monad mf)
          => ParserT s st m (mf [Blocks], [Alignment], [Int])
          -> ([Int] -> ParserT s st m (mf [Blocks]))
          -> ParserT s st m sep
          -> ParserT s st m end
          -> ParserT s st m (mf Blocks)
tableWith headerParser rowParser lineParser footerParser = try $ do
  (aligns, widths, heads, rows) <- tableWith' headerParser rowParser
                                              lineParser footerParser
  let th = TableHead nullAttr <$> heads
      tb = (:[]) . TableBody nullAttr 0 [] <$> rows
      tf = pure $ TableFoot nullAttr []
  return $ B.table B.emptyCaption (zip aligns (map fromWidth widths)) <$> th <*> tb <*> tf
 where
  fromWidth n
    | n > 0     = ColWidth n
    | otherwise = ColWidthDefault

type TableComponents mf = ([Alignment], [Double], mf [Row], mf [Row])

tableWith' :: (Stream s m Char, UpdateSourcePos s Char,
               HasReaderOptions st, Monad mf)
           => ParserT s st m (mf [Blocks], [Alignment], [Int])
           -> ([Int] -> ParserT s st m (mf [Blocks]))
           -> ParserT s st m sep
           -> ParserT s st m end
           -> ParserT s st m (TableComponents mf)
tableWith' headerParser rowParser lineParser footerParser = try $ do
    (heads, aligns, indices) <- headerParser
    lines' <- sequence <$> rowParser indices `sepEndBy1` lineParser
    footerParser
    numColumns <- getOption readerColumns
    let widths = if null indices
                    then replicate (length aligns) 0.0
                    else widthsFromIndices numColumns indices
    let toRow = Row nullAttr . map B.simpleCell
        toHeaderRow l = [toRow l | not (null l)]
    return (aligns, widths, toHeaderRow <$> heads, map toRow <$> lines')

-- Calculate relative widths of table columns, based on indices
widthsFromIndices :: Int      -- Number of columns on terminal
                  -> [Int]    -- Indices
                  -> [Double] -- Fractional relative sizes of columns
widthsFromIndices _ [] = []
widthsFromIndices numColumns' indices =
  let numColumns = max numColumns' (if null indices then 0 else last indices)
      lengths' = zipWith (-) indices (0:indices)
      lengths  = reverse $
                 case reverse lengths' of
                      []       -> []
                      [x]      -> [x]
                      -- compensate for the fact that intercolumn
                      -- spaces are counted in widths of all columns
                      -- but the last...
                      (x:y:zs) -> if x < y && y - x <= 2
                                     then y:y:zs
                                     else x:y:zs
      totLength = sum lengths
      quotient = if totLength > numColumns
                   then fromIntegral totLength
                   else fromIntegral numColumns
      fracs = map (\l -> fromIntegral l / quotient) lengths in
  tail fracs
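A hand-computed check (not part of the diff) of widthsFromIndices, with indices as the grid-table header parser produces them (leading 0 included):

import Text.Pandoc.Parsing.GridTable (widthsFromIndices)

-- Column boundaries at offsets 0, 10 and 20 on an 80-column terminal:
-- two columns of 10 characters each, i.e. 10/80 = 0.125 of the width apiece.
widthsCheck :: Bool
widthsCheck = widthsFromIndices 80 [0, 10, 20] == [0.125, 0.125]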
215  src/Text/Pandoc/Parsing/Lists.hs  Normal file
@@ -0,0 +1,215 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module      : Text.Pandoc.Parsing.Lists
Copyright   : © 2006-2022 John MacFarlane
License     : GPL-2.0-or-later
Maintainer  : John MacFarlane <jgm@berkeley.edu>

Parsers for list markers.
-}

module Text.Pandoc.Parsing.Lists
  ( anyOrderedListMarker
  , decimal
  , indentWith
  , lowerAlpha
  , lowerRoman
  , orderedListMarker
  , romanNumeral
  , upperAlpha
  , upperRoman
  )
where

import Data.Char (isAsciiUpper, isAsciiLower, ord, toLower)
import Data.Maybe (fromMaybe)
import Text.Pandoc.Definition
  ( ListNumberDelim(..)
  , ListAttributes
  , ListNumberStyle(..)
  )
import Text.Pandoc.Shared (safeRead)
import Text.Pandoc.Sources
import Text.Parsec
  ( (<|>), Stream(..), choice, getState, lookAhead, many, many1, option
  , try, updateState )
import Text.Pandoc.Parsing.Combinators
import Text.Pandoc.Parsing.State
import Text.Pandoc.Parsing.Types (ParserT)

import qualified Data.Map as M
import qualified Data.Text as T

-- | Parses a roman numeral (uppercase or lowercase), returns number.
romanNumeral :: (Stream s m Char, UpdateSourcePos s Char) => Bool -- ^ Uppercase if true
             -> ParserT s st m Int
romanNumeral upperCase = do
  let rchar uc = char $ if upperCase then uc else toLower uc
  let one         = rchar 'I'
  let five        = rchar 'V'
  let ten         = rchar 'X'
  let fifty       = rchar 'L'
  let hundred     = rchar 'C'
  let fivehundred = rchar 'D'
  let thousand    = rchar 'M'
  lookAhead $ choice [one, five, ten, fifty, hundred, fivehundred, thousand]
  thousands <- (1000 *) . length <$> many thousand
  ninehundreds <- option 0 $ try $ hundred >> thousand >> return 900
  fivehundreds <- option 0 $ 500 <$ fivehundred
  fourhundreds <- option 0 $ try $ hundred >> fivehundred >> return 400
  hundreds <- (100 *) . length <$> many hundred
  nineties <- option 0 $ try $ ten >> hundred >> return 90
  fifties <- option 0 (50 <$ fifty)
  forties <- option 0 $ try $ ten >> fifty >> return 40
  tens <- (10 *) . length <$> many ten
  nines <- option 0 $ try $ one >> ten >> return 9
  fives <- option 0 (5 <$ five)
  fours <- option 0 $ try $ one >> five >> return 4
  ones <- length <$> many one
  let total = thousands + ninehundreds + fivehundreds + fourhundreds +
              hundreds + nineties + fifties + forties + tens + nines +
              fives + fours + ones
  if total == 0
     then Prelude.fail "not a roman numeral"
     else return total

-- | Parses an uppercase roman numeral and returns (UpperRoman, number).
upperRoman :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int)
upperRoman = do
  num <- romanNumeral True
  return (UpperRoman, num)

-- | Parses a lowercase roman numeral and returns (LowerRoman, number).
lowerRoman :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int)
lowerRoman = do
  num <- romanNumeral False
  return (LowerRoman, num)

-- | Parses a decimal numeral and returns (Decimal, number).
decimal :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int)
decimal = do
  num <- many1 digit
  return (Decimal, fromMaybe 1 $ safeRead $ T.pack num)

-- | Parses a '@' and optional label and
-- returns (DefaultStyle, [next example number]). The next
-- example number is incremented in parser state, and the label
-- (if present) is added to the label table.
exampleNum :: (Stream s m Char, UpdateSourcePos s Char)
           => ParserT s ParserState m (ListNumberStyle, Int)
exampleNum = do
  char '@'
  lab <- mconcat . map T.pack <$>
                    many (many1 alphaNum <|>
                          try (do c <- char '_' <|> char '-'
                                  cs <- many1 alphaNum
                                  return (c:cs)))
  st <- getState
  let num = stateNextExample st
  let newlabels = if T.null lab
                     then stateExamples st
                     else M.insert lab num $ stateExamples st
  updateState $ \s -> s{ stateNextExample = num + 1
                       , stateExamples    = newlabels }
  return (Example, num)

-- | Parses a '#' returns (DefaultStyle, 1).
defaultNum :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int)
defaultNum = do
  char '#'
  return (DefaultStyle, 1)

-- | Parses a lowercase letter and returns (LowerAlpha, number).
lowerAlpha :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int)
lowerAlpha = do
  ch <- satisfy isAsciiLower
  return (LowerAlpha, ord ch - ord 'a' + 1)

-- | Parses an uppercase letter and returns (UpperAlpha, number).
upperAlpha :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int)
upperAlpha = do
  ch <- satisfy isAsciiUpper
  return (UpperAlpha, ord ch - ord 'A' + 1)

-- | Parses a roman numeral i or I
romanOne :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m (ListNumberStyle, Int)
romanOne = (char 'i' >> return (LowerRoman, 1)) <|>
           (char 'I' >> return (UpperRoman, 1))

-- | Parses an ordered list marker and returns list attributes.
anyOrderedListMarker :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s ParserState m ListAttributes
anyOrderedListMarker = choice
  [delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens],
                           numParser <- [decimal, exampleNum, defaultNum, romanOne,
                                         lowerAlpha, lowerRoman, upperAlpha, upperRoman]]

-- | Parses a list number (num) followed by a period, returns list attributes.
inPeriod :: (Stream s m Char, UpdateSourcePos s Char)
         => ParserT s st m (ListNumberStyle, Int)
         -> ParserT s st m ListAttributes
inPeriod num = try $ do
  (style, start) <- num
  char '.'
  let delim = if style == DefaultStyle
                 then DefaultDelim
                 else Period
  return (start, style, delim)

-- | Parses a list number (num) followed by a paren, returns list attributes.
inOneParen :: (Stream s m Char, UpdateSourcePos s Char)
           => ParserT s st m (ListNumberStyle, Int)
           -> ParserT s st m ListAttributes
inOneParen num = try $ do
  (style, start) <- num
  char ')'
  return (start, style, OneParen)

-- | Parses a list number (num) enclosed in parens, returns list attributes.
inTwoParens :: (Stream s m Char, UpdateSourcePos s Char)
            => ParserT s st m (ListNumberStyle, Int)
            -> ParserT s st m ListAttributes
inTwoParens num = try $ do
  char '('
  (style, start) <- num
  char ')'
  return (start, style, TwoParens)

-- | Parses an ordered list marker with a given style and delimiter,
-- returns number.
orderedListMarker :: (Stream s m Char, UpdateSourcePos s Char)
                  => ListNumberStyle
                  -> ListNumberDelim
                  -> ParserT s ParserState m Int
orderedListMarker style delim = do
  let num = defaultNum <|>  -- # can continue any kind of list
            case style of
               DefaultStyle -> decimal
               Example      -> exampleNum
               Decimal      -> decimal
               UpperRoman   -> upperRoman
               LowerRoman   -> lowerRoman
               UpperAlpha   -> upperAlpha
               LowerAlpha   -> lowerAlpha
  let context = case delim of
               DefaultDelim -> inPeriod
               Period       -> inPeriod
               OneParen     -> inOneParen
               TwoParens    -> inTwoParens
  (start, _, _) <- context num
  return start
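A usage sketch (not part of the diff) for the marker parsers, assuming readWith and defaultParserState behave as in the old monolithic module:

import Data.Text (Text)
import Text.Pandoc.Definition (ListAttributes)
import Text.Pandoc.Error (PandocError)
import Text.Pandoc.Parsing.Combinators (readWith)
import Text.Pandoc.Parsing.Lists (anyOrderedListMarker)
import Text.Pandoc.Parsing.State (defaultParserState)

-- "iv)" ~> Right (4, LowerRoman, OneParen)
-- "(2)" ~> Right (2, Decimal,    TwoParens)
parseMarker :: Text -> Either PandocError ListAttributes
parseMarker = readWith anyOrderedListMarker defaultParserState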
96  src/Text/Pandoc/Parsing/Math.hs  Normal file
@ -0,0 +1,96 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module      : Text.Pandoc.Parsing.Math
Copyright   : © 2006-2022 John MacFarlane
License     : GPL-2.0-or-later
Maintainer  : John MacFarlane <jgm@berkeley.edu>

Parsing of LaTeX math.
-}

module Text.Pandoc.Parsing.Math
  ( mathDisplay
  , mathInline
  )
where

import Control.Monad (mzero, when)
import Data.Text (Text)
import Text.Parsec ((<|>), Stream(..), notFollowedBy, skipMany, try)
import Text.Pandoc.Options
  ( Extension(Ext_tex_math_dollars, Ext_tex_math_single_backslash,
              Ext_tex_math_double_backslash) )
import Text.Pandoc.Parsing.Capabilities (HasReaderOptions, guardEnabled)
import Text.Pandoc.Parsing.Combinators
import Text.Pandoc.Parsing.Types (ParserT)
import Text.Pandoc.Shared (trimMath)
import Text.Pandoc.Sources
  (UpdateSourcePos, anyChar, char, digit, newline, satisfy, space, string)

import qualified Data.Text as T

mathInlineWith :: (Stream s m Char, UpdateSourcePos s Char) => Text -> Text -> ParserT s st m Text
mathInlineWith op cl = try $ do
  textStr op
  when (op == "$") $ notFollowedBy space
  words' <- many1Till (
                 (T.singleton <$>
                    satisfy (\c -> not (isSpaceChar c || c == '\\')))
             <|> (char '\\' >>
                    -- This next clause is needed because \text{..} can
                    -- contain $, \(\), etc.
                    (try (string "text" >>
                          (("\\text" <>) <$> inBalancedBraces 0 ""))
                     <|> (\c -> T.pack ['\\',c]) <$> anyChar))
             <|> do (blankline <* notFollowedBy' blankline) <|>
                      (spaceChar <* skipMany spaceChar)
                    notFollowedBy (char '$')
                    return " "
             ) (try $ textStr cl)
  notFollowedBy digit  -- to prevent capture of $5
  return $ trimMath $ T.concat words'
 where
  inBalancedBraces :: (Stream s m Char, UpdateSourcePos s Char) => Int -> Text -> ParserT s st m Text
  inBalancedBraces n = fmap T.pack . inBalancedBraces' n . T.unpack

  inBalancedBraces' :: (Stream s m Char, UpdateSourcePos s Char) => Int -> String -> ParserT s st m String
  inBalancedBraces' 0 "" = do
    c <- anyChar
    if c == '{'
       then inBalancedBraces' 1 "{"
       else mzero
  inBalancedBraces' 0 s = return $ reverse s
  inBalancedBraces' numOpen ('\\':xs) = do
    c <- anyChar
    inBalancedBraces' numOpen (c:'\\':xs)
  inBalancedBraces' numOpen xs = do
    c <- anyChar
    case c of
      '}' -> inBalancedBraces' (numOpen - 1) (c:xs)
      '{' -> inBalancedBraces' (numOpen + 1) (c:xs)
      _   -> inBalancedBraces' numOpen (c:xs)

mathDisplayWith :: (Stream s m Char, UpdateSourcePos s Char) => Text -> Text -> ParserT s st m Text
mathDisplayWith op cl = try $ fmap T.pack $ do
  textStr op
  many1Till (satisfy (/= '\n') <|> (newline <* notFollowedBy' blankline))
            (try $ textStr cl)

mathDisplay :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char)
            => ParserT s st m Text
mathDisplay =
      (guardEnabled Ext_tex_math_dollars >> mathDisplayWith "$$" "$$")
  <|> (guardEnabled Ext_tex_math_single_backslash >>
        mathDisplayWith "\\[" "\\]")
  <|> (guardEnabled Ext_tex_math_double_backslash >>
        mathDisplayWith "\\\\[" "\\\\]")

mathInline :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char)
           => ParserT s st m Text
mathInline =
      (guardEnabled Ext_tex_math_dollars >> mathInlineWith "$" "$")
  <|> (guardEnabled Ext_tex_math_single_backslash >>
        mathInlineWith "\\(" "\\)")
  <|> (guardEnabled Ext_tex_math_double_backslash >>
        mathInlineWith "\\\\(" "\\\\)")
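For context, here is a sketch (illustrative only, not code from this commit) of how a reader might lift the raw TeX Text returned by these parsers into pandoc inlines; it assumes Inlines and Text.Pandoc.Builder imported as B are in scope:

-- Illustrative sketch: wrap mathDisplay / mathInline results in Math inlines.
texMath :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char)
        => ParserT s st m Inlines
texMath = (B.displayMath <$> try mathDisplay)  -- "$$x^2$$" ==> display math "x^2"
      <|> (B.math        <$> mathInline)       -- "$x^2$"   ==> inline math "x^2"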
145 src/Text/Pandoc/Parsing/Smart.hs Normal file
@ -0,0 +1,145 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module      : Text.Pandoc.Parsing.Smart
Copyright   : © 2006-2022 John MacFarlane
License     : GPL-2.0-or-later
Maintainer  : John MacFarlane <jgm@berkeley.edu>

Smart parsing of quotes, dashes, and other character combinations.
-}

module Text.Pandoc.Parsing.Smart
  ( apostrophe
  , dash
  , doubleCloseQuote
  , doubleQuoteEnd
  , doubleQuoteStart
  , doubleQuoted
  , ellipses
  , singleQuoteEnd
  , singleQuoteStart
  , singleQuoted
  , smartPunctuation
  )
where

import Control.Monad (guard, void)
import Text.Pandoc.Builder (Inlines)
import Text.Pandoc.Options
  ( extensionEnabled
  , Extension(Ext_old_dashes, Ext_smart)
  , ReaderOptions(readerExtensions) )
import Text.Pandoc.Sources
import Text.Pandoc.Parsing.Capabilities
import Text.Pandoc.Parsing.Combinators
import Text.Pandoc.Parsing.Types (ParserT)
import Text.Parsec
  ( (<|>)
  , Stream(..)
  , choice
  , lookAhead
  , manyTill
  , notFollowedBy
  , try
  )

import qualified Text.Pandoc.Builder as B

smartPunctuation :: (HasReaderOptions st, HasLastStrPosition st,
                     HasQuoteContext st m,
                     Stream s m Char, UpdateSourcePos s Char)
                 => ParserT s st m Inlines
                 -> ParserT s st m Inlines
smartPunctuation inlineParser = do
  guardEnabled Ext_smart
  choice [ quoted inlineParser, apostrophe, doubleCloseQuote, dash, ellipses ]

quoted :: (HasLastStrPosition st, HasQuoteContext st m,
           Stream s m Char, UpdateSourcePos s Char)
       => ParserT s st m Inlines
       -> ParserT s st m Inlines
quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser

singleQuoted :: (HasLastStrPosition st, HasQuoteContext st m,
                 Stream s m Char, UpdateSourcePos s Char)
             => ParserT s st m Inlines
             -> ParserT s st m Inlines
singleQuoted inlineParser = do
  singleQuoteStart
  (B.singleQuoted . mconcat <$>
    try
     (withQuoteContext InSingleQuote (many1Till inlineParser singleQuoteEnd)))
    <|> pure "\8217"

doubleQuoted :: (HasQuoteContext st m, HasLastStrPosition st,
                 Stream s m Char, UpdateSourcePos s Char)
             => ParserT s st m Inlines
             -> ParserT s st m Inlines
doubleQuoted inlineParser = do
  doubleQuoteStart
  (B.doubleQuoted . mconcat <$>
    try
     (withQuoteContext InDoubleQuote (manyTill inlineParser doubleQuoteEnd)))
    <|> pure (B.str "\8220")

charOrRef :: (Stream s m Char, UpdateSourcePos s Char) => [Char] -> ParserT s st m Char
charOrRef cs =
  oneOf cs <|> try (do c <- characterReference
                       guard (c `elem` cs)
                       return c)

singleQuoteStart :: (HasLastStrPosition st, HasQuoteContext st m,
                     Stream s m Char, UpdateSourcePos s Char)
                 => ParserT s st m ()
singleQuoteStart = do
  failIfInQuoteContext InSingleQuote
  -- single quote start can't be right after str
  guard =<< notAfterString
  try $ do
    charOrRef "'\8216\145"
    void $ lookAhead (satisfy (not . isSpaceChar))

singleQuoteEnd :: (Stream s m Char, UpdateSourcePos s Char)
               => ParserT s st m ()
singleQuoteEnd = try $ do
  charOrRef "'\8217\146"
  notFollowedBy alphaNum

doubleQuoteStart :: (HasLastStrPosition st,
                     HasQuoteContext st m,
                     Stream s m Char, UpdateSourcePos s Char)
                 => ParserT s st m ()
doubleQuoteStart = do
  failIfInQuoteContext InDoubleQuote
  guard =<< notAfterString
  try $ do charOrRef "\"\8220\147"
           void $ lookAhead (satisfy (not . isSpaceChar))

doubleQuoteEnd :: (Stream s m Char, UpdateSourcePos s Char)
               => ParserT s st m ()
doubleQuoteEnd = void (charOrRef "\"\8221\148")

apostrophe :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines
apostrophe = (char '\'' <|> char '\8217') >> return (B.str "\8217")

doubleCloseQuote :: (Stream s m Char, UpdateSourcePos s Char) => ParserT s st m Inlines
doubleCloseQuote = B.str "\8221" <$ char '"'

ellipses :: (Stream s m Char, UpdateSourcePos s Char)
         => ParserT s st m Inlines
ellipses = try (string "..." >> return (B.str "\8230"))

dash :: (HasReaderOptions st, Stream s m Char, UpdateSourcePos s Char)
     => ParserT s st m Inlines
dash = try $ do
  oldDashes <- extensionEnabled Ext_old_dashes <$> getOption readerExtensions
  if oldDashes
     then do
       char '-'
       (char '-' >> return (B.str "\8212"))
         <|> (lookAhead digit >> return (B.str "\8211"))
     else do
       string "--"
       (char '-' >> return (B.str "\8212"))
         <|> return (B.str "\8211")
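A rough sketch of how a reader could wire smartPunctuation into its inline parser; the inline and word parsers below are hypothetical stand-ins (not code from this commit), assuming ParserState, Sources, and Text.Pandoc.Builder as B:

-- Hypothetical wiring, for illustration only: the inline parser passes
-- itself to smartPunctuation so quoted spans may contain any inline.
inline :: Monad m => ParserT Sources ParserState m Inlines
inline = choice
  [ smartPunctuation inline            -- quotes, dashes, ellipses (Ext_smart)
  , B.str <$> many1Char alphaNum       -- stand-in for a real word parser
  , B.space <$ spaceChar
  ]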
191 src/Text/Pandoc/Parsing/State.hs Normal file
@ -0,0 +1,191 @@
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module      : Text.Pandoc.Parsing.State
Copyright   : Copyright (C) 2006-2022 John MacFarlane
License     : GPL-2.0-or-later
Maintainer  : John MacFarlane <jgm@berkeley.edu>

A default parser state with commonly used properties.
-}

module Text.Pandoc.Parsing.State
  ( ParserState (..)
  , ParserContext (..)
  , HeaderType (..)
  , NoteTable
  , NoteTable'
  , Key (..)
  , KeyTable
  , SubstTable
  , defaultParserState
  , toKey
  )
where

import Data.Default (Default (def))
import Data.Text (Text)
import Text.Parsec (SourcePos, getState, setState)
import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines)
import Text.Pandoc.Definition (Attr, Meta, Target, nullMeta)
import Text.Pandoc.Logging (LogMessage)
import Text.Pandoc.Options (ReaderOptions)
import Text.Pandoc.Parsing.Capabilities
import Text.Pandoc.Parsing.Types
import Text.Pandoc.Readers.LaTeX.Types (Macro)

import qualified Data.Map as M
import qualified Data.Set as Set
import qualified Data.Text as T

-- | Parsing options.
data ParserState = ParserState
  { stateOptions           :: ReaderOptions  -- ^ User options
  , stateParserContext     :: ParserContext  -- ^ Inside list?
  , stateQuoteContext      :: QuoteContext   -- ^ Inside quoted environment?
  , stateAllowLinks        :: Bool           -- ^ Allow parsing of links
  , stateAllowLineBreaks   :: Bool           -- ^ Allow parsing of line breaks
  , stateMaxNestingLevel   :: Int            -- ^ Max # of nested Strong/Emph
  , stateLastStrPos        :: Maybe SourcePos -- ^ Position after last str parsed
  , stateKeys              :: KeyTable       -- ^ List of reference keys
  , stateHeaderKeys        :: KeyTable       -- ^ List of implicit header ref keys
  , stateSubstitutions     :: SubstTable     -- ^ List of substitution references
  , stateNotes             :: NoteTable      -- ^ List of notes (raw bodies)
  , stateNotes'            :: NoteTable'     -- ^ List of notes (parsed bodies)
  , stateNoteRefs          :: Set.Set Text   -- ^ List of note references used
  , stateInNote            :: Bool           -- ^ True if parsing note contents
  , stateNoteNumber        :: Int            -- ^ Last note number for citations
  , stateMeta              :: Meta           -- ^ Document metadata
  , stateMeta'             :: Future ParserState Meta -- ^ Document metadata
  , stateCitations         :: M.Map Text Text -- ^ RST-style citations
  , stateHeaderTable       :: [HeaderType]   -- ^ Ordered list of header types used
  , stateIdentifiers       :: Set.Set Text   -- ^ Header identifiers used
  , stateNextExample       :: Int            -- ^ Number of next example
  , stateExamples          :: M.Map Text Int -- ^ Map from example labels to numbers
  , stateMacros            :: M.Map Text Macro -- ^ Table of macros defined so far
  , stateRstDefaultRole    :: Text           -- ^ Current rST default
                                             --   interpreted text role
  , stateRstHighlight      :: Maybe Text     -- ^ Current rST literal block
                                             --   language
  , stateRstCustomRoles    :: M.Map Text (Text, Maybe Text, Attr)
                                             -- ^ Current rST custom text roles;
                                             --   the triple represents: 1) base role,
                                             --   2) optional format (only for :raw:
                                             --   roles), 3) additional classes (rest
                                             --   of Attr is unused).
  , stateCaption           :: Maybe Inlines  -- ^ Caption in current environment
  , stateInHtmlBlock       :: Maybe Text     -- ^ Tag type of HTML block being parsed
  , stateFencedDivLevel    :: Int            -- ^ Depth of fenced div
  , stateContainers        :: [Text]         -- ^ Parent include files
  , stateLogMessages       :: [LogMessage]   -- ^ Log messages
  , stateMarkdownAttribute :: Bool           -- ^ True if in markdown=1 context
  }

instance Default ParserState where
  def = defaultParserState

instance HasMeta ParserState where
  setMeta field val st =
    st{ stateMeta = setMeta field val $ stateMeta st }
  deleteMeta field st =
    st{ stateMeta = deleteMeta field $ stateMeta st }

instance HasReaderOptions ParserState where
  extractReaderOptions = stateOptions

instance Monad m => HasQuoteContext ParserState m where
  getQuoteContext = stateQuoteContext <$> getState
  withQuoteContext context parser = do
    oldState <- getState
    let oldQuoteContext = stateQuoteContext oldState
    setState oldState { stateQuoteContext = context }
    result <- parser
    newState <- getState
    setState newState { stateQuoteContext = oldQuoteContext }
    return result

instance HasIdentifierList ParserState where
  extractIdentifierList = stateIdentifiers
  updateIdentifierList f st = st{ stateIdentifiers = f $ stateIdentifiers st }

instance HasMacros ParserState where
  extractMacros = stateMacros
  updateMacros f st = st{ stateMacros = f $ stateMacros st }

instance HasLastStrPosition ParserState where
  setLastStrPos pos st = st{ stateLastStrPos = pos }
  getLastStrPos st = stateLastStrPos st

instance HasLogMessages ParserState where
  addLogMessage msg st = st{ stateLogMessages = msg : stateLogMessages st }
  getLogMessages st = reverse $ stateLogMessages st

instance HasIncludeFiles ParserState where
  getIncludeFiles = stateContainers
  addIncludeFile f s = s{ stateContainers = f : stateContainers s }
  dropLatestIncludeFile s = s { stateContainers = drop 1 $ stateContainers s }

data ParserContext
  = ListItemState -- ^ Used when running parser on list item contents
  | NullState     -- ^ Default state
  deriving (Eq, Show)

data HeaderType
  = SingleHeader Char -- ^ Single line of characters underneath
  | DoubleHeader Char -- ^ Lines of characters above and below
  deriving (Eq, Show)

defaultParserState :: ParserState
defaultParserState = ParserState
  { stateOptions           = def
  , stateParserContext     = NullState
  , stateQuoteContext      = NoQuote
  , stateAllowLinks        = True
  , stateAllowLineBreaks   = True
  , stateMaxNestingLevel   = 6
  , stateLastStrPos        = Nothing
  , stateKeys              = M.empty
  , stateHeaderKeys        = M.empty
  , stateSubstitutions     = M.empty
  , stateNotes             = []
  , stateNotes'            = M.empty
  , stateNoteRefs          = Set.empty
  , stateInNote            = False
  , stateNoteNumber        = 0
  , stateMeta              = nullMeta
  , stateMeta'             = return nullMeta
  , stateCitations         = M.empty
  , stateHeaderTable       = []
  , stateIdentifiers       = Set.empty
  , stateNextExample       = 1
  , stateExamples          = M.empty
  , stateMacros            = M.empty
  , stateRstDefaultRole    = "title-reference"
  , stateRstHighlight      = Nothing
  , stateRstCustomRoles    = M.empty
  , stateCaption           = Nothing
  , stateInHtmlBlock       = Nothing
  , stateFencedDivLevel    = 0
  , stateContainers        = []
  , stateLogMessages       = []
  , stateMarkdownAttribute = False
  }

type NoteTable = [(Text, Text)]

type NoteTable' = M.Map Text (SourcePos, Future ParserState Blocks)
  -- used in markdown reader

newtype Key = Key Text deriving (Show, Read, Eq, Ord)

toKey :: Text -> Key
toKey = Key . T.toLower . T.unwords . T.words . unbracket
  where unbracket t
          | Just ('[', t') <- T.uncons t
          , Just (t'', ']') <- T.unsnoc t'
          = t''
          | otherwise
          = t

type KeyTable = M.Map Key (Target, Attr)

type SubstTable = M.Map Key Inlines
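For illustration only (the helper below is hypothetical): toKey normalizes case, whitespace, and surrounding brackets, so reference lookups in stateKeys are insensitive to those differences.

-- toKey "[Foo   Bar]" == toKey "foo bar"  ==>  both map to Key "foo bar"
lookupRefKey :: Text -> ParserState -> Maybe (Target, Attr)
lookupRefKey raw st = M.lookup (toKey raw) (stateKeys st)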
57 src/Text/Pandoc/Parsing/Types.hs Normal file
@ -0,0 +1,57 @@
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{- |
Module      : Text.Pandoc.Parsing.Types
Copyright   : Copyright (C) 2006-2022 John MacFarlane
License     : GPL-2.0-or-later
Maintainer  : John MacFarlane <jgm@berkeley.edu>

Types and type-related functions for parsers.
-}

module Text.Pandoc.Parsing.Types
  ( Parser
  , ParserT
  , Future (..)
  , runF
  , askF
  , asksF
  , returnF
  )
where

import Control.Applicative (liftA2)
import Control.Monad.Reader
  ( asks, runReader, MonadReader(ask), Reader, ReaderT(ReaderT) )
import Text.Parsec (Parsec, ParsecT)

-- | Generic parser type used by many pandoc readers.
type Parser t s = Parsec t s

-- | Generic parser transformer used by many pandoc readers.
type ParserT = ParsecT

-- | Reader monad wrapping the parser state. This is used to possibly
-- delay evaluation until all relevant information has been parsed and
-- made available in the parser state.
newtype Future s a = Future { runDelayed :: Reader s a }
  deriving (Monad, Applicative, Functor)

instance Semigroup a => Semigroup (Future s a) where
  (<>) = liftA2 (<>)

instance (Semigroup a, Monoid a) => Monoid (Future s a) where
  mempty = return mempty
  mappend = (<>)

-- | Run a delayed action with the given state.
runF :: Future s a -> s -> a
runF = runReader . runDelayed

askF :: Future s s
askF = Future ask

asksF :: (s -> a) -> Future s a
asksF f = Future $ asks f

returnF :: Monad m => a -> m (Future s a)
returnF = return . return
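A small, purely illustrative sketch of the Future pattern; it assumes ParserState and its stateNoteNumber field from Text.Pandoc.Parsing.State and is not part of this commit:

-- Build a delayed value now; force it with runF once the final state exists.
delayedNoteCount :: Future ParserState Int
delayedNoteCount = asksF stateNoteNumber

-- e.g. runF delayedNoteCount finalState :: Int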