Added Text.Pandoc.Readers.LaTeX.Parsing (unexported).
This collects some of the general-purpose code from the LaTeX reader, with the aim of making the module smaller. (We've been having out-of-memory issues compiling this module on CI.)
commit 9dac993835 (parent 4f9ab7e032)
3 changed files with 667 additions and 557 deletions
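The new Text.Pandoc.Readers.LaTeX.Parsing module collects the tokenizer and the token-level combinators (tokenize, untokenize, satisfyTok, braced, and so on), leaving the LaTeX-specific block and inline parsing in the main reader. As a rough illustration, not part of this commit (roundTripsSource is an invented name), of the round-trip behaviour the relocated tokenizer is built around -- untokenize simply concatenates the literal text carried by each Tok -- one could write, inside the pandoc source tree:

    import qualified Data.Text as T
    import Text.Pandoc.Readers.LaTeX.Parsing (tokenize, untokenize)

    -- Tokenizing and then untokenizing should reproduce the source text,
    -- since each Tok stores the exact text it was read from.
    roundTripsSource :: T.Text -> Bool
    roundTripsSource src = untokenize (tokenize "source" src) == src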
@@ -508,6 +508,7 @@ library
Text.Pandoc.Readers.Docx.Util,
Text.Pandoc.Readers.Docx.StyleMap,
Text.Pandoc.Readers.Docx.Fields,
Text.Pandoc.Readers.LaTeX.Parsing,
Text.Pandoc.Readers.Odt.Base,
Text.Pandoc.Readers.Odt.Namespaces,
Text.Pandoc.Readers.Odt.StyleReader,
@@ -47,8 +47,7 @@ import Prelude
import Control.Applicative (many, optional, (<|>))
import Control.Monad
import Control.Monad.Except (throwError)
import Control.Monad.Trans (lift)
import Data.Char (chr, isAlphaNum, isDigit, isLetter, ord, toLower, toUpper)
import Data.Char (isDigit, isLetter, toLower, toUpper)
import Data.Default
import Data.List (intercalate, isPrefixOf)
import qualified Data.Map as M
@@ -63,7 +62,7 @@ import Text.Pandoc.Builder
import Text.Pandoc.Class (PandocMonad, PandocPure, getResourcePath, lookupEnv,
readFileFromDirs, report, setResourcePath,
setTranslations, translateTerm, trace)
import Text.Pandoc.Error (PandocError (PandocMacroLoop, PandocParseError, PandocParsecError))
import Text.Pandoc.Error (PandocError ( PandocParseError, PandocParsecError))
import Text.Pandoc.Highlighting (fromListingsLanguage, languagesByExtension)
import Text.Pandoc.ImageSize (numUnit, showFl)
import Text.Pandoc.Logging
@@ -72,10 +71,10 @@ import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline,
optional, space, spaces, withRaw, (<|>))
import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..),
ArgSpec (..), Tok (..), TokType (..))
import Text.Pandoc.Readers.LaTeX.Parsing
import Text.Pandoc.Shared
import qualified Text.Pandoc.Translations as Translations
import Text.Pandoc.Walk
import Text.Parsec.Pos
import qualified Text.Pandoc.Builder as B

-- for debugging:
@@ -137,151 +136,6 @@ resolveRefs _ x = x
|
|||
-- Left e -> error (show e)
|
||||
-- Right r -> return r
|
||||
|
||||
newtype DottedNum = DottedNum [Int]
|
||||
deriving (Show)
|
||||
|
||||
renderDottedNum :: DottedNum -> String
|
||||
renderDottedNum (DottedNum xs) =
|
||||
intercalate "." (map show xs)
|
||||
|
||||
incrementDottedNum :: Int -> DottedNum -> DottedNum
|
||||
incrementDottedNum level (DottedNum ns) = DottedNum $
|
||||
case reverse (take level (ns ++ repeat 0)) of
|
||||
(x:xs) -> reverse (x+1 : xs)
|
||||
[] -> [] -- shouldn't happen
|
||||
|
||||
data LaTeXState = LaTeXState{ sOptions :: ReaderOptions
|
||||
, sMeta :: Meta
|
||||
, sQuoteContext :: QuoteContext
|
||||
, sMacros :: M.Map Text Macro
|
||||
, sContainers :: [String]
|
||||
, sHeaders :: M.Map Inlines String
|
||||
, sLogMessages :: [LogMessage]
|
||||
, sIdentifiers :: Set.Set String
|
||||
, sVerbatimMode :: Bool
|
||||
, sCaption :: (Maybe Inlines, Maybe String)
|
||||
, sInListItem :: Bool
|
||||
, sInTableCell :: Bool
|
||||
, sLastHeaderNum :: DottedNum
|
||||
, sLastFigureNum :: DottedNum
|
||||
, sLabels :: M.Map String [Inline]
|
||||
, sHasChapters :: Bool
|
||||
, sToggles :: M.Map String Bool
|
||||
}
|
||||
deriving Show
|
||||
|
||||
defaultLaTeXState :: LaTeXState
|
||||
defaultLaTeXState = LaTeXState{ sOptions = def
|
||||
, sMeta = nullMeta
|
||||
, sQuoteContext = NoQuote
|
||||
, sMacros = M.empty
|
||||
, sContainers = []
|
||||
, sHeaders = M.empty
|
||||
, sLogMessages = []
|
||||
, sIdentifiers = Set.empty
|
||||
, sVerbatimMode = False
|
||||
, sCaption = (Nothing, Nothing)
|
||||
, sInListItem = False
|
||||
, sInTableCell = False
|
||||
, sLastHeaderNum = DottedNum []
|
||||
, sLastFigureNum = DottedNum []
|
||||
, sLabels = M.empty
|
||||
, sHasChapters = False
|
||||
, sToggles = M.empty
|
||||
}
|
||||
|
||||
instance PandocMonad m => HasQuoteContext LaTeXState m where
|
||||
getQuoteContext = sQuoteContext <$> getState
|
||||
withQuoteContext context parser = do
|
||||
oldState <- getState
|
||||
let oldQuoteContext = sQuoteContext oldState
|
||||
setState oldState { sQuoteContext = context }
|
||||
result <- parser
|
||||
newState <- getState
|
||||
setState newState { sQuoteContext = oldQuoteContext }
|
||||
return result
|
||||
|
||||
instance HasLogMessages LaTeXState where
|
||||
addLogMessage msg st = st{ sLogMessages = msg : sLogMessages st }
|
||||
getLogMessages st = reverse $ sLogMessages st
|
||||
|
||||
instance HasIdentifierList LaTeXState where
|
||||
extractIdentifierList = sIdentifiers
|
||||
updateIdentifierList f st = st{ sIdentifiers = f $ sIdentifiers st }
|
||||
|
||||
instance HasIncludeFiles LaTeXState where
|
||||
getIncludeFiles = sContainers
|
||||
addIncludeFile f s = s{ sContainers = f : sContainers s }
|
||||
dropLatestIncludeFile s = s { sContainers = drop 1 $ sContainers s }
|
||||
|
||||
instance HasHeaderMap LaTeXState where
|
||||
extractHeaderMap = sHeaders
|
||||
updateHeaderMap f st = st{ sHeaders = f $ sHeaders st }
|
||||
|
||||
instance HasMacros LaTeXState where
|
||||
extractMacros st = sMacros st
|
||||
updateMacros f st = st{ sMacros = f (sMacros st) }
|
||||
|
||||
instance HasReaderOptions LaTeXState where
|
||||
extractReaderOptions = sOptions
|
||||
|
||||
instance HasMeta LaTeXState where
|
||||
setMeta field val st =
|
||||
st{ sMeta = setMeta field val $ sMeta st }
|
||||
deleteMeta field st =
|
||||
st{ sMeta = deleteMeta field $ sMeta st }
|
||||
|
||||
instance Default LaTeXState where
|
||||
def = defaultLaTeXState
|
||||
|
||||
type LP m = ParserT [Tok] LaTeXState m
|
||||
|
||||
withVerbatimMode :: PandocMonad m => LP m a -> LP m a
|
||||
withVerbatimMode parser = do
|
||||
updateState $ \st -> st{ sVerbatimMode = True }
|
||||
result <- parser
|
||||
updateState $ \st -> st{ sVerbatimMode = False }
|
||||
return result
|
||||
|
||||
rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s)
|
||||
=> Bool -> LP m a -> LP m a -> ParserT String s m (a, String)
|
||||
rawLaTeXParser retokenize parser valParser = do
|
||||
inp <- getInput
|
||||
let toks = tokenize "source" $ T.pack inp
|
||||
pstate <- getState
|
||||
let lstate = def{ sOptions = extractReaderOptions pstate }
|
||||
let lstate' = lstate { sMacros = extractMacros pstate }
|
||||
let rawparser = (,) <$> withRaw valParser <*> getState
|
||||
res' <- lift $ runParserT (snd <$> withRaw parser) lstate "chunk" toks
|
||||
case res' of
|
||||
Left _ -> mzero
|
||||
Right toks' -> do
|
||||
res <- lift $ runParserT (do when retokenize $ do
|
||||
-- retokenize, applying macros
|
||||
doMacros 0
|
||||
ts <- many (satisfyTok (const True))
|
||||
setInput ts
|
||||
rawparser)
|
||||
lstate' "chunk" toks'
|
||||
case res of
|
||||
Left _ -> mzero
|
||||
Right ((val, raw), st) -> do
|
||||
updateState (updateMacros (sMacros st <>))
|
||||
_ <- takeP (T.length (untokenize toks'))
|
||||
return (val, T.unpack (untokenize raw))
|
||||
|
||||
applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s)
|
||||
=> String -> ParserT String s m String
|
||||
applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|>
|
||||
do let retokenize = doMacros 0 *>
|
||||
(toksToString <$> many (satisfyTok (const True)))
|
||||
pstate <- getState
|
||||
let lstate = def{ sOptions = extractReaderOptions pstate
|
||||
, sMacros = extractMacros pstate }
|
||||
res <- runParserT retokenize lstate "math" (tokenize "math" (T.pack s))
|
||||
case res of
|
||||
Left e -> fail (show e)
|
||||
Right s' -> return s'
|
||||
|
||||
rawLaTeXBlock :: (PandocMonad m, HasMacros s, HasReaderOptions s)
|
||||
=> ParserT String s m String
|
||||
|
@@ -326,358 +180,6 @@ inlineCommand = do
|
|||
lookAhead (try (char '\\' >> letter))
|
||||
fst <$> rawLaTeXParser True (inlineEnvironment <|> inlineCommand') inlines
|
||||
|
||||
tokenize :: SourceName -> Text -> [Tok]
|
||||
tokenize sourcename = totoks (initialPos sourcename)
|
||||
|
||||
totoks :: SourcePos -> Text -> [Tok]
|
||||
totoks pos t =
|
||||
case T.uncons t of
|
||||
Nothing -> []
|
||||
Just (c, rest)
|
||||
| c == '\n' ->
|
||||
Tok pos Newline "\n"
|
||||
: totoks (setSourceColumn (incSourceLine pos 1) 1) rest
|
||||
| isSpaceOrTab c ->
|
||||
let (sps, rest') = T.span isSpaceOrTab t
|
||||
in Tok pos Spaces sps
|
||||
: totoks (incSourceColumn pos (T.length sps))
|
||||
rest'
|
||||
| isAlphaNum c ->
|
||||
let (ws, rest') = T.span isAlphaNum t
|
||||
in Tok pos Word ws
|
||||
: totoks (incSourceColumn pos (T.length ws)) rest'
|
||||
| c == '%' ->
|
||||
let (cs, rest') = T.break (== '\n') rest
|
||||
in Tok pos Comment ("%" <> cs)
|
||||
: totoks (incSourceColumn pos (1 + T.length cs)) rest'
|
||||
| c == '\\' ->
|
||||
case T.uncons rest of
|
||||
Nothing -> [Tok pos (CtrlSeq " ") "\\"]
|
||||
Just (d, rest')
|
||||
| isLetterOrAt d ->
|
||||
-- \makeatletter is common in macro defs;
|
||||
-- ideally we should make tokenization sensitive
|
||||
-- to \makeatletter and \makeatother, but this is
|
||||
-- probably best for now
|
||||
let (ws, rest'') = T.span isLetterOrAt rest
|
||||
(ss, rest''') = T.span isSpaceOrTab rest''
|
||||
in Tok pos (CtrlSeq ws) ("\\" <> ws <> ss)
|
||||
: totoks (incSourceColumn pos
|
||||
(1 + T.length ws + T.length ss)) rest'''
|
||||
| isSpaceOrTab d || d == '\n' ->
|
||||
let (w1, r1) = T.span isSpaceOrTab rest
|
||||
(w2, (w3, r3)) = case T.uncons r1 of
|
||||
Just ('\n', r2)
|
||||
-> (T.pack "\n",
|
||||
T.span isSpaceOrTab r2)
|
||||
_ -> (mempty, (mempty, r1))
|
||||
ws = "\\" <> w1 <> w2 <> w3
|
||||
in case T.uncons r3 of
|
||||
Just ('\n', _) ->
|
||||
Tok pos (CtrlSeq " ") ("\\" <> w1)
|
||||
: totoks (incSourceColumn pos (T.length ws))
|
||||
r1
|
||||
_ ->
|
||||
Tok pos (CtrlSeq " ") ws
|
||||
: totoks (incSourceColumn pos (T.length ws))
|
||||
r3
|
||||
| otherwise ->
|
||||
Tok pos (CtrlSeq (T.singleton d)) (T.pack [c,d])
|
||||
: totoks (incSourceColumn pos 2) rest'
|
||||
| c == '#' ->
|
||||
let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest
|
||||
in case safeRead (T.unpack t1) of
|
||||
Just i ->
|
||||
Tok pos (Arg i) ("#" <> t1)
|
||||
: totoks (incSourceColumn pos (1 + T.length t1)) t2
|
||||
Nothing ->
|
||||
Tok pos Symbol "#"
|
||||
: totoks (incSourceColumn pos 1) t2
|
||||
| c == '^' ->
|
||||
case T.uncons rest of
|
||||
Just ('^', rest') ->
|
||||
case T.uncons rest' of
|
||||
Just (d, rest'')
|
||||
| isLowerHex d ->
|
||||
case T.uncons rest'' of
|
||||
Just (e, rest''') | isLowerHex e ->
|
||||
Tok pos Esc2 (T.pack ['^','^',d,e])
|
||||
: totoks (incSourceColumn pos 4) rest'''
|
||||
_ ->
|
||||
Tok pos Esc1 (T.pack ['^','^',d])
|
||||
: totoks (incSourceColumn pos 3) rest''
|
||||
| d < '\128' ->
|
||||
Tok pos Esc1 (T.pack ['^','^',d])
|
||||
: totoks (incSourceColumn pos 3) rest''
|
||||
_ -> Tok pos Symbol "^" :
|
||||
Tok (incSourceColumn pos 1) Symbol "^" :
|
||||
totoks (incSourceColumn pos 2) rest'
|
||||
_ -> Tok pos Symbol "^"
|
||||
: totoks (incSourceColumn pos 1) rest
|
||||
| otherwise ->
|
||||
Tok pos Symbol (T.singleton c) : totoks (incSourceColumn pos 1) rest
|
||||
|
||||
isSpaceOrTab :: Char -> Bool
|
||||
isSpaceOrTab ' ' = True
|
||||
isSpaceOrTab '\t' = True
|
||||
isSpaceOrTab _ = False
|
||||
|
||||
isLetterOrAt :: Char -> Bool
|
||||
isLetterOrAt '@' = True
|
||||
isLetterOrAt c = isLetter c
|
||||
|
||||
isLowerHex :: Char -> Bool
|
||||
isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f'
|
||||
|
||||
untokenize :: [Tok] -> Text
|
||||
untokenize = mconcat . map untoken
|
||||
|
||||
untoken :: Tok -> Text
|
||||
untoken (Tok _ _ t) = t
|
||||
|
||||
satisfyTok :: PandocMonad m => (Tok -> Bool) -> LP m Tok
|
||||
satisfyTok f =
|
||||
try $ do
|
||||
res <- tokenPrim (T.unpack . untoken) updatePos matcher
|
||||
doMacros 0 -- apply macros on remaining input stream
|
||||
return res
|
||||
where matcher t | f t = Just t
|
||||
| otherwise = Nothing
|
||||
updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos
|
||||
updatePos _spos _ (Tok pos _ _ : _) = pos
|
||||
updatePos spos _ [] = incSourceColumn spos 1
|
||||
|
||||
doMacros :: PandocMonad m => Int -> LP m ()
|
||||
doMacros n = do
|
||||
verbatimMode <- sVerbatimMode <$> getState
|
||||
unless verbatimMode $ do
|
||||
inp <- getInput
|
||||
case inp of
|
||||
Tok spos (CtrlSeq "begin") _ : Tok _ Symbol "{" :
|
||||
Tok _ Word name : Tok _ Symbol "}" : ts
|
||||
-> handleMacros spos name ts
|
||||
Tok spos (CtrlSeq "end") _ : Tok _ Symbol "{" :
|
||||
Tok _ Word name : Tok _ Symbol "}" : ts
|
||||
-> handleMacros spos ("end" <> name) ts
|
||||
Tok _ (CtrlSeq "expandafter") _ : t : ts
|
||||
-> do setInput ts
|
||||
doMacros n
|
||||
getInput >>= setInput . combineTok t
|
||||
Tok spos (CtrlSeq name) _ : ts
|
||||
-> handleMacros spos name ts
|
||||
_ -> return ()
|
||||
where combineTok (Tok spos (CtrlSeq name) x) (Tok _ Word w : ts)
|
||||
| T.all isLetterOrAt w =
|
||||
Tok spos (CtrlSeq (name <> w)) (x1 <> w <> x2) : ts
|
||||
where (x1, x2) = T.break isSpaceOrTab x
|
||||
combineTok t ts = t:ts
|
||||
handleMacros spos name ts = do
|
||||
macros <- sMacros <$> getState
|
||||
case M.lookup name macros of
|
||||
Nothing -> return ()
|
||||
Just (Macro expansionPoint argspecs optarg newtoks) -> do
|
||||
setInput ts
|
||||
let matchTok (Tok _ toktype txt) =
|
||||
satisfyTok (\(Tok _ toktype' txt') ->
|
||||
toktype == toktype' &&
|
||||
txt == txt')
|
||||
let matchPattern toks = try $ mapM_ matchTok toks
|
||||
let getargs argmap [] = return argmap
|
||||
getargs argmap (Pattern toks : rest) = try $ do
|
||||
matchPattern toks
|
||||
getargs argmap rest
|
||||
getargs argmap (ArgNum i : Pattern toks : rest) =
|
||||
try $ do
|
||||
x <- mconcat <$> manyTill
|
||||
(braced <|> ((:[]) <$> anyTok))
|
||||
(matchPattern toks)
|
||||
getargs (M.insert i x argmap) rest
|
||||
getargs argmap (ArgNum i : rest) = do
|
||||
x <- try $ spaces >> bracedOrToken
|
||||
getargs (M.insert i x argmap) rest
|
||||
args <- case optarg of
|
||||
Nothing -> getargs M.empty argspecs
|
||||
Just o -> do
|
||||
x <- option o bracketedToks
|
||||
getargs (M.singleton 1 x) argspecs
|
||||
-- first boolean param is true if we're tokenizing
|
||||
-- an argument (in which case we don't want to
|
||||
-- expand #1 etc.)
|
||||
let addTok False (Tok _ (Arg i) _) acc =
|
||||
case M.lookup i args of
|
||||
Nothing -> mzero
|
||||
Just xs -> foldr (addTok True) acc xs
|
||||
-- see #4007
|
||||
addTok _ (Tok _ (CtrlSeq x) txt)
|
||||
acc@(Tok _ Word _ : _)
|
||||
| not (T.null txt) &&
|
||||
isLetter (T.last txt) =
|
||||
Tok spos (CtrlSeq x) (txt <> " ") : acc
|
||||
addTok _ t acc = setpos spos t : acc
|
||||
ts' <- getInput
|
||||
setInput $ foldr (addTok False) ts' newtoks
|
||||
case expansionPoint of
|
||||
ExpandWhenUsed ->
|
||||
if n > 20 -- detect macro expansion loops
|
||||
then throwError $ PandocMacroLoop (T.unpack name)
|
||||
else doMacros (n + 1)
|
||||
ExpandWhenDefined -> return ()
|
||||
|
||||
|
||||
setpos :: SourcePos -> Tok -> Tok
|
||||
setpos spos (Tok _ tt txt) = Tok spos tt txt
|
||||
|
||||
anyControlSeq :: PandocMonad m => LP m Tok
|
||||
anyControlSeq = satisfyTok isCtrlSeq
|
||||
|
||||
isCtrlSeq :: Tok -> Bool
|
||||
isCtrlSeq (Tok _ (CtrlSeq _) _) = True
|
||||
isCtrlSeq _ = False
|
||||
|
||||
anySymbol :: PandocMonad m => LP m Tok
|
||||
anySymbol = satisfyTok isSymbolTok
|
||||
|
||||
isSymbolTok :: Tok -> Bool
|
||||
isSymbolTok (Tok _ Symbol _) = True
|
||||
isSymbolTok _ = False
|
||||
|
||||
spaces :: PandocMonad m => LP m ()
|
||||
spaces = skipMany (satisfyTok (tokTypeIn [Comment, Spaces, Newline]))
|
||||
|
||||
spaces1 :: PandocMonad m => LP m ()
|
||||
spaces1 = skipMany1 (satisfyTok (tokTypeIn [Comment, Spaces, Newline]))
|
||||
|
||||
tokTypeIn :: [TokType] -> Tok -> Bool
|
||||
tokTypeIn toktypes (Tok _ tt _) = tt `elem` toktypes
|
||||
|
||||
controlSeq :: PandocMonad m => Text -> LP m Tok
|
||||
controlSeq name = satisfyTok isNamed
|
||||
where isNamed (Tok _ (CtrlSeq n) _) = n == name
|
||||
isNamed _ = False
|
||||
|
||||
symbol :: PandocMonad m => Char -> LP m Tok
|
||||
symbol c = satisfyTok isc
|
||||
where isc (Tok _ Symbol d) = case T.uncons d of
|
||||
Just (c',_) -> c == c'
|
||||
_ -> False
|
||||
isc _ = False
|
||||
|
||||
symbolIn :: PandocMonad m => [Char] -> LP m Tok
|
||||
symbolIn cs = satisfyTok isInCs
|
||||
where isInCs (Tok _ Symbol d) = case T.uncons d of
|
||||
Just (c,_) -> c `elem` cs
|
||||
_ -> False
|
||||
isInCs _ = False
|
||||
|
||||
sp :: PandocMonad m => LP m ()
|
||||
sp = whitespace <|> endline
|
||||
|
||||
whitespace :: PandocMonad m => LP m ()
|
||||
whitespace = () <$ satisfyTok isSpaceTok
|
||||
|
||||
isSpaceTok :: Tok -> Bool
|
||||
isSpaceTok (Tok _ Spaces _) = True
|
||||
isSpaceTok _ = False
|
||||
|
||||
newlineTok :: PandocMonad m => LP m ()
|
||||
newlineTok = () <$ satisfyTok isNewlineTok
|
||||
|
||||
isNewlineTok :: Tok -> Bool
|
||||
isNewlineTok (Tok _ Newline _) = True
|
||||
isNewlineTok _ = False
|
||||
|
||||
comment :: PandocMonad m => LP m ()
|
||||
comment = () <$ satisfyTok isCommentTok
|
||||
|
||||
isCommentTok :: Tok -> Bool
|
||||
isCommentTok (Tok _ Comment _) = True
|
||||
isCommentTok _ = False
|
||||
|
||||
anyTok :: PandocMonad m => LP m Tok
|
||||
anyTok = satisfyTok (const True)
|
||||
|
||||
endline :: PandocMonad m => LP m ()
|
||||
endline = try $ do
|
||||
newlineTok
|
||||
lookAhead anyTok
|
||||
notFollowedBy blankline
|
||||
|
||||
blankline :: PandocMonad m => LP m ()
|
||||
blankline = try $ skipMany whitespace *> newlineTok
|
||||
|
||||
primEscape :: PandocMonad m => LP m Char
|
||||
primEscape = do
|
||||
Tok _ toktype t <- satisfyTok (tokTypeIn [Esc1, Esc2])
|
||||
case toktype of
|
||||
Esc1 -> case T.uncons (T.drop 2 t) of
|
||||
Just (c, _)
|
||||
| c >= '\64' && c <= '\127' -> return (chr (ord c - 64))
|
||||
| otherwise -> return (chr (ord c + 64))
|
||||
Nothing -> fail "Empty content of Esc1"
|
||||
Esc2 -> case safeRead ('0':'x':T.unpack (T.drop 2 t)) of
|
||||
Just x -> return (chr x)
|
||||
Nothing -> fail $ "Could not read: " ++ T.unpack t
|
||||
_ -> fail "Expected an Esc1 or Esc2 token" -- should not happen
|
||||
|
||||
bgroup :: PandocMonad m => LP m Tok
|
||||
bgroup = try $ do
|
||||
skipMany sp
|
||||
symbol '{' <|> controlSeq "bgroup" <|> controlSeq "begingroup"
|
||||
|
||||
egroup :: PandocMonad m => LP m Tok
|
||||
egroup = symbol '}' <|> controlSeq "egroup" <|> controlSeq "endgroup"
|
||||
|
||||
grouped :: (PandocMonad m, Monoid a) => LP m a -> LP m a
|
||||
grouped parser = try $ do
|
||||
bgroup
|
||||
-- first we check for an inner 'grouped', because
|
||||
-- {{a,b}} should be parsed the same as {a,b}
|
||||
try (grouped parser <* egroup) <|> (mconcat <$> manyTill parser egroup)
|
||||
|
||||
braced' :: PandocMonad m => LP m Tok -> Int -> LP m [Tok]
|
||||
braced' getTok n =
|
||||
handleEgroup <|> handleBgroup <|> handleOther
|
||||
where handleEgroup = do
|
||||
t <- egroup
|
||||
if n == 1
|
||||
then return []
|
||||
else (t:) <$> braced' getTok (n - 1)
|
||||
handleBgroup = do
|
||||
t <- bgroup
|
||||
(t:) <$> braced' getTok (n + 1)
|
||||
handleOther = do
|
||||
t <- getTok
|
||||
(t:) <$> braced' getTok n
|
||||
|
||||
braced :: PandocMonad m => LP m [Tok]
|
||||
braced = bgroup *> braced' anyTok 1
|
||||
|
||||
-- URLs require special handling, because they can contain %
|
||||
-- characters. So we retokenize comments as we go...
|
||||
bracedUrl :: PandocMonad m => LP m [Tok]
|
||||
bracedUrl = bgroup *> braced' (retokenizeComment >> anyTok) 1
|
||||
|
||||
bracketed :: PandocMonad m => Monoid a => LP m a -> LP m a
|
||||
bracketed parser = try $ do
|
||||
symbol '['
|
||||
mconcat <$> manyTill parser (symbol ']')
|
||||
|
||||
parenWrapped :: PandocMonad m => Monoid a => LP m a -> LP m a
|
||||
parenWrapped parser = try $ do
|
||||
symbol '('
|
||||
mconcat <$> manyTill parser (symbol ')')
|
||||
|
||||
dimenarg :: PandocMonad m => LP m Text
|
||||
dimenarg = try $ do
|
||||
ch <- option False $ True <$ symbol '='
|
||||
Tok _ _ s <- satisfyTok isWordTok
|
||||
guard $ T.take 2 (T.reverse s) `elem`
|
||||
["pt","pc","in","bp","cm","mm","dd","cc","sp"]
|
||||
let num = T.take (T.length s - 2) s
|
||||
guard $ T.length num > 0
|
||||
guard $ T.all isDigit num
|
||||
return $ T.pack ['=' | ch] <> s
|
||||
|
||||
-- inline elements:
|
||||
|
||||
word :: PandocMonad m => LP m Inlines
|
||||
|
@@ -689,13 +191,6 @@ regularSymbol = (str . T.unpack . untoken) <$> satisfyTok isRegularSymbol
|
|||
isRegularSymbol _ = False
|
||||
isSpecial c = c `Set.member` specialChars
|
||||
|
||||
specialChars :: Set.Set Char
|
||||
specialChars = Set.fromList "#$%&~_^\\{}"
|
||||
|
||||
isWordTok :: Tok -> Bool
|
||||
isWordTok (Tok _ Word _) = True
|
||||
isWordTok _ = False
|
||||
|
||||
inlineGroup :: PandocMonad m => LP m Inlines
|
||||
inlineGroup = do
|
||||
ils <- grouped inline
|
||||
|
@@ -1396,9 +891,6 @@ breve 'U' = "Ŭ"
|
|||
breve 'u' = "ŭ"
|
||||
breve c = [c]
|
||||
|
||||
toksToString :: [Tok] -> String
|
||||
toksToString = T.unpack . untokenize
|
||||
|
||||
mathDisplay :: String -> Inlines
|
||||
mathDisplay = displayMath . trim
|
||||
|
||||
|
@@ -1562,19 +1054,6 @@ tok = try $ spaces >> grouped inline <|> inlineCommand' <|> singleChar'
|
|||
Tok _ _ t <- singleChar
|
||||
return (str (T.unpack t))
|
||||
|
||||
singleChar :: PandocMonad m => LP m Tok
|
||||
singleChar = try $ do
|
||||
Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol])
|
||||
guard $ not $ toktype == Symbol &&
|
||||
T.any (`Set.member` specialChars) t
|
||||
if T.length t > 1
|
||||
then do
|
||||
let (t1, t2) = (T.take 1 t, T.drop 1 t)
|
||||
inp <- getInput
|
||||
setInput $ Tok (incSourceColumn pos 1) toktype t2 : inp
|
||||
return $ Tok pos toktype t1
|
||||
else return $ Tok pos toktype t
|
||||
|
||||
opt :: PandocMonad m => LP m Inlines
|
||||
opt = bracketed inline <|> (str . T.unpack <$> rawopt)
|
||||
|
||||
|
@@ -1611,20 +1090,6 @@ overlayTok =
|
|||
Tok _ Symbol c -> c `elem` ["-","+","@","|",":",","]
|
||||
_ -> False)
|
||||
|
||||
ignore :: (Monoid a, PandocMonad m) => String -> ParserT s u m a
|
||||
ignore raw = do
|
||||
pos <- getPosition
|
||||
report $ SkippedContent raw pos
|
||||
return mempty
|
||||
|
||||
withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok])
|
||||
withRaw parser = do
|
||||
inp <- getInput
|
||||
result <- parser
|
||||
nxt <- option (Tok (initialPos "source") Word "") (lookAhead anyTok)
|
||||
let raw = takeWhile (/= nxt) inp
|
||||
return (result, raw)
|
||||
|
||||
inBrackets :: Inlines -> Inlines
|
||||
inBrackets x = str "[" <> x <> str "]"
|
||||
|
||||
|
@@ -1634,17 +1099,6 @@ unescapeURL ('\\':x:xs) | isEscapable x = x:unescapeURL xs
|
|||
unescapeURL (x:xs) = x:unescapeURL xs
|
||||
unescapeURL [] = ""
|
||||
|
||||
-- For handling URLs, which allow literal % characters...
|
||||
retokenizeComment :: PandocMonad m => LP m ()
|
||||
retokenizeComment = (do
|
||||
Tok pos Comment txt <- satisfyTok isCommentTok
|
||||
let updPos (Tok pos' toktype' txt') =
|
||||
Tok (incSourceColumn (incSourceLine pos' (sourceLine pos - 1))
|
||||
(sourceColumn pos)) toktype' txt'
|
||||
let newtoks = map updPos $ tokenize (sourceName pos) $ T.tail txt
|
||||
getInput >>= setInput . ((Tok pos Symbol "%" : newtoks) ++))
|
||||
<|> return ()
|
||||
|
||||
mathEnvWith :: PandocMonad m
|
||||
=> (Inlines -> a) -> Maybe Text -> Text -> LP m a
|
||||
mathEnvWith f innerEnv name = f . mathDisplay . inner <$> mathEnv name
|
||||
|
@ -2364,9 +1818,6 @@ isArgTok :: Tok -> Bool
|
|||
isArgTok (Tok _ (Arg _) _) = True
|
||||
isArgTok _ = False
|
||||
|
||||
bracedOrToken :: PandocMonad m => LP m [Tok]
|
||||
bracedOrToken = braced <|> ((:[]) <$> (anyControlSeq <|> singleChar))
|
||||
|
||||
newcommand :: PandocMonad m => LP m (Text, Macro)
|
||||
newcommand = do
|
||||
pos <- getPosition
|
||||
|
@@ -2417,11 +1868,6 @@ newenvironment = do
|
|||
return (name, Macro ExpandWhenUsed argspecs optarg startcontents,
|
||||
Macro ExpandWhenUsed [] Nothing endcontents)
|
||||
|
||||
bracketedToks :: PandocMonad m => LP m [Tok]
|
||||
bracketedToks = do
|
||||
symbol '['
|
||||
mconcat <$> manyTill (braced <|> (:[]) <$> anyTok) (symbol ']')
|
||||
|
||||
bracketedNum :: PandocMonad m => LP m Int
|
||||
bracketedNum = do
|
||||
ds <- untokenize <$> bracketedToks
|
||||
|
src/Text/Pandoc/Readers/LaTeX/Parsing.hs (new file, 663 additions)
@@ -0,0 +1,663 @@
|
|||
{-# LANGUAGE NoImplicitPrelude #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE ScopedTypeVariables #-}
|
||||
{-
|
||||
Copyright (C) 2006-2018 John MacFarlane <jgm@berkeley.edu>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
-}
|
||||
|
||||
{- |
|
||||
Module : Text.Pandoc.Readers.LaTeX.Parsing
|
||||
Copyright : Copyright (C) 2006-2018 John MacFarlane
|
||||
License : GNU GPL, version 2 or above
|
||||
|
||||
Maintainer : John MacFarlane <jgm@berkeley.edu>
|
||||
Stability : alpha
|
||||
Portability : portable
|
||||
|
||||
General parsing types and functions for LaTeX.
|
||||
-}
|
||||
module Text.Pandoc.Readers.LaTeX.Parsing
|
||||
( DottedNum(..)
|
||||
, renderDottedNum
|
||||
, incrementDottedNum
|
||||
, LaTeXState(..)
|
||||
, defaultLaTeXState
|
||||
, LP
|
||||
, withVerbatimMode
|
||||
, rawLaTeXParser
|
||||
, applyMacros
|
||||
, tokenize
|
||||
, untokenize
|
||||
, untoken
|
||||
, totoks
|
||||
, toksToString
|
||||
, satisfyTok
|
||||
, doMacros
|
||||
, setpos
|
||||
, anyControlSeq
|
||||
, anySymbol
|
||||
, isWordTok
|
||||
, isNewlineTok
|
||||
, spaces
|
||||
, spaces1
|
||||
, tokTypeIn
|
||||
, controlSeq
|
||||
, symbol
|
||||
, symbolIn
|
||||
, sp
|
||||
, whitespace
|
||||
, newlineTok
|
||||
, comment
|
||||
, anyTok
|
||||
, singleChar
|
||||
, specialChars
|
||||
, endline
|
||||
, blankline
|
||||
, primEscape
|
||||
, bgroup
|
||||
, egroup
|
||||
, grouped
|
||||
, braced
|
||||
, braced'
|
||||
, bracedUrl
|
||||
, bracedOrToken
|
||||
, bracketed
|
||||
, bracketedToks
|
||||
, parenWrapped
|
||||
, dimenarg
|
||||
, ignore
|
||||
, withRaw
|
||||
) where
|
||||
|
||||
import Prelude
|
||||
import Control.Applicative (many, (<|>))
|
||||
import Control.Monad
|
||||
import Control.Monad.Except (throwError)
|
||||
import Control.Monad.Trans (lift)
|
||||
import Data.Char (chr, isAlphaNum, isDigit, isLetter, ord)
|
||||
import Data.Default
|
||||
import Data.List (intercalate)
|
||||
import qualified Data.Map as M
|
||||
import qualified Data.Set as Set
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
import Text.Pandoc.Builder
|
||||
import Text.Pandoc.Class (PandocMonad, report)
|
||||
import Text.Pandoc.Error (PandocError (PandocMacroLoop))
|
||||
import Text.Pandoc.Logging
|
||||
import Text.Pandoc.Options
|
||||
import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline,
|
||||
optional, space, spaces, withRaw, (<|>))
|
||||
import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..),
|
||||
ArgSpec (..), Tok (..), TokType (..))
|
||||
import Text.Pandoc.Shared
|
||||
import Text.Parsec.Pos
|
||||
|
||||
newtype DottedNum = DottedNum [Int]
|
||||
deriving (Show)
|
||||
|
||||
renderDottedNum :: DottedNum -> String
|
||||
renderDottedNum (DottedNum xs) =
|
||||
intercalate "." (map show xs)
|
||||
|
||||
incrementDottedNum :: Int -> DottedNum -> DottedNum
|
||||
incrementDottedNum level (DottedNum ns) = DottedNum $
|
||||
case reverse (take level (ns ++ repeat 0)) of
|
||||
(x:xs) -> reverse (x+1 : xs)
|
||||
[] -> [] -- shouldn't happen
|
||||
|
||||
data LaTeXState = LaTeXState{ sOptions :: ReaderOptions
|
||||
, sMeta :: Meta
|
||||
, sQuoteContext :: QuoteContext
|
||||
, sMacros :: M.Map Text Macro
|
||||
, sContainers :: [String]
|
||||
, sHeaders :: M.Map Inlines String
|
||||
, sLogMessages :: [LogMessage]
|
||||
, sIdentifiers :: Set.Set String
|
||||
, sVerbatimMode :: Bool
|
||||
, sCaption :: (Maybe Inlines, Maybe String)
|
||||
, sInListItem :: Bool
|
||||
, sInTableCell :: Bool
|
||||
, sLastHeaderNum :: DottedNum
|
||||
, sLastFigureNum :: DottedNum
|
||||
, sLabels :: M.Map String [Inline]
|
||||
, sHasChapters :: Bool
|
||||
, sToggles :: M.Map String Bool
|
||||
}
|
||||
deriving Show
|
||||
|
||||
defaultLaTeXState :: LaTeXState
|
||||
defaultLaTeXState = LaTeXState{ sOptions = def
|
||||
, sMeta = nullMeta
|
||||
, sQuoteContext = NoQuote
|
||||
, sMacros = M.empty
|
||||
, sContainers = []
|
||||
, sHeaders = M.empty
|
||||
, sLogMessages = []
|
||||
, sIdentifiers = Set.empty
|
||||
, sVerbatimMode = False
|
||||
, sCaption = (Nothing, Nothing)
|
||||
, sInListItem = False
|
||||
, sInTableCell = False
|
||||
, sLastHeaderNum = DottedNum []
|
||||
, sLastFigureNum = DottedNum []
|
||||
, sLabels = M.empty
|
||||
, sHasChapters = False
|
||||
, sToggles = M.empty
|
||||
}
|
||||
|
||||
instance PandocMonad m => HasQuoteContext LaTeXState m where
|
||||
getQuoteContext = sQuoteContext <$> getState
|
||||
withQuoteContext context parser = do
|
||||
oldState <- getState
|
||||
let oldQuoteContext = sQuoteContext oldState
|
||||
setState oldState { sQuoteContext = context }
|
||||
result <- parser
|
||||
newState <- getState
|
||||
setState newState { sQuoteContext = oldQuoteContext }
|
||||
return result
|
||||
|
||||
instance HasLogMessages LaTeXState where
|
||||
addLogMessage msg st = st{ sLogMessages = msg : sLogMessages st }
|
||||
getLogMessages st = reverse $ sLogMessages st
|
||||
|
||||
instance HasIdentifierList LaTeXState where
|
||||
extractIdentifierList = sIdentifiers
|
||||
updateIdentifierList f st = st{ sIdentifiers = f $ sIdentifiers st }
|
||||
|
||||
instance HasIncludeFiles LaTeXState where
|
||||
getIncludeFiles = sContainers
|
||||
addIncludeFile f s = s{ sContainers = f : sContainers s }
|
||||
dropLatestIncludeFile s = s { sContainers = drop 1 $ sContainers s }
|
||||
|
||||
instance HasHeaderMap LaTeXState where
|
||||
extractHeaderMap = sHeaders
|
||||
updateHeaderMap f st = st{ sHeaders = f $ sHeaders st }
|
||||
|
||||
instance HasMacros LaTeXState where
|
||||
extractMacros st = sMacros st
|
||||
updateMacros f st = st{ sMacros = f (sMacros st) }
|
||||
|
||||
instance HasReaderOptions LaTeXState where
|
||||
extractReaderOptions = sOptions
|
||||
|
||||
instance HasMeta LaTeXState where
|
||||
setMeta field val st =
|
||||
st{ sMeta = setMeta field val $ sMeta st }
|
||||
deleteMeta field st =
|
||||
st{ sMeta = deleteMeta field $ sMeta st }
|
||||
|
||||
instance Default LaTeXState where
|
||||
def = defaultLaTeXState
|
||||
|
||||
type LP m = ParserT [Tok] LaTeXState m
|
||||
|
||||
withVerbatimMode :: PandocMonad m => LP m a -> LP m a
|
||||
withVerbatimMode parser = do
|
||||
updateState $ \st -> st{ sVerbatimMode = True }
|
||||
result <- parser
|
||||
updateState $ \st -> st{ sVerbatimMode = False }
|
||||
return result
|
||||
|
||||
rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s)
|
||||
=> Bool -> LP m a -> LP m a -> ParserT String s m (a, String)
|
||||
rawLaTeXParser retokenize parser valParser = do
|
||||
inp <- getInput
|
||||
let toks = tokenize "source" $ T.pack inp
|
||||
pstate <- getState
|
||||
let lstate = def{ sOptions = extractReaderOptions pstate }
|
||||
let lstate' = lstate { sMacros = extractMacros pstate }
|
||||
let rawparser = (,) <$> withRaw valParser <*> getState
|
||||
res' <- lift $ runParserT (snd <$> withRaw parser) lstate "chunk" toks
|
||||
case res' of
|
||||
Left _ -> mzero
|
||||
Right toks' -> do
|
||||
res <- lift $ runParserT (do when retokenize $ do
|
||||
-- retokenize, applying macros
|
||||
doMacros 0
|
||||
ts <- many (satisfyTok (const True))
|
||||
setInput ts
|
||||
rawparser)
|
||||
lstate' "chunk" toks'
|
||||
case res of
|
||||
Left _ -> mzero
|
||||
Right ((val, raw), st) -> do
|
||||
updateState (updateMacros (sMacros st <>))
|
||||
_ <- takeP (T.length (untokenize toks'))
|
||||
return (val, T.unpack (untokenize raw))
|
||||
|
||||
applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s)
|
||||
=> String -> ParserT String s m String
|
||||
applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|>
|
||||
do let retokenize = doMacros 0 *>
|
||||
(toksToString <$> many (satisfyTok (const True)))
|
||||
pstate <- getState
|
||||
let lstate = def{ sOptions = extractReaderOptions pstate
|
||||
, sMacros = extractMacros pstate }
|
||||
res <- runParserT retokenize lstate "math" (tokenize "math" (T.pack s))
|
||||
case res of
|
||||
Left e -> fail (show e)
|
||||
Right s' -> return s'
|
||||
tokenize :: SourceName -> Text -> [Tok]
|
||||
tokenize sourcename = totoks (initialPos sourcename)
|
||||
|
||||
totoks :: SourcePos -> Text -> [Tok]
|
||||
totoks pos t =
|
||||
case T.uncons t of
|
||||
Nothing -> []
|
||||
Just (c, rest)
|
||||
| c == '\n' ->
|
||||
Tok pos Newline "\n"
|
||||
: totoks (setSourceColumn (incSourceLine pos 1) 1) rest
|
||||
| isSpaceOrTab c ->
|
||||
let (sps, rest') = T.span isSpaceOrTab t
|
||||
in Tok pos Spaces sps
|
||||
: totoks (incSourceColumn pos (T.length sps))
|
||||
rest'
|
||||
| isAlphaNum c ->
|
||||
let (ws, rest') = T.span isAlphaNum t
|
||||
in Tok pos Word ws
|
||||
: totoks (incSourceColumn pos (T.length ws)) rest'
|
||||
| c == '%' ->
|
||||
let (cs, rest') = T.break (== '\n') rest
|
||||
in Tok pos Comment ("%" <> cs)
|
||||
: totoks (incSourceColumn pos (1 + T.length cs)) rest'
|
||||
| c == '\\' ->
|
||||
case T.uncons rest of
|
||||
Nothing -> [Tok pos (CtrlSeq " ") "\\"]
|
||||
Just (d, rest')
|
||||
| isLetterOrAt d ->
|
||||
-- \makeatletter is common in macro defs;
|
||||
-- ideally we should make tokenization sensitive
|
||||
-- to \makeatletter and \makeatother, but this is
|
||||
-- probably best for now
|
||||
let (ws, rest'') = T.span isLetterOrAt rest
|
||||
(ss, rest''') = T.span isSpaceOrTab rest''
|
||||
in Tok pos (CtrlSeq ws) ("\\" <> ws <> ss)
|
||||
: totoks (incSourceColumn pos
|
||||
(1 + T.length ws + T.length ss)) rest'''
|
||||
| isSpaceOrTab d || d == '\n' ->
|
||||
let (w1, r1) = T.span isSpaceOrTab rest
|
||||
(w2, (w3, r3)) = case T.uncons r1 of
|
||||
Just ('\n', r2)
|
||||
-> (T.pack "\n",
|
||||
T.span isSpaceOrTab r2)
|
||||
_ -> (mempty, (mempty, r1))
|
||||
ws = "\\" <> w1 <> w2 <> w3
|
||||
in case T.uncons r3 of
|
||||
Just ('\n', _) ->
|
||||
Tok pos (CtrlSeq " ") ("\\" <> w1)
|
||||
: totoks (incSourceColumn pos (T.length ws))
|
||||
r1
|
||||
_ ->
|
||||
Tok pos (CtrlSeq " ") ws
|
||||
: totoks (incSourceColumn pos (T.length ws))
|
||||
r3
|
||||
| otherwise ->
|
||||
Tok pos (CtrlSeq (T.singleton d)) (T.pack [c,d])
|
||||
: totoks (incSourceColumn pos 2) rest'
|
||||
| c == '#' ->
|
||||
let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest
|
||||
in case safeRead (T.unpack t1) of
|
||||
Just i ->
|
||||
Tok pos (Arg i) ("#" <> t1)
|
||||
: totoks (incSourceColumn pos (1 + T.length t1)) t2
|
||||
Nothing ->
|
||||
Tok pos Symbol "#"
|
||||
: totoks (incSourceColumn pos 1) t2
|
||||
| c == '^' ->
|
||||
case T.uncons rest of
|
||||
Just ('^', rest') ->
|
||||
case T.uncons rest' of
|
||||
Just (d, rest'')
|
||||
| isLowerHex d ->
|
||||
case T.uncons rest'' of
|
||||
Just (e, rest''') | isLowerHex e ->
|
||||
Tok pos Esc2 (T.pack ['^','^',d,e])
|
||||
: totoks (incSourceColumn pos 4) rest'''
|
||||
_ ->
|
||||
Tok pos Esc1 (T.pack ['^','^',d])
|
||||
: totoks (incSourceColumn pos 3) rest''
|
||||
| d < '\128' ->
|
||||
Tok pos Esc1 (T.pack ['^','^',d])
|
||||
: totoks (incSourceColumn pos 3) rest''
|
||||
_ -> Tok pos Symbol "^" :
|
||||
Tok (incSourceColumn pos 1) Symbol "^" :
|
||||
totoks (incSourceColumn pos 2) rest'
|
||||
_ -> Tok pos Symbol "^"
|
||||
: totoks (incSourceColumn pos 1) rest
|
||||
| otherwise ->
|
||||
Tok pos Symbol (T.singleton c) : totoks (incSourceColumn pos 1) rest
|
||||
|
||||
isSpaceOrTab :: Char -> Bool
|
||||
isSpaceOrTab ' ' = True
|
||||
isSpaceOrTab '\t' = True
|
||||
isSpaceOrTab _ = False
|
||||
|
||||
isLetterOrAt :: Char -> Bool
|
||||
isLetterOrAt '@' = True
|
||||
isLetterOrAt c = isLetter c
|
||||
|
||||
isLowerHex :: Char -> Bool
|
||||
isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f'
|
||||
|
||||
untokenize :: [Tok] -> Text
|
||||
untokenize = mconcat . map untoken
|
||||
|
||||
untoken :: Tok -> Text
|
||||
untoken (Tok _ _ t) = t
|
||||
|
||||
toksToString :: [Tok] -> String
|
||||
toksToString = T.unpack . untokenize
|
||||
|
||||
satisfyTok :: PandocMonad m => (Tok -> Bool) -> LP m Tok
|
||||
satisfyTok f =
|
||||
try $ do
|
||||
res <- tokenPrim (T.unpack . untoken) updatePos matcher
|
||||
doMacros 0 -- apply macros on remaining input stream
|
||||
return res
|
||||
where matcher t | f t = Just t
|
||||
| otherwise = Nothing
|
||||
updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos
|
||||
updatePos _spos _ (Tok pos _ _ : _) = pos
|
||||
updatePos spos _ [] = incSourceColumn spos 1
|
||||
|
||||
doMacros :: PandocMonad m => Int -> LP m ()
|
||||
doMacros n = do
|
||||
verbatimMode <- sVerbatimMode <$> getState
|
||||
unless verbatimMode $ do
|
||||
inp <- getInput
|
||||
case inp of
|
||||
Tok spos (CtrlSeq "begin") _ : Tok _ Symbol "{" :
|
||||
Tok _ Word name : Tok _ Symbol "}" : ts
|
||||
-> handleMacros spos name ts
|
||||
Tok spos (CtrlSeq "end") _ : Tok _ Symbol "{" :
|
||||
Tok _ Word name : Tok _ Symbol "}" : ts
|
||||
-> handleMacros spos ("end" <> name) ts
|
||||
Tok _ (CtrlSeq "expandafter") _ : t : ts
|
||||
-> do setInput ts
|
||||
doMacros n
|
||||
getInput >>= setInput . combineTok t
|
||||
Tok spos (CtrlSeq name) _ : ts
|
||||
-> handleMacros spos name ts
|
||||
_ -> return ()
|
||||
where combineTok (Tok spos (CtrlSeq name) x) (Tok _ Word w : ts)
|
||||
| T.all isLetterOrAt w =
|
||||
Tok spos (CtrlSeq (name <> w)) (x1 <> w <> x2) : ts
|
||||
where (x1, x2) = T.break isSpaceOrTab x
|
||||
combineTok t ts = t:ts
|
||||
handleMacros spos name ts = do
|
||||
macros <- sMacros <$> getState
|
||||
case M.lookup name macros of
|
||||
Nothing -> return ()
|
||||
Just (Macro expansionPoint argspecs optarg newtoks) -> do
|
||||
setInput ts
|
||||
let matchTok (Tok _ toktype txt) =
|
||||
satisfyTok (\(Tok _ toktype' txt') ->
|
||||
toktype == toktype' &&
|
||||
txt == txt')
|
||||
let matchPattern toks = try $ mapM_ matchTok toks
|
||||
let getargs argmap [] = return argmap
|
||||
getargs argmap (Pattern toks : rest) = try $ do
|
||||
matchPattern toks
|
||||
getargs argmap rest
|
||||
getargs argmap (ArgNum i : Pattern toks : rest) =
|
||||
try $ do
|
||||
x <- mconcat <$> manyTill
|
||||
(braced <|> ((:[]) <$> anyTok))
|
||||
(matchPattern toks)
|
||||
getargs (M.insert i x argmap) rest
|
||||
getargs argmap (ArgNum i : rest) = do
|
||||
x <- try $ spaces >> bracedOrToken
|
||||
getargs (M.insert i x argmap) rest
|
||||
args <- case optarg of
|
||||
Nothing -> getargs M.empty argspecs
|
||||
Just o -> do
|
||||
x <- option o bracketedToks
|
||||
getargs (M.singleton 1 x) argspecs
|
||||
-- first boolean param is true if we're tokenizing
|
||||
-- an argument (in which case we don't want to
|
||||
-- expand #1 etc.)
|
||||
let addTok False (Tok _ (Arg i) _) acc =
|
||||
case M.lookup i args of
|
||||
Nothing -> mzero
|
||||
Just xs -> foldr (addTok True) acc xs
|
||||
-- see #4007
|
||||
addTok _ (Tok _ (CtrlSeq x) txt)
|
||||
acc@(Tok _ Word _ : _)
|
||||
| not (T.null txt) &&
|
||||
isLetter (T.last txt) =
|
||||
Tok spos (CtrlSeq x) (txt <> " ") : acc
|
||||
addTok _ t acc = setpos spos t : acc
|
||||
ts' <- getInput
|
||||
setInput $ foldr (addTok False) ts' newtoks
|
||||
case expansionPoint of
|
||||
ExpandWhenUsed ->
|
||||
if n > 20 -- detect macro expansion loops
|
||||
then throwError $ PandocMacroLoop (T.unpack name)
|
||||
else doMacros (n + 1)
|
||||
ExpandWhenDefined -> return ()
|
||||
|
||||
|
||||
setpos :: SourcePos -> Tok -> Tok
|
||||
setpos spos (Tok _ tt txt) = Tok spos tt txt
|
||||
|
||||
anyControlSeq :: PandocMonad m => LP m Tok
|
||||
anyControlSeq = satisfyTok isCtrlSeq
|
||||
|
||||
isCtrlSeq :: Tok -> Bool
|
||||
isCtrlSeq (Tok _ (CtrlSeq _) _) = True
|
||||
isCtrlSeq _ = False
|
||||
|
||||
anySymbol :: PandocMonad m => LP m Tok
|
||||
anySymbol = satisfyTok isSymbolTok
|
||||
|
||||
isSymbolTok :: Tok -> Bool
|
||||
isSymbolTok (Tok _ Symbol _) = True
|
||||
isSymbolTok _ = False
|
||||
|
||||
isWordTok :: Tok -> Bool
|
||||
isWordTok (Tok _ Word _) = True
|
||||
isWordTok _ = False
|
||||
|
||||
spaces :: PandocMonad m => LP m ()
|
||||
spaces = skipMany (satisfyTok (tokTypeIn [Comment, Spaces, Newline]))
|
||||
|
||||
spaces1 :: PandocMonad m => LP m ()
|
||||
spaces1 = skipMany1 (satisfyTok (tokTypeIn [Comment, Spaces, Newline]))
|
||||
|
||||
tokTypeIn :: [TokType] -> Tok -> Bool
|
||||
tokTypeIn toktypes (Tok _ tt _) = tt `elem` toktypes
|
||||
|
||||
controlSeq :: PandocMonad m => Text -> LP m Tok
|
||||
controlSeq name = satisfyTok isNamed
|
||||
where isNamed (Tok _ (CtrlSeq n) _) = n == name
|
||||
isNamed _ = False
|
||||
|
||||
symbol :: PandocMonad m => Char -> LP m Tok
|
||||
symbol c = satisfyTok isc
|
||||
where isc (Tok _ Symbol d) = case T.uncons d of
|
||||
Just (c',_) -> c == c'
|
||||
_ -> False
|
||||
isc _ = False
|
||||
|
||||
symbolIn :: PandocMonad m => [Char] -> LP m Tok
|
||||
symbolIn cs = satisfyTok isInCs
|
||||
where isInCs (Tok _ Symbol d) = case T.uncons d of
|
||||
Just (c,_) -> c `elem` cs
|
||||
_ -> False
|
||||
isInCs _ = False
|
||||
|
||||
sp :: PandocMonad m => LP m ()
|
||||
sp = whitespace <|> endline
|
||||
|
||||
whitespace :: PandocMonad m => LP m ()
|
||||
whitespace = () <$ satisfyTok isSpaceTok
|
||||
|
||||
isSpaceTok :: Tok -> Bool
|
||||
isSpaceTok (Tok _ Spaces _) = True
|
||||
isSpaceTok _ = False
|
||||
|
||||
newlineTok :: PandocMonad m => LP m ()
|
||||
newlineTok = () <$ satisfyTok isNewlineTok
|
||||
|
||||
isNewlineTok :: Tok -> Bool
|
||||
isNewlineTok (Tok _ Newline _) = True
|
||||
isNewlineTok _ = False
|
||||
|
||||
comment :: PandocMonad m => LP m ()
|
||||
comment = () <$ satisfyTok isCommentTok
|
||||
|
||||
isCommentTok :: Tok -> Bool
|
||||
isCommentTok (Tok _ Comment _) = True
|
||||
isCommentTok _ = False
|
||||
|
||||
anyTok :: PandocMonad m => LP m Tok
|
||||
anyTok = satisfyTok (const True)
|
||||
|
||||
singleChar :: PandocMonad m => LP m Tok
|
||||
singleChar = try $ do
|
||||
Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol])
|
||||
guard $ not $ toktype == Symbol &&
|
||||
T.any (`Set.member` specialChars) t
|
||||
if T.length t > 1
|
||||
then do
|
||||
let (t1, t2) = (T.take 1 t, T.drop 1 t)
|
||||
inp <- getInput
|
||||
setInput $ Tok (incSourceColumn pos 1) toktype t2 : inp
|
||||
return $ Tok pos toktype t1
|
||||
else return $ Tok pos toktype t
|
||||
|
||||
specialChars :: Set.Set Char
|
||||
specialChars = Set.fromList "#$%&~_^\\{}"
|
||||
|
||||
endline :: PandocMonad m => LP m ()
|
||||
endline = try $ do
|
||||
newlineTok
|
||||
lookAhead anyTok
|
||||
notFollowedBy blankline
|
||||
|
||||
blankline :: PandocMonad m => LP m ()
|
||||
blankline = try $ skipMany whitespace *> newlineTok
|
||||
|
||||
primEscape :: PandocMonad m => LP m Char
|
||||
primEscape = do
|
||||
Tok _ toktype t <- satisfyTok (tokTypeIn [Esc1, Esc2])
|
||||
case toktype of
|
||||
Esc1 -> case T.uncons (T.drop 2 t) of
|
||||
Just (c, _)
|
||||
| c >= '\64' && c <= '\127' -> return (chr (ord c - 64))
|
||||
| otherwise -> return (chr (ord c + 64))
|
||||
Nothing -> fail "Empty content of Esc1"
|
||||
Esc2 -> case safeRead ('0':'x':T.unpack (T.drop 2 t)) of
|
||||
Just x -> return (chr x)
|
||||
Nothing -> fail $ "Could not read: " ++ T.unpack t
|
||||
_ -> fail "Expected an Esc1 or Esc2 token" -- should not happen
|
||||
|
||||
bgroup :: PandocMonad m => LP m Tok
|
||||
bgroup = try $ do
|
||||
skipMany sp
|
||||
symbol '{' <|> controlSeq "bgroup" <|> controlSeq "begingroup"
|
||||
|
||||
egroup :: PandocMonad m => LP m Tok
|
||||
egroup = symbol '}' <|> controlSeq "egroup" <|> controlSeq "endgroup"
|
||||
|
||||
grouped :: (PandocMonad m, Monoid a) => LP m a -> LP m a
|
||||
grouped parser = try $ do
|
||||
bgroup
|
||||
-- first we check for an inner 'grouped', because
|
||||
-- {{a,b}} should be parsed the same as {a,b}
|
||||
try (grouped parser <* egroup) <|> (mconcat <$> manyTill parser egroup)
|
||||
|
||||
braced' :: PandocMonad m => LP m Tok -> Int -> LP m [Tok]
|
||||
braced' getTok n =
|
||||
handleEgroup <|> handleBgroup <|> handleOther
|
||||
where handleEgroup = do
|
||||
t <- egroup
|
||||
if n == 1
|
||||
then return []
|
||||
else (t:) <$> braced' getTok (n - 1)
|
||||
handleBgroup = do
|
||||
t <- bgroup
|
||||
(t:) <$> braced' getTok (n + 1)
|
||||
handleOther = do
|
||||
t <- getTok
|
||||
(t:) <$> braced' getTok n
|
||||
|
||||
braced :: PandocMonad m => LP m [Tok]
|
||||
braced = bgroup *> braced' anyTok 1
|
||||
|
||||
-- URLs require special handling, because they can contain %
|
||||
-- characters. So we retokenize comments as we go...
|
||||
bracedUrl :: PandocMonad m => LP m [Tok]
|
||||
bracedUrl = bgroup *> braced' (retokenizeComment >> anyTok) 1
|
||||
|
||||
-- For handling URLs, which allow literal % characters...
|
||||
retokenizeComment :: PandocMonad m => LP m ()
|
||||
retokenizeComment = (do
|
||||
Tok pos Comment txt <- satisfyTok isCommentTok
|
||||
let updPos (Tok pos' toktype' txt') =
|
||||
Tok (incSourceColumn (incSourceLine pos' (sourceLine pos - 1))
|
||||
(sourceColumn pos)) toktype' txt'
|
||||
let newtoks = map updPos $ tokenize (sourceName pos) $ T.tail txt
|
||||
getInput >>= setInput . ((Tok pos Symbol "%" : newtoks) ++))
|
||||
<|> return ()
|
||||
|
||||
bracedOrToken :: PandocMonad m => LP m [Tok]
|
||||
bracedOrToken = braced <|> ((:[]) <$> (anyControlSeq <|> singleChar))
|
||||
|
||||
bracketed :: PandocMonad m => Monoid a => LP m a -> LP m a
|
||||
bracketed parser = try $ do
|
||||
symbol '['
|
||||
mconcat <$> manyTill parser (symbol ']')
|
||||
|
||||
bracketedToks :: PandocMonad m => LP m [Tok]
|
||||
bracketedToks = do
|
||||
symbol '['
|
||||
mconcat <$> manyTill (braced <|> (:[]) <$> anyTok) (symbol ']')
|
||||
|
||||
parenWrapped :: PandocMonad m => Monoid a => LP m a -> LP m a
|
||||
parenWrapped parser = try $ do
|
||||
symbol '('
|
||||
mconcat <$> manyTill parser (symbol ')')
|
||||
|
||||
dimenarg :: PandocMonad m => LP m Text
|
||||
dimenarg = try $ do
|
||||
ch <- option False $ True <$ symbol '='
|
||||
Tok _ _ s <- satisfyTok isWordTok
|
||||
guard $ T.take 2 (T.reverse s) `elem`
|
||||
["pt","pc","in","bp","cm","mm","dd","cc","sp"]
|
||||
let num = T.take (T.length s - 2) s
|
||||
guard $ T.length num > 0
|
||||
guard $ T.all isDigit num
|
||||
return $ T.pack ['=' | ch] <> s
|
||||
|
||||
ignore :: (Monoid a, PandocMonad m) => String -> ParserT s u m a
|
||||
ignore raw = do
|
||||
pos <- getPosition
|
||||
report $ SkippedContent raw pos
|
||||
return mempty
|
||||
|
||||
withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok])
|
||||
withRaw parser = do
|
||||
inp <- getInput
|
||||
result <- parser
|
||||
nxt <- option (Tok (initialPos "source") Word "") (lookAhead anyTok)
|
||||
let raw = takeWhile (/= nxt) inp
|
||||
return (result, raw)
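Because the module exports its token-level combinators, later refactors can compose small parsers from them directly. A hypothetical sketch, not part of this commit (labelArg is an invented name), that reads a \label{...} argument using the exported controlSeq, braced and untokenize:

    import qualified Data.Text as T
    import Text.Pandoc.Class (PandocMonad)
    import Text.Pandoc.Readers.LaTeX.Parsing (LP, braced, controlSeq, untokenize)

    -- Parses the control sequence \label followed by a braced group and
    -- returns the group's literal text, e.g. "sec:intro" for \label{sec:intro}.
    labelArg :: PandocMonad m => LP m String
    labelArg = do
      _ <- controlSeq (T.pack "label")
      T.unpack . untokenize <$> braced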