Improved handling of include files in LaTeX reader.

Previously `\include` wouldn't work if the included file
contained, e.g., a begin without a matching end.

We've changed the Tok type so that it stores a full SourcePos,
rather than just a line and column.  So tokens keeep track
of the file they came from. This allows us to use a simpler
method for includes, which doesn't require parsing the included
document as a whole.

Closes #3971.
This commit is contained in:
John MacFarlane 2017-10-16 22:03:57 -07:00
parent 9cf9a64923
commit c40857b389
5 changed files with 93 additions and 61 deletions

View file

@ -148,6 +148,7 @@ extra-source-files:
test/command/3510-subdoc.org
test/command/3510-export.latex
test/command/3510-src.hs
test/command/3971b.tex
test/docbook-reader.docbook
test/docbook-xref.docbook
test/html-reader.html

View file

@ -71,7 +71,9 @@ import Text.Pandoc.Shared
import Text.Pandoc.Readers.LaTeX.Types (Macro(..), ExpansionPoint(..), Tok(..),
TokType(..))
import Text.Pandoc.Walk
import Text.Pandoc.Error (PandocError(PandocParsecError, PandocMacroLoop))
import Text.Pandoc.Error
(PandocError(PandocParsecError, PandocParseError, PandocMacroLoop))
import Text.Parsec.Pos
-- for debugging:
-- import Text.Pandoc.Extensions (getDefaultExtensions)
@ -85,7 +87,7 @@ readLaTeX :: PandocMonad m
-> m Pandoc
readLaTeX opts ltx = do
parsed <- runParserT parseLaTeX def{ sOptions = opts } "source"
(tokenize (crFilter ltx))
(tokenize "source" (crFilter ltx))
case parsed of
Right result -> return result
Left e -> throwError $ PandocParsecError (T.unpack ltx) e
@ -127,7 +129,7 @@ resolveRefs _ x = x
-- res <- runIOorExplode (runParserT p defaultLaTeXState{
-- sOptions = def{ readerExtensions =
-- enableExtension Ext_raw_tex $
-- getDefaultExtensions "latex" }} "source" (tokenize t))
-- getDefaultExtensions "latex" }} "source" (tokenize "source" t))
-- case res of
-- Left e -> error (show e)
-- Right r -> return r
@ -238,7 +240,7 @@ rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s)
=> LP m a -> ParserT String s m String
rawLaTeXParser parser = do
inp <- getInput
let toks = tokenize $ T.pack inp
let toks = tokenize "source" $ T.pack inp
pstate <- getState
let lstate = def{ sOptions = extractReaderOptions pstate }
res <- lift $ runParserT ((,) <$> try (snd <$> withRaw parser) <*> getState)
@ -257,7 +259,7 @@ applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|>
pstate <- getState
let lstate = def{ sOptions = extractReaderOptions pstate
, sMacros = extractMacros pstate }
res <- runParserT retokenize lstate "math" (tokenize (T.pack s))
res <- runParserT retokenize lstate "math" (tokenize "math" (T.pack s))
case res of
Left e -> fail (show e)
Right s' -> return s'
@ -278,7 +280,7 @@ inlineCommand :: PandocMonad m => ParserT String ParserState m Inlines
inlineCommand = do
lookAhead (try (char '\\' >> letter) <|> char '$')
inp <- getInput
let toks = tokenize $ T.pack inp
let toks = tokenize "chunk" $ T.pack inp
let rawinline = do
(il, raw) <- try $ withRaw (inlineEnvironment <|> inlineCommand')
st <- getState
@ -294,32 +296,33 @@ inlineCommand = do
takeP (T.length (untokenize raw))
return il
tokenize :: Text -> [Tok]
tokenize = totoks (1, 1)
tokenize :: SourceName -> Text -> [Tok]
tokenize sourcename = totoks (initialPos sourcename)
totoks :: (Line, Column) -> Text -> [Tok]
totoks (lin,col) t =
totoks :: SourcePos -> Text -> [Tok]
totoks pos t =
case T.uncons t of
Nothing -> []
Just (c, rest)
| c == '\n' ->
Tok (lin, col) Newline "\n"
: totoks (lin + 1,1) rest
Tok pos Newline "\n"
: totoks (setSourceColumn (incSourceLine pos 1) 1) rest
| isSpaceOrTab c ->
let (sps, rest') = T.span isSpaceOrTab t
in Tok (lin, col) Spaces sps
: totoks (lin, col + T.length sps) rest'
in Tok pos Spaces sps
: totoks (incSourceColumn pos (T.length sps))
rest'
| isAlphaNum c ->
let (ws, rest') = T.span isAlphaNum t
in Tok (lin, col) Word ws
: totoks (lin, col + T.length ws) rest'
in Tok pos Word ws
: totoks (incSourceColumn pos (T.length ws)) rest'
| c == '%' ->
let (cs, rest') = T.break (== '\n') rest
in Tok (lin, col) Comment ("%" <> cs)
: totoks (lin, col + 1 + T.length cs) rest'
in Tok pos Comment ("%" <> cs)
: totoks (incSourceColumn pos (1 + T.length cs)) rest'
| c == '\\' ->
case T.uncons rest of
Nothing -> [Tok (lin, col) Symbol (T.singleton c)]
Nothing -> [Tok pos Symbol (T.singleton c)]
Just (d, rest')
| isLetterOrAt d ->
-- \makeatletter is common in macro defs;
@ -328,24 +331,24 @@ totoks (lin,col) t =
-- probably best for now
let (ws, rest'') = T.span isLetterOrAt rest
(ss, rest''') = T.span isSpaceOrTab rest''
in Tok (lin, col) (CtrlSeq ws) ("\\" <> ws <> ss)
: totoks (lin,
col + 1 + T.length ws + T.length ss) rest'''
in Tok pos (CtrlSeq ws) ("\\" <> ws <> ss)
: totoks (incSourceColumn pos
(1 + T.length ws + T.length ss)) rest'''
| d == '\t' || d == '\n' ->
Tok (lin, col) Symbol ("\\")
: totoks (lin, col + 1) rest
Tok pos Symbol ("\\")
: totoks (incSourceColumn pos 1) rest
| otherwise ->
Tok (lin, col) (CtrlSeq (T.singleton d)) (T.pack [c,d])
: totoks (lin, col + 2) rest'
Tok pos (CtrlSeq (T.singleton d)) (T.pack [c,d])
: totoks (incSourceColumn pos 2) rest'
| c == '#' ->
let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest
in case safeRead (T.unpack t1) of
Just i ->
Tok (lin, col) (Arg i) ("#" <> t1)
: totoks (lin, col + 1 + T.length t1) t2
Tok pos (Arg i) ("#" <> t1)
: totoks (incSourceColumn pos (1 + T.length t1)) t2
Nothing ->
Tok (lin, col) Symbol ("#")
: totoks (lin, col + 1) t2
Tok pos Symbol ("#")
: totoks (incSourceColumn pos 1) t2
| c == '^' ->
case T.uncons rest of
Just ('^', rest') ->
@ -354,20 +357,20 @@ totoks (lin,col) t =
| isLowerHex d ->
case T.uncons rest'' of
Just (e, rest''') | isLowerHex e ->
Tok (lin, col) Esc2 (T.pack ['^','^',d,e])
: totoks (lin, col + 4) rest'''
Tok pos Esc2 (T.pack ['^','^',d,e])
: totoks (incSourceColumn pos 4) rest'''
_ ->
Tok (lin, col) Esc1 (T.pack ['^','^',d])
: totoks (lin, col + 3) rest''
Tok pos Esc1 (T.pack ['^','^',d])
: totoks (incSourceColumn pos 3) rest''
| d < '\128' ->
Tok (lin, col) Esc1 (T.pack ['^','^',d])
: totoks (lin, col + 3) rest''
_ -> [Tok (lin, col) Symbol ("^"),
Tok (lin, col + 1) Symbol ("^")]
_ -> Tok (lin, col) Symbol ("^")
: totoks (lin, col + 1) rest
Tok pos Esc1 (T.pack ['^','^',d])
: totoks (incSourceColumn pos 3) rest''
_ -> [Tok pos Symbol ("^"),
Tok (incSourceColumn pos 1) Symbol ("^")]
_ -> Tok pos Symbol ("^")
: totoks (incSourceColumn pos 1) rest
| otherwise ->
Tok (lin, col) Symbol (T.singleton c) : totoks (lin, col + 1) rest
Tok pos Symbol (T.singleton c) : totoks (incSourceColumn pos 1) rest
where isSpaceOrTab ' ' = True
isSpaceOrTab '\t' = True
@ -393,8 +396,7 @@ satisfyTok f =
where matcher t | f t = Just t
| otherwise = Nothing
updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos
updatePos spos _ (Tok (lin,col) _ _ : _) =
setSourceColumn (setSourceLine spos lin) col
updatePos _spos _ (Tok pos _ _ : _) = pos
updatePos spos _ [] = spos
doMacros :: PandocMonad m => Int -> LP m ()
@ -437,7 +439,7 @@ doMacros n = do
else doMacros (n + 1)
ExpandWhenDefined -> return ()
setpos :: (Line, Column) -> Tok -> Tok
setpos :: SourcePos -> Tok -> Tok
setpos spos (Tok _ tt txt) = Tok spos tt txt
anyControlSeq :: PandocMonad m => LP m Tok
@ -728,15 +730,15 @@ doverb = do
verbTok :: PandocMonad m => Char -> LP m Tok
verbTok stopchar = do
t@(Tok (lin, col) toktype txt) <- satisfyTok (not . isNewlineTok)
t@(Tok pos toktype txt) <- satisfyTok (not . isNewlineTok)
case T.findIndex (== stopchar) txt of
Nothing -> return t
Just i -> do
let (t1, t2) = T.splitAt i txt
inp <- getInput
setInput $ Tok (lin, col + i) Symbol (T.singleton stopchar)
: (totoks (lin, col + i + 1) (T.drop 1 t2)) ++ inp
return $ Tok (lin, col) toktype t1
setInput $ Tok (incSourceColumn pos i) Symbol (T.singleton stopchar)
: (totoks (incSourceColumn pos (i + 1)) (T.drop 1 t2)) ++ inp
return $ Tok pos toktype t1
dolstinline :: PandocMonad m => LP m Inlines
dolstinline = do
@ -1117,16 +1119,16 @@ tok = grouped inline <|> inlineCommand' <|> singleChar'
singleChar :: PandocMonad m => LP m Tok
singleChar = try $ do
Tok (lin,col) toktype t <- satisfyTok (tokTypeIn [Word, Symbol])
Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol])
guard $ not $ toktype == Symbol &&
T.any (`Set.member` specialChars) t
if T.length t > 1
then do
let (t1, t2) = (T.take 1 t, T.drop 1 t)
inp <- getInput
setInput $ (Tok (lin, col + 1) toktype t2) : inp
return $ Tok (lin,col) toktype t1
else return $ Tok (lin,col) toktype t
setInput $ (Tok (incSourceColumn pos 1) toktype t2) : inp
return $ Tok pos toktype t1
else return $ Tok pos toktype t
opt :: PandocMonad m => LP m Inlines
opt = bracketed inline
@ -1159,7 +1161,7 @@ withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok])
withRaw parser = do
inp <- getInput
result <- parser
nxt <- option (Tok (0,0) Word "") (lookAhead anyTok)
nxt <- option (Tok (initialPos "source") Word "") (lookAhead anyTok)
let raw = takeWhile (/= nxt) inp
return (result, raw)
@ -1739,7 +1741,27 @@ include = do
then map (maybeAddExtension ".sty") fs
else map (maybeAddExtension ".tex") fs
dirs <- (splitBy (==':') . fromMaybe ".") <$> lookupEnv "TEXINPUTS"
mconcat <$> mapM (insertIncludedFile blocks (tokenize . T.pack) dirs) fs'
mapM_ (insertIncluded dirs) fs'
return mempty
insertIncluded :: PandocMonad m
=> [FilePath]
-> FilePath
-> LP m ()
insertIncluded dirs f = do
pos <- getPosition
containers <- getIncludeFiles <$> getState
when (f `elem` containers) $ do
throwError $ PandocParseError $ "Include file loop at " ++ show pos
updateState $ addIncludeFile f
mbcontents <- readFileFromDirs dirs f
contents <- case mbcontents of
Just s -> return s
Nothing -> do
report $ CouldNotLoadIncludeFile f pos
return ""
getInput >>= setInput . (tokenize f (T.pack contents) ++)
updateState dropLatestIncludeFile
maybeAddExtension :: String -> FilePath -> FilePath
maybeAddExtension ext fp =
@ -2394,9 +2416,7 @@ parseTableRow envname prefsufs = do
>> anyTok)
suffpos <- getPosition
option [] (count 1 amp)
return $ map (setpos (sourceLine prefpos, sourceColumn prefpos)) pref
++ contents ++
map (setpos (sourceLine suffpos, sourceColumn suffpos)) suff
return $ map (setpos prefpos) pref ++ contents ++ map (setpos suffpos) suff
rawcells <- sequence (map celltoks prefsufs)
oldInput <- getInput
cells <- sequence $ map (\ts -> setInput ts >> parseTableCell) rawcells

View file

@ -31,17 +31,17 @@ module Text.Pandoc.Readers.LaTeX.Types ( Tok(..)
, TokType(..)
, Macro(..)
, ExpansionPoint(..)
, Line
, Column )
, SourcePos
)
where
import Data.Text (Text)
import Text.Parsec.Pos (Line, Column)
import Text.Parsec.Pos (SourcePos)
data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment |
Esc1 | Esc2 | Arg Int
deriving (Eq, Ord, Show)
data Tok = Tok (Line, Column) TokType Text
data Tok = Tok SourcePos TokType Text
deriving (Eq, Ord, Show)
data ExpansionPoint = ExpandWhenDefined | ExpandWhenUsed

9
test/command/3971.md Normal file
View file

@ -0,0 +1,9 @@
```
% TEXINPUTS=command pandoc -f latex -t native
\documentclass{article}
\include{3971b}
\code{f}
\end{document}
^D
[Para [Code ("",[],[]) "f"]]
```

2
test/command/3971b.tex Normal file
View file

@ -0,0 +1,2 @@
\newcommand{\code}[1]{\texttt{#1}}
\begin{document}