Man reader: move macro resolution to lexer phase.

We also introduce a new type ManTokens (a sequence of tokens)
and remove MComment.  This allows lexers to return empty strings
of tokens, or multiple tokens (as when macros are resolved).

One test still fails.  This needs to be fixed by moving handling
of .BI, .I, etc. to the parsing phase.
This commit is contained in:
John MacFarlane 2018-10-21 12:07:07 -07:00
parent a98e2b7c42
commit 25248c7a37
3 changed files with 103 additions and 77 deletions

View file

@ -1,4 +1,5 @@
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{- {-
Copyright (C) 2018 Yan Pashkovsky <yanp.bugz@gmail.com> Copyright (C) 2018 Yan Pashkovsky <yanp.bugz@gmail.com>
@ -55,6 +56,8 @@ import Text.Parsec hiding (tokenPrim, space)
import qualified Text.Parsec as Parsec import qualified Text.Parsec as Parsec
import Text.Parsec.Pos (updatePosString) import Text.Parsec.Pos (updatePosString)
import Text.Pandoc.GroffChar (characterCodes, combiningAccents) import Text.Pandoc.GroffChar (characterCodes, combiningAccents)
import qualified Data.Sequence as Seq
import qualified Data.Foldable as Foldable
-- import Debug.Trace (traceShowId) -- import Debug.Trace (traceShowId)
@ -75,24 +78,29 @@ data LinePart = RoffStr (String, Font)
data ManToken = MLine [LinePart] data ManToken = MLine [LinePart]
| MEmptyLine | MEmptyLine
| MMacro MacroKind [[LinePart]] | MMacro MacroKind [[LinePart]]
| MComment
deriving Show deriving Show
newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
deriving (Show, Semigroup, Monoid)
singleTok :: ManToken -> ManTokens
singleTok t = ManTokens (Seq.singleton t)
data RoffState = RoffState { fontKind :: Font data RoffState = RoffState { fontKind :: Font
, customMacros :: M.Map String ManTokens
} deriving Show } deriving Show
instance Default RoffState where instance Default RoffState where
def = RoffState { fontKind = S.singleton Regular } def = RoffState { customMacros = mempty
, fontKind = S.singleton Regular }
data ManState = ManState { customMacros :: M.Map String [ManToken] data ManState = ManState { readerOptions :: ReaderOptions
, readerOptions :: ReaderOptions
, metadata :: Meta , metadata :: Meta
} deriving Show } deriving Show
instance Default ManState where instance Default ManState where
def = ManState { customMacros = mempty def = ManState { readerOptions = def
, readerOptions = def , metadata = nullMeta }
, metadata = nullMeta }
type ManLexer m = ParserT [Char] RoffState m type ManLexer m = ParserT [Char] RoffState m
type ManParser m = ParserT [ManToken] ManState m type ManParser m = ParserT [ManToken] ManState m
@ -101,7 +109,9 @@ type ManParser m = ParserT [ManToken] ManState m
-- | Read man (troff) from an input string and return a Pandoc document. -- | Read man (troff) from an input string and return a Pandoc document.
readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc
readMan opts txt = do readMan opts txt = do
eithertokens <- readWithM (many manToken) def (T.unpack $ crFilter txt) eithertokens <- readWithM
(Foldable.toList . unManTokens . mconcat <$> many manToken)
def (T.unpack $ crFilter txt)
case eithertokens of case eithertokens of
Left e -> throwError e Left e -> throwError e
Right tokenz -> do Right tokenz -> do
@ -128,7 +138,7 @@ readMan opts txt = do
-- String -> ManToken function -- String -> ManToken function
-- --
manToken :: PandocMonad m => ManLexer m ManToken manToken :: PandocMonad m => ManLexer m ManTokens
manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
parseMan :: PandocMonad m => ManParser m Pandoc parseMan :: PandocMonad m => ManParser m Pandoc
@ -147,8 +157,7 @@ parseBlock = choice [ parseList
, parseSkippedContent , parseSkippedContent
, parseCodeBlock , parseCodeBlock
, parseHeader , parseHeader
, parseMacroDef , skipUnkownMacro
, parseUnkownMacro
] ]
eofline :: Stream s m Char => ParsecT s u m () eofline :: Stream s m Char => ParsecT s u m ()
@ -268,15 +277,15 @@ currentFont :: PandocMonad m => ManLexer m Font
currentFont = fontKind <$> getState currentFont = fontKind <$> getState
-- separate function from lexMacro since real man files sometimes do not follow the rules -- separate function from lexMacro since real man files sometimes do not follow the rules
lexComment :: PandocMonad m => ManLexer m ManToken lexComment :: PandocMonad m => ManLexer m ManTokens
lexComment = do lexComment = do
try $ string ".\\\"" try $ string ".\\\""
many Parsec.space many Parsec.space
skipMany $ noneOf "\n" skipMany $ noneOf "\n"
char '\n' char '\n'
return MComment return mempty
lexMacro :: PandocMonad m => ManLexer m ManToken lexMacro :: PandocMonad m => ManLexer m ManTokens
lexMacro = do lexMacro = do
char '.' <|> char '\'' char '.' <|> char '\''
many spacetab many spacetab
@ -287,15 +296,16 @@ lexMacro = do
addFontToRoffStr _ x = x addFontToRoffStr _ x = x
case macroName of case macroName of
"" -> return MComment "" -> return mempty
"\\\"" -> return MComment "\\\"" -> return mempty
"\\#" -> return MComment "\\#" -> return mempty
"de" -> lexMacroDef args
"B" -> do "B" -> do
args' <- argsOrFromNextLine args args' <- argsOrFromNextLine args
return $ MLine $ concatMap (addFont Bold) args' return $ singleTok $ MLine $ concatMap (addFont Bold) args'
"I" -> do "I" -> do
args' <- argsOrFromNextLine args args' <- argsOrFromNextLine args
return $ MLine $ concatMap (addFont Italic) args' return $ singleTok $ MLine $ concatMap (addFont Italic) args'
x | x `elem` ["BI", "IB", "RI", "IR", "BR", "RB"] -> do x | x `elem` ["BI", "IB", "RI", "IR", "BR", "RB"] -> do
let toFont 'I' = Italic let toFont 'I' = Italic
toFont 'R' = Regular toFont 'R' = Regular
@ -303,17 +313,56 @@ lexMacro = do
toFont 'M' = Monospace toFont 'M' = Monospace
toFont _ = Regular toFont _ = Regular
let fontlist = map toFont x let fontlist = map toFont x
return $ MLine $ concat $ zipWith addFont (cycle fontlist) args return $ singleTok
x | x `elem` [ "P", "PP", "LP", "sp"] -> return MEmptyLine $ MLine $ concat $ zipWith addFont (cycle fontlist) args
_ -> return $ MMacro macroName args x | x `elem` [ "P", "PP", "LP", "sp"] -> return $ singleTok MEmptyLine
_ -> resolveMacro macroName args
where where
argsOrFromNextLine :: PandocMonad m => [[LinePart]] -> ManLexer m [[LinePart]] resolveMacro :: PandocMonad m
=> String -> [[LinePart]] -> ManLexer m ManTokens
resolveMacro macroName args = do
macros <- customMacros <$> getState
case M.lookup macroName macros of
Nothing -> return $ singleTok $ MMacro macroName args
Just ts -> do
let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
fillLP (MacroArg i) zs =
case drop (i - 1) args of
[] -> zs
(ys:_) -> ys ++ zs
let fillMacroArg (MLine lineparts) =
MLine (foldr fillLP [] lineparts)
fillMacroArg x = x
return $ ManTokens . fmap fillMacroArg . unManTokens $ ts
lexMacroDef :: PandocMonad m => [[LinePart]] -> ManLexer m ManTokens
lexMacroDef args = do -- macro definition
(macroName, stopMacro) <-
case args of
(x : y : _) -> return (linePartsToString x, linePartsToString y)
-- optional second arg
(x:_) -> return (linePartsToString x, ".")
[] -> fail "No argument to .de"
let stop = try $ do
char '.' <|> char '\''
many spacetab
string stopMacro
_ <- lexArgs
return ()
ts <- mconcat <$> manyTill manToken stop
modifyState $ \st ->
st{ customMacros = M.insert macroName ts (customMacros st) }
return mempty
argsOrFromNextLine :: PandocMonad m
=> [[LinePart]] -> ManLexer m [[LinePart]]
argsOrFromNextLine args = argsOrFromNextLine args =
if null args if null args
then do then do
MLine lps <- lexLine lps <- many1 linePart
eofline
return [lps] return [lps]
else return args else return args
@ -357,11 +406,11 @@ lexMacro = do
fonts <- currentFont fonts <- currentFont
return $ RoffStr ("\"", fonts) return $ RoffStr ("\"", fonts)
lexLine :: PandocMonad m => ManLexer m ManToken lexLine :: PandocMonad m => ManLexer m ManTokens
lexLine = do lexLine = do
lnparts <- many1 linePart lnparts <- many1 linePart
eofline eofline
return $ MLine lnparts return $ singleTok $ MLine lnparts
where where
linePart :: PandocMonad m => ManLexer m LinePart linePart :: PandocMonad m => ManLexer m LinePart
@ -398,8 +447,8 @@ spaceTabChar = do
font <- currentFont font <- currentFont
return $ RoffStr ([c], font) return $ RoffStr ([c], font)
lexEmptyLine :: PandocMonad m => ManLexer m ManToken lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
lexEmptyLine = char '\n' >> return MEmptyLine lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
-- --
-- ManToken parsec functions -- ManToken parsec functions
@ -434,11 +483,6 @@ mmacroAny = msatisfy isMMacro where
isMMacro (MMacro _ _) = True isMMacro (MMacro _ _) = True
isMMacro _ = False isMMacro _ = False
mcomment :: PandocMonad m => ManParser m ManToken
mcomment = msatisfy isMComment where
isMComment MComment = True
isMComment _ = False
-- --
-- ManToken -> Block functions -- ManToken -> Block functions
-- --
@ -459,7 +503,7 @@ parseTitle = do
return mempty return mempty
parseSkippedContent :: PandocMonad m => ManParser m Blocks parseSkippedContent :: PandocMonad m => ManParser m Blocks
parseSkippedContent = mempty <$ (mcomment <|> memptyLine) parseSkippedContent = mempty <$ memptyLine
linePartsToInlines :: [LinePart] -> Inlines linePartsToInlines :: [LinePart] -> Inlines
linePartsToInlines = go linePartsToInlines = go
@ -502,7 +546,7 @@ parsePara = para . trimInlines <$> parseInlines
parseInlines :: PandocMonad m => ManParser m Inlines parseInlines :: PandocMonad m => ManParser m Inlines
parseInlines = do parseInlines = do
inls <- many1 (lineInl <|> comment <|> parseLink <|> parseEmailLink) inls <- many1 (lineInl <|> parseLink <|> parseEmailLink)
return $ mconcat $ intersperse B.space inls return $ mconcat $ intersperse B.space inls
lineInl :: PandocMonad m => ManParser m Inlines lineInl :: PandocMonad m => ManParser m Inlines
@ -510,9 +554,6 @@ lineInl = do
(MLine fragments) <- mline (MLine fragments) <- mline
return $ linePartsToInlines $ fragments return $ linePartsToInlines $ fragments
comment :: PandocMonad m => ManParser m Inlines
comment = mcomment >> return mempty
bareIP :: PandocMonad m => ManParser m ManToken bareIP :: PandocMonad m => ManParser m ManToken
bareIP = msatisfy isBareIP where bareIP = msatisfy isBareIP where
isBareIP (MMacro "IP" []) = True isBareIP (MMacro "IP" []) = True
@ -522,7 +563,7 @@ parseCodeBlock :: PandocMonad m => ManParser m Blocks
parseCodeBlock = try $ do parseCodeBlock = try $ do
optional bareIP -- some people indent their code optional bareIP -- some people indent their code
mmacro "nf" mmacro "nf"
toks <- many (mline <|> memptyLine <|> mcomment) toks <- many (mline <|> memptyLine)
mmacro "fi" mmacro "fi"
return $ codeBlock (removeFinalNewline $ return $ codeBlock (removeFinalNewline $
intercalate "\n" . catMaybes $ intercalate "\n" . catMaybes $
@ -612,7 +653,7 @@ parseDefinitionList = definitionList <$> many1 definitionListItem
parseLink :: PandocMonad m => ManParser m Inlines parseLink :: PandocMonad m => ManParser m Inlines
parseLink = try $ do parseLink = try $ do
MMacro _ args <- mmacro "UR" MMacro _ args <- mmacro "UR"
contents <- mconcat <$> many1 (lineInl <|> comment) contents <- mconcat <$> many1 lineInl
mmacro "UE" mmacro "UE"
let url = case args of let url = case args of
[] -> "" [] -> ""
@ -622,48 +663,19 @@ parseLink = try $ do
parseEmailLink :: PandocMonad m => ManParser m Inlines parseEmailLink :: PandocMonad m => ManParser m Inlines
parseEmailLink = do parseEmailLink = do
MMacro _ args <- mmacro "MT" MMacro _ args <- mmacro "MT"
contents <- mconcat <$> many1 (lineInl <|> comment) contents <- mconcat <$> many1 lineInl
mmacro "ME" mmacro "ME"
let url = case args of let url = case args of
[] -> "" [] -> ""
(x:_) -> "mailto:" ++ linePartsToString x (x:_) -> "mailto:" ++ linePartsToString x
return $ link url "" contents return $ link url "" contents
parseMacroDef :: PandocMonad m => ManParser m Blocks skipUnkownMacro :: PandocMonad m => ManParser m Blocks
parseMacroDef = do skipUnkownMacro = do
MMacro _ args <- mmacro "de"
(macroName, stopMacro) <-
case args of
(x : y : _) -> return (linePartsToString x, linePartsToString y)
-- optional second arg
(x:_) -> return (linePartsToString x, ".")
[] -> fail "No argument to .de"
ts <- manyTill (msatisfy (const True)) (mmacro stopMacro)
modifyState $ \st ->
st{ customMacros = M.insert macroName ts (customMacros st) }
return mempty
-- In case of weird man file it will be parsed succesfully
parseUnkownMacro :: PandocMonad m => ManParser m Blocks
parseUnkownMacro = do
pos <- getPosition pos <- getPosition
tok <- mmacroAny tok <- mmacroAny
case tok of case tok of
MMacro mkind args -> do MMacro mkind _ -> do
macros <- customMacros <$> getState report $ SkippedContent ('.':mkind) pos
case M.lookup mkind macros of return mempty
Nothing -> do _ -> fail "the impossible happened"
report $ SkippedContent ('.':mkind) pos
return mempty
Just ts -> do
toks <- getInput
let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
fillLP (MacroArg i) zs =
case drop (i - 1) args of
[] -> zs
(ys:_) -> ys ++ zs
let fillMacroArg (MLine lineparts) = MLine (foldr fillLP [] lineparts)
fillMacroArg x = x
setInput $ (map fillMacroArg ts) ++ toks
return mempty
_ -> fail "the impossible happened"

View file

@ -207,3 +207,15 @@ site
.MT me@example.com .MT me@example.com
my email address. my email address.
.ME .ME
.SH Macros
.de au
.B
Me Myself
..
.de auth !!
.I
The author is \$1.
.!!
.au
and I.
.auth "John Jones"

View file

@ -101,4 +101,6 @@ Pandoc (Meta {unMeta = fromList [("date",MetaInlines [Str "Oct",Space,Str "17,",
,Para [Str "Minus:",Space,Str "-"] ,Para [Str "Minus:",Space,Str "-"]
,Header 1 ("",[],[]) [Str "Links"] ,Header 1 ("",[],[]) [Str "Links"]
,Para [Link ("",[],[]) [Str "some",Space,Str "randomsite"] ("http://example.com","")] ,Para [Link ("",[],[]) [Str "some",Space,Str "randomsite"] ("http://example.com","")]
,Para [Link ("",[],[]) [Str "my",Space,Str "email",Space,Str "address."] ("mailto:me@example.com","")]] ,Para [Link ("",[],[]) [Str "my",Space,Str "email",Space,Str "address."] ("mailto:me@example.com","")]
,Header 1 ("",[],[]) [Str "Macros"]
,Para [Strong [Str "Me",Space,Str "Myself"],Space,Str "and",Space,Str "I.",Space,Emph [Str "The",Space,Str "author",Space,Str "is",Space,Str "John",Space,Str "Jones",Str "."]]]