Man reader: move macro resolution to lexer phase.

We also introduce a new type ManTokens (a sequence of tokens)
and remove MComment.  This allows lexers to return empty strings
of tokens, or multiple tokens (as when macros are resolved).

One test still fails.  This needs to be fixed by moving handling
of .BI, .I, etc. to the parsing phase.
This commit is contained in:
John MacFarlane 2018-10-21 12:07:07 -07:00
parent a98e2b7c42
commit 25248c7a37
3 changed files with 103 additions and 77 deletions

View file

@ -1,4 +1,5 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-
Copyright (C) 2018 Yan Pashkovsky <yanp.bugz@gmail.com>
@ -55,6 +56,8 @@ import Text.Parsec hiding (tokenPrim, space)
import qualified Text.Parsec as Parsec
import Text.Parsec.Pos (updatePosString)
import Text.Pandoc.GroffChar (characterCodes, combiningAccents)
import qualified Data.Sequence as Seq
import qualified Data.Foldable as Foldable
-- import Debug.Trace (traceShowId)
@ -75,24 +78,29 @@ data LinePart = RoffStr (String, Font)
data ManToken = MLine [LinePart]
| MEmptyLine
| MMacro MacroKind [[LinePart]]
| MComment
deriving Show
newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
deriving (Show, Semigroup, Monoid)
singleTok :: ManToken -> ManTokens
singleTok t = ManTokens (Seq.singleton t)
data RoffState = RoffState { fontKind :: Font
, customMacros :: M.Map String ManTokens
} deriving Show
instance Default RoffState where
def = RoffState { fontKind = S.singleton Regular }
def = RoffState { customMacros = mempty
, fontKind = S.singleton Regular }
data ManState = ManState { customMacros :: M.Map String [ManToken]
, readerOptions :: ReaderOptions
data ManState = ManState { readerOptions :: ReaderOptions
, metadata :: Meta
} deriving Show
instance Default ManState where
def = ManState { customMacros = mempty
, readerOptions = def
, metadata = nullMeta }
def = ManState { readerOptions = def
, metadata = nullMeta }
type ManLexer m = ParserT [Char] RoffState m
type ManParser m = ParserT [ManToken] ManState m
@ -101,7 +109,9 @@ type ManParser m = ParserT [ManToken] ManState m
-- | Read man (troff) from an input string and return a Pandoc document.
readMan :: PandocMonad m => ReaderOptions -> T.Text -> m Pandoc
readMan opts txt = do
eithertokens <- readWithM (many manToken) def (T.unpack $ crFilter txt)
eithertokens <- readWithM
(Foldable.toList . unManTokens . mconcat <$> many manToken)
def (T.unpack $ crFilter txt)
case eithertokens of
Left e -> throwError e
Right tokenz -> do
@ -128,7 +138,7 @@ readMan opts txt = do
-- String -> ManToken function
--
manToken :: PandocMonad m => ManLexer m ManToken
manToken :: PandocMonad m => ManLexer m ManTokens
manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
parseMan :: PandocMonad m => ManParser m Pandoc
@ -147,8 +157,7 @@ parseBlock = choice [ parseList
, parseSkippedContent
, parseCodeBlock
, parseHeader
, parseMacroDef
, parseUnkownMacro
, skipUnkownMacro
]
eofline :: Stream s m Char => ParsecT s u m ()
@ -268,15 +277,15 @@ currentFont :: PandocMonad m => ManLexer m Font
currentFont = fontKind <$> getState
-- separate function from lexMacro since real man files sometimes do not follow the rules
lexComment :: PandocMonad m => ManLexer m ManToken
lexComment :: PandocMonad m => ManLexer m ManTokens
lexComment = do
try $ string ".\\\""
many Parsec.space
skipMany $ noneOf "\n"
char '\n'
return MComment
return mempty
lexMacro :: PandocMonad m => ManLexer m ManToken
lexMacro :: PandocMonad m => ManLexer m ManTokens
lexMacro = do
char '.' <|> char '\''
many spacetab
@ -287,15 +296,16 @@ lexMacro = do
addFontToRoffStr _ x = x
case macroName of
"" -> return MComment
"\\\"" -> return MComment
"\\#" -> return MComment
"" -> return mempty
"\\\"" -> return mempty
"\\#" -> return mempty
"de" -> lexMacroDef args
"B" -> do
args' <- argsOrFromNextLine args
return $ MLine $ concatMap (addFont Bold) args'
return $ singleTok $ MLine $ concatMap (addFont Bold) args'
"I" -> do
args' <- argsOrFromNextLine args
return $ MLine $ concatMap (addFont Italic) args'
return $ singleTok $ MLine $ concatMap (addFont Italic) args'
x | x `elem` ["BI", "IB", "RI", "IR", "BR", "RB"] -> do
let toFont 'I' = Italic
toFont 'R' = Regular
@ -303,17 +313,56 @@ lexMacro = do
toFont 'M' = Monospace
toFont _ = Regular
let fontlist = map toFont x
return $ MLine $ concat $ zipWith addFont (cycle fontlist) args
x | x `elem` [ "P", "PP", "LP", "sp"] -> return MEmptyLine
_ -> return $ MMacro macroName args
return $ singleTok
$ MLine $ concat $ zipWith addFont (cycle fontlist) args
x | x `elem` [ "P", "PP", "LP", "sp"] -> return $ singleTok MEmptyLine
_ -> resolveMacro macroName args
where
argsOrFromNextLine :: PandocMonad m => [[LinePart]] -> ManLexer m [[LinePart]]
resolveMacro :: PandocMonad m
=> String -> [[LinePart]] -> ManLexer m ManTokens
resolveMacro macroName args = do
macros <- customMacros <$> getState
case M.lookup macroName macros of
Nothing -> return $ singleTok $ MMacro macroName args
Just ts -> do
let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
fillLP (MacroArg i) zs =
case drop (i - 1) args of
[] -> zs
(ys:_) -> ys ++ zs
let fillMacroArg (MLine lineparts) =
MLine (foldr fillLP [] lineparts)
fillMacroArg x = x
return $ ManTokens . fmap fillMacroArg . unManTokens $ ts
lexMacroDef :: PandocMonad m => [[LinePart]] -> ManLexer m ManTokens
lexMacroDef args = do -- macro definition
(macroName, stopMacro) <-
case args of
(x : y : _) -> return (linePartsToString x, linePartsToString y)
-- optional second arg
(x:_) -> return (linePartsToString x, ".")
[] -> fail "No argument to .de"
let stop = try $ do
char '.' <|> char '\''
many spacetab
string stopMacro
_ <- lexArgs
return ()
ts <- mconcat <$> manyTill manToken stop
modifyState $ \st ->
st{ customMacros = M.insert macroName ts (customMacros st) }
return mempty
argsOrFromNextLine :: PandocMonad m
=> [[LinePart]] -> ManLexer m [[LinePart]]
argsOrFromNextLine args =
if null args
then do
MLine lps <- lexLine
lps <- many1 linePart
eofline
return [lps]
else return args
@ -357,11 +406,11 @@ lexMacro = do
fonts <- currentFont
return $ RoffStr ("\"", fonts)
lexLine :: PandocMonad m => ManLexer m ManToken
lexLine :: PandocMonad m => ManLexer m ManTokens
lexLine = do
lnparts <- many1 linePart
eofline
return $ MLine lnparts
return $ singleTok $ MLine lnparts
where
linePart :: PandocMonad m => ManLexer m LinePart
@ -398,8 +447,8 @@ spaceTabChar = do
font <- currentFont
return $ RoffStr ([c], font)
lexEmptyLine :: PandocMonad m => ManLexer m ManToken
lexEmptyLine = char '\n' >> return MEmptyLine
lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
--
-- ManToken parsec functions
@ -434,11 +483,6 @@ mmacroAny = msatisfy isMMacro where
isMMacro (MMacro _ _) = True
isMMacro _ = False
mcomment :: PandocMonad m => ManParser m ManToken
mcomment = msatisfy isMComment where
isMComment MComment = True
isMComment _ = False
--
-- ManToken -> Block functions
--
@ -459,7 +503,7 @@ parseTitle = do
return mempty
parseSkippedContent :: PandocMonad m => ManParser m Blocks
parseSkippedContent = mempty <$ (mcomment <|> memptyLine)
parseSkippedContent = mempty <$ memptyLine
linePartsToInlines :: [LinePart] -> Inlines
linePartsToInlines = go
@ -502,7 +546,7 @@ parsePara = para . trimInlines <$> parseInlines
parseInlines :: PandocMonad m => ManParser m Inlines
parseInlines = do
inls <- many1 (lineInl <|> comment <|> parseLink <|> parseEmailLink)
inls <- many1 (lineInl <|> parseLink <|> parseEmailLink)
return $ mconcat $ intersperse B.space inls
lineInl :: PandocMonad m => ManParser m Inlines
@ -510,9 +554,6 @@ lineInl = do
(MLine fragments) <- mline
return $ linePartsToInlines $ fragments
comment :: PandocMonad m => ManParser m Inlines
comment = mcomment >> return mempty
bareIP :: PandocMonad m => ManParser m ManToken
bareIP = msatisfy isBareIP where
isBareIP (MMacro "IP" []) = True
@ -522,7 +563,7 @@ parseCodeBlock :: PandocMonad m => ManParser m Blocks
parseCodeBlock = try $ do
optional bareIP -- some people indent their code
mmacro "nf"
toks <- many (mline <|> memptyLine <|> mcomment)
toks <- many (mline <|> memptyLine)
mmacro "fi"
return $ codeBlock (removeFinalNewline $
intercalate "\n" . catMaybes $
@ -612,7 +653,7 @@ parseDefinitionList = definitionList <$> many1 definitionListItem
parseLink :: PandocMonad m => ManParser m Inlines
parseLink = try $ do
MMacro _ args <- mmacro "UR"
contents <- mconcat <$> many1 (lineInl <|> comment)
contents <- mconcat <$> many1 lineInl
mmacro "UE"
let url = case args of
[] -> ""
@ -622,48 +663,19 @@ parseLink = try $ do
parseEmailLink :: PandocMonad m => ManParser m Inlines
parseEmailLink = do
MMacro _ args <- mmacro "MT"
contents <- mconcat <$> many1 (lineInl <|> comment)
contents <- mconcat <$> many1 lineInl
mmacro "ME"
let url = case args of
[] -> ""
(x:_) -> "mailto:" ++ linePartsToString x
return $ link url "" contents
parseMacroDef :: PandocMonad m => ManParser m Blocks
parseMacroDef = do
MMacro _ args <- mmacro "de"
(macroName, stopMacro) <-
case args of
(x : y : _) -> return (linePartsToString x, linePartsToString y)
-- optional second arg
(x:_) -> return (linePartsToString x, ".")
[] -> fail "No argument to .de"
ts <- manyTill (msatisfy (const True)) (mmacro stopMacro)
modifyState $ \st ->
st{ customMacros = M.insert macroName ts (customMacros st) }
return mempty
-- In case of weird man file it will be parsed succesfully
parseUnkownMacro :: PandocMonad m => ManParser m Blocks
parseUnkownMacro = do
skipUnkownMacro :: PandocMonad m => ManParser m Blocks
skipUnkownMacro = do
pos <- getPosition
tok <- mmacroAny
case tok of
MMacro mkind args -> do
macros <- customMacros <$> getState
case M.lookup mkind macros of
Nothing -> do
report $ SkippedContent ('.':mkind) pos
return mempty
Just ts -> do
toks <- getInput
let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
fillLP (MacroArg i) zs =
case drop (i - 1) args of
[] -> zs
(ys:_) -> ys ++ zs
let fillMacroArg (MLine lineparts) = MLine (foldr fillLP [] lineparts)
fillMacroArg x = x
setInput $ (map fillMacroArg ts) ++ toks
return mempty
_ -> fail "the impossible happened"
MMacro mkind _ -> do
report $ SkippedContent ('.':mkind) pos
return mempty
_ -> fail "the impossible happened"

View file

@ -207,3 +207,15 @@ site
.MT me@example.com
my email address.
.ME
.SH Macros
.de au
.B
Me Myself
..
.de auth !!
.I
The author is \$1.
.!!
.au
and I.
.auth "John Jones"

View file

@ -101,4 +101,6 @@ Pandoc (Meta {unMeta = fromList [("date",MetaInlines [Str "Oct",Space,Str "17,",
,Para [Str "Minus:",Space,Str "-"]
,Header 1 ("",[],[]) [Str "Links"]
,Para [Link ("",[],[]) [Str "some",Space,Str "randomsite"] ("http://example.com","")]
,Para [Link ("",[],[]) [Str "my",Space,Str "email",Space,Str "address."] ("mailto:me@example.com","")]]
,Para [Link ("",[],[]) [Str "my",Space,Str "email",Space,Str "address."] ("mailto:me@example.com","")]
,Header 1 ("",[],[]) [Str "Macros"]
,Para [Strong [Str "Me",Space,Str "Myself"],Space,Str "and",Space,Str "I.",Space,Emph [Str "The",Space,Str "author",Space,Str "is",Space,Str "John",Space,Str "Jones",Str "."]]]