Added Text.Pandoc.Readers.Groff.

This is an internal module that exports a tokenizer for groff
formats.

Closes #4998.
This commit is contained in:
John MacFarlane 2018-10-24 17:38:08 -07:00
parent 123381765c
commit 6c71100fcf
3 changed files with 531 additions and 465 deletions

View file

@ -534,6 +534,7 @@ library
Text.Pandoc.Readers.Org.ParserState,
Text.Pandoc.Readers.Org.Parsing,
Text.Pandoc.Readers.Org.Shared,
Text.Pandoc.Readers.Groff,
Text.Pandoc.Writers.Groff,
Text.Pandoc.Writers.Powerpoint.Presentation,
Text.Pandoc.Writers.Powerpoint.Output,

View file

@ -0,0 +1,524 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-
Copyright (C) 2018 Yan Pashkovsky <yanp.bugz@gmail.com>
and John MacFarlane
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Readers.Groff
Copyright : Copyright (C) 2018 Yan Pashkovsky and John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : Yan Pashkovsky <yanp.bugz@gmail.com>
Stability : WIP
Portability : portable
Tokenizer for groff formats (man, ms).
-}
module Text.Pandoc.Readers.Groff
( FontKind(..)
, Font
, MacroKind
, LinePart(..)
, Arg
, ManToken(..)
, ManTokens(..)
, singleTok
, RoffState(..)
, ManLexer
, manToken
, linePartsToString
)
where
import Prelude
import Control.Monad (void, mzero, guard, when)
import Text.Pandoc.Class
(getResourcePath, readFileFromDirs, PandocMonad(..), report)
import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
import Data.Default (Default)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.List (intercalate, isSuffixOf)
import qualified Data.Text as T
import Text.Pandoc.Builder as B
import Text.Pandoc.Logging (LogMessage(..))
import Text.Pandoc.Options
import Text.Pandoc.Parsing
import Text.Pandoc.Shared (safeRead)
import Text.Parsec hiding (tokenPrim)
import qualified Text.Parsec as Parsec
import Text.Pandoc.GroffChar (characterCodes, combiningAccents)
import qualified Data.Sequence as Seq
import qualified Data.Foldable as Foldable
import qualified Data.Text.Normalize as Normalize
-- import Debug.Trace (traceShowId)
--
-- Data Types
--
data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
type MacroKind = String
type Font = Set FontKind
data LinePart = RoffStr (String, Font)
| MacroArg Int
deriving Show
type Arg = [LinePart]
-- TODO parse tables (see man tbl)
data ManToken = MLine [LinePart]
| MEmptyLine
| MMacro MacroKind [Arg] SourcePos
| MTable [Alignment] ManTokens [ManTokens] [[ManTokens]]
deriving Show
newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
deriving (Show, Semigroup, Monoid)
singleTok :: ManToken -> ManTokens
singleTok t = ManTokens (Seq.singleton t)
data RoffState = RoffState { fontKind :: Font
, customMacros :: M.Map String ManTokens
} deriving Show
instance Default RoffState where
def = RoffState { customMacros = M.fromList
$ map (\(n, s) ->
(n, singleTok
(MLine [RoffStr (s, mempty)])))
[ ("Tm", "\x2122")
, ("lq", "\x201C")
, ("rq", "\x201D")
, ("R", "\x00AE") ]
, fontKind = S.singleton Regular }
type ManLexer m = ParserT [Char] RoffState m
--
-- Lexer: String -> ManToken
--
eofline :: Stream s m Char => ParsecT s u m ()
eofline = void newline <|> eof
spacetab :: Stream s m Char => ParsecT s u m Char
spacetab = char ' ' <|> char '\t'
characterCodeMap :: M.Map String Char
characterCodeMap =
M.fromList $ map (\(x,y) -> (y,x)) characterCodes
combiningAccentsMap :: M.Map String Char
combiningAccentsMap =
M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
escapeLexer :: PandocMonad m => ManLexer m String
escapeLexer = try $ do
char '\\'
c <- noneOf ['*','$'] -- see escStar, macroArg
case c of
'(' -> twoCharGlyph
'[' -> bracketedGlyph
'f' -> escFont
's' -> escFontSize
'"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
'#' -> mempty <$ manyTill anyChar newline
'%' -> return mempty
'{' -> return mempty
'}' -> return mempty
'&' -> return mempty
'\n' -> return mempty
':' -> return mempty
'0' -> return mempty
'c' -> return mempty
'-' -> return "-"
'_' -> return "_"
' ' -> return " "
'\\' -> return "\\"
't' -> return "\t"
'e' -> return "\\"
'`' -> return "`"
'^' -> return " "
'|' -> return " "
'\'' -> return "`"
'.' -> return "`"
'~' -> return "\160" -- nonbreaking space
_ -> escUnknown ['\\',c] "\xFFFD"
where
twoCharGlyph = do
cs <- count 2 anyChar
case M.lookup cs characterCodeMap of
Just c -> return [c]
Nothing -> escUnknown ('\\':'(':cs) "\xFFFD"
bracketedGlyph = unicodeGlyph <|> charGlyph
charGlyph = do
cs <- manyTill (noneOf ['[',']','\n']) (char ']')
(case words cs of
[] -> mzero
[s] -> case M.lookup s characterCodeMap of
Nothing -> mzero
Just c -> return [c]
(s:ss) -> do
basechar <- case M.lookup cs characterCodeMap of
Nothing ->
case s of
[ch] | isAscii ch && isAlphaNum ch ->
return ch
_ -> mzero
Just c -> return c
let addAccents [] xs = return $ T.unpack .
Normalize.normalize Normalize.NFC .
T.pack $ reverse xs
addAccents (a:as) xs =
case M.lookup a combiningAccentsMap of
Just x -> addAccents as (x:xs)
Nothing -> mzero
addAccents ss [basechar])
<|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD"
unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']'
ucharCode = try $ do
char 'u'
cs <- many1 (satisfy isHexDigit)
let lcs = length cs
guard $ lcs >= 4 && lcs <= 6
case chr <$> safeRead ('0':'x':cs) of
Nothing -> mzero
Just c -> return c
-- \s-1 \s0 -- we ignore these
escFontSize :: PandocMonad m => ManLexer m String
escFontSize = do
pos <- getPosition
pm <- option "" $ count 1 (oneOf "+-")
ds <- many1 digit
report $ SkippedContent ("\\s" ++ pm ++ ds) pos
return mempty
escFont :: PandocMonad m => ManLexer m String
escFont = do
font <- choice
[ S.singleton <$> letterFontKind
, char '(' >> anyChar >> anyChar >> return (S.singleton Regular)
, char 'S' >> return (S.singleton Regular)
, try lettersFont
, digit >> return (S.singleton Regular)
]
modifyState (\r -> r {fontKind = font})
return mempty
lettersFont :: PandocMonad m => ManLexer m Font
lettersFont = do
char '['
fs <- many letterFontKind
skipMany letter
char ']'
return $ S.fromList fs
letterFontKind :: PandocMonad m => ManLexer m FontKind
letterFontKind = choice [
oneOf ['B','b'] >> return Bold
, oneOf ['I','i'] >> return Italic
, oneOf ['C','c'] >> return Monospace
, oneOf ['P','p','R','r'] >> return Regular
]
escUnknown :: PandocMonad m => String -> a -> ManLexer m a
escUnknown s x = do
pos <- getPosition
report $ SkippedContent ("Unknown escape sequence " ++ s) pos
return x
currentFont :: PandocMonad m => ManLexer m Font
currentFont = fontKind <$> getState
-- separate function from lexMacro since real man files sometimes do not follow the rules
lexComment :: PandocMonad m => ManLexer m ManTokens
lexComment = do
try $ string ".\\\""
many Parsec.space
skipMany $ noneOf "\n"
char '\n'
return mempty
lexMacro :: PandocMonad m => ManLexer m ManTokens
lexMacro = do
pos <- getPosition
char '.' <|> char '\''
many spacetab
macroName <- many (satisfy (not . isSpace))
case macroName of
"nop" -> return mempty
"ie" -> lexConditional
"if" -> lexConditional
"el" -> skipConditional
"TS" -> lexTable
_ -> do
args <- lexArgs
case macroName of
"" -> return mempty
"\\\"" -> return mempty
"\\#" -> return mempty
"de" -> lexMacroDef args
"de1" -> lexMacroDef args
"ds" -> lexStringDef args
"ds1" -> lexStringDef args
"sp" -> return $ singleTok MEmptyLine
"so" -> lexIncludeFile args
_ -> resolveMacro macroName args pos
-- | TODO placeholder
lexTable :: PandocMonad m => ManLexer m ManTokens
lexTable = do
pos <- getPosition
manyTill anyLine (try (string ".TE" >> many spacetab >> eofline))
report $ SkippedContent "table" pos
return mempty
-- We don't fully handle the conditional. But we do
-- include everything under '.ie n', which occurs commonly
-- in man pages. We always skip the '.el' part.
lexConditional :: PandocMonad m => ManLexer m ManTokens
lexConditional = do
skipMany spacetab
parseNCond <|> skipConditional
-- n means nroff mode
parseNCond :: PandocMonad m => ManLexer m ManTokens
parseNCond = do
char '\n'
many1 spacetab
lexGroup <|> manToken
lexGroup :: PandocMonad m => ManLexer m ManTokens
lexGroup = do
groupstart
mconcat <$> manyTill manToken groupend
where
groupstart = try $ string "\\{\\" >> newline
groupend = try $ string "\\}" >> eofline
skipConditional :: PandocMonad m => ManLexer m ManTokens
skipConditional = do
rest <- anyLine
when ("\\{\\" `isSuffixOf` rest) $
void $ manyTill anyChar (try (string "\\}"))
return mempty
lexIncludeFile :: PandocMonad m => [Arg] -> ManLexer m ManTokens
lexIncludeFile args = do
pos <- getPosition
case args of
(f:_) -> do
let fp = linePartsToString f
dirs <- getResourcePath
result <- readFileFromDirs dirs fp
case result of
Nothing -> report $ CouldNotLoadIncludeFile fp pos
Just s -> getInput >>= setInput . (s ++)
return mempty
[] -> return mempty
resolveMacro :: PandocMonad m
=> String -> [Arg] -> SourcePos -> ManLexer m ManTokens
resolveMacro macroName args pos = do
macros <- customMacros <$> getState
case M.lookup macroName macros of
Nothing -> return $ singleTok $ MMacro macroName args pos
Just ts -> do
let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
fillLP (MacroArg i) zs =
case drop (i - 1) args of
[] -> zs
(ys:_) -> ys ++ zs
let fillMacroArg (MLine lineparts) =
MLine (foldr fillLP [] lineparts)
fillMacroArg x = x
return $ ManTokens . fmap fillMacroArg . unManTokens $ ts
lexStringDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
lexStringDef args = do -- string definition
case args of
[] -> fail "No argument to .ds"
(x:ys) -> do
let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys)
let stringName = linePartsToString x
modifyState $ \st ->
st{ customMacros = M.insert stringName ts (customMacros st) }
return mempty
lexMacroDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
lexMacroDef args = do -- macro definition
(macroName, stopMacro) <-
case args of
(x : y : _) -> return (linePartsToString x, linePartsToString y)
-- optional second arg
(x:_) -> return (linePartsToString x, ".")
[] -> fail "No argument to .de"
let stop = try $ do
char '.' <|> char '\''
many spacetab
string stopMacro
_ <- lexArgs
return ()
ts <- mconcat <$> manyTill manToken stop
modifyState $ \st ->
st{ customMacros = M.insert macroName ts (customMacros st) }
return mempty
lexArgs :: PandocMonad m => ManLexer m [Arg]
lexArgs = do
args <- many $ try oneArg
skipMany spacetab
eofline
return args
where
oneArg :: PandocMonad m => ManLexer m [LinePart]
oneArg = do
skipMany $ try $ string "\\\n" -- continuation line
try quotedArg <|> plainArg
-- try, because there are some erroneous files, e.g. linux/bpf.2
plainArg :: PandocMonad m => ManLexer m [LinePart]
plainArg = do
skipMany spacetab
mconcat <$> many1
(macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar)
where
unescapedQuote = do
char '"'
fonts <- currentFont
return [RoffStr ("\"", fonts)]
quotedArg :: PandocMonad m => ManLexer m [LinePart]
quotedArg = do
skipMany spacetab
char '"'
xs <- mconcat <$>
many (macroArg <|> esc <|> escStar <|> regularText
<|> spaceTabChar <|> escapedQuote)
char '"'
return xs
where
escapedQuote = try $ do
char '"'
char '"'
fonts <- currentFont
return [RoffStr ("\"", fonts)]
escStar :: PandocMonad m => ManLexer m [LinePart]
escStar = try $ do
pos <- getPosition
char '\\'
char '*'
c <- anyChar
case c of
'(' -> do
cs <- count 2 anyChar
resolveString cs pos
'[' -> do
cs <- many (noneOf "\t\n\r ]")
char ']'
resolveString cs pos
'S' -> return mempty -- switch back to default font size
_ -> resolveString [c] pos
where
-- strings and macros share namespace
resolveString stringname pos = do
ManTokens ts <- resolveMacro stringname [] pos
case Foldable.toList ts of
[MLine xs] -> return xs
_ -> do
report $ SkippedContent ("unknown string " ++ stringname) pos
return mempty
lexLine :: PandocMonad m => ManLexer m ManTokens
lexLine = do
lnparts <- mconcat <$> many1 linePart
eofline
go lnparts
where -- return empty line if we only have empty strings;
-- this can happen if the line just contains \f[C], for example.
go [] = return mempty
go (RoffStr ("",_):xs) = go xs
go xs = return $ singleTok $ MLine xs
linePart :: PandocMonad m => ManLexer m [LinePart]
linePart = macroArg <|> esc <|> escStar <|>
regularText <|> quoteChar <|> spaceTabChar
macroArg :: PandocMonad m => ManLexer m [LinePart]
macroArg = try $ do
string "\\\\$"
x <- digit
return [MacroArg $ ord x - ord '0']
esc :: PandocMonad m => ManLexer m [LinePart]
esc = do
s <- escapeLexer
font <- currentFont
return [RoffStr (s, font)]
regularText :: PandocMonad m => ManLexer m [LinePart]
regularText = do
s <- many1 $ noneOf "\n\r\t \\\""
font <- currentFont
return [RoffStr (s, font)]
quoteChar :: PandocMonad m => ManLexer m [LinePart]
quoteChar = do
char '"'
font <- currentFont
return [RoffStr ("\"", font)]
spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
spaceTabChar = do
c <- spacetab
font <- currentFont
return [RoffStr ([c], font)]
lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
manToken :: PandocMonad m => ManLexer m ManTokens
manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
linePartsToString :: [LinePart] -> String
linePartsToString = mconcat . map go
where
go (RoffStr (s, _)) = s
go _ = mempty

View file

@ -18,8 +18,6 @@ You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
@ -36,76 +34,25 @@ Conversion of man to 'Pandoc' document.
module Text.Pandoc.Readers.Man (readMan) where
import Prelude
import Control.Monad (liftM, void, mzero, guard, when)
import Control.Monad.Except (throwError)
import Text.Pandoc.Class
(getResourcePath, readFileFromDirs, PandocMonad(..), report)
import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
import Data.Default (Default)
import Control.Monad (liftM, mzero, guard)
import Control.Monad.Except (throwError)
import Text.Pandoc.Class (PandocMonad(..), report)
import Data.Maybe (catMaybes)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.List (intersperse, intercalate, isSuffixOf)
import Data.List (intersperse, intercalate)
import qualified Data.Text as T
import Text.Pandoc.Builder as B
import Text.Pandoc.Error (PandocError (PandocParsecError))
import Text.Pandoc.Logging (LogMessage(..))
import Text.Pandoc.Options
import Text.Pandoc.Parsing
import Text.Pandoc.Shared (crFilter, safeRead)
import Text.Pandoc.Shared (crFilter)
import Text.Pandoc.Readers.Groff -- TODO explicit imports
import Text.Parsec hiding (tokenPrim)
import qualified Text.Parsec as Parsec
import Text.Parsec.Pos (updatePosString)
import Text.Pandoc.GroffChar (characterCodes, combiningAccents)
import qualified Data.Sequence as Seq
import qualified Data.Foldable as Foldable
import qualified Data.Text.Normalize as Normalize
-- import Debug.Trace (traceShowId)
--
-- Data Types
--
data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
type MacroKind = String
type Font = Set FontKind
data LinePart = RoffStr (String, Font)
| MacroArg Int
deriving Show
type Arg = [LinePart]
-- TODO parse tables (see man tbl)
data ManToken = MLine [LinePart]
| MEmptyLine
| MMacro MacroKind [Arg] SourcePos
| MTable [Alignment] ManTokens [ManTokens] [[ManTokens]]
deriving Show
newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
deriving (Show, Semigroup, Monoid)
singleTok :: ManToken -> ManTokens
singleTok t = ManTokens (Seq.singleton t)
data RoffState = RoffState { fontKind :: Font
, customMacros :: M.Map String ManTokens
} deriving Show
instance Default RoffState where
def = RoffState { customMacros = M.fromList
$ map (\(n, s) ->
(n, singleTok
(MLine [RoffStr (s, mempty)])))
[ ("Tm", "\x2122")
, ("lq", "\x201C")
, ("rq", "\x201D")
, ("R", "\x00AE") ]
, fontKind = S.singleton Regular }
data ManState = ManState { readerOptions :: ReaderOptions
, metadata :: Meta
@ -115,7 +62,6 @@ instance Default ManState where
def = ManState { readerOptions = def
, metadata = nullMeta }
type ManLexer m = ParserT [Char] RoffState m
type ManParser m = ParserT [ManToken] ManState m
@ -147,405 +93,6 @@ readMan opts txt = do
mapLeft f (Left x) = Left $ f x
mapLeft _ (Right r) = Right r
--
-- Lexer: String -> ManToken
--
eofline :: Stream s m Char => ParsecT s u m ()
eofline = void newline <|> eof
spacetab :: Stream s m Char => ParsecT s u m Char
spacetab = char ' ' <|> char '\t'
characterCodeMap :: M.Map String Char
characterCodeMap =
M.fromList $ map (\(x,y) -> (y,x)) characterCodes
combiningAccentsMap :: M.Map String Char
combiningAccentsMap =
M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
escapeLexer :: PandocMonad m => ManLexer m String
escapeLexer = try $ do
char '\\'
c <- noneOf ['*','$'] -- see escStar, macroArg
case c of
'(' -> twoCharGlyph
'[' -> bracketedGlyph
'f' -> escFont
's' -> escFontSize
'"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
'#' -> mempty <$ manyTill anyChar newline
'%' -> return mempty
'{' -> return mempty
'}' -> return mempty
'&' -> return mempty
'\n' -> return mempty
':' -> return mempty
'0' -> return mempty
'c' -> return mempty
'-' -> return "-"
'_' -> return "_"
' ' -> return " "
'\\' -> return "\\"
't' -> return "\t"
'e' -> return "\\"
'`' -> return "`"
'^' -> return " "
'|' -> return " "
'\'' -> return "`"
'.' -> return "`"
'~' -> return "\160" -- nonbreaking space
_ -> escUnknown ['\\',c] "\xFFFD"
where
twoCharGlyph = do
cs <- count 2 anyChar
case M.lookup cs characterCodeMap of
Just c -> return [c]
Nothing -> escUnknown ('\\':'(':cs) "\xFFFD"
bracketedGlyph = unicodeGlyph <|> charGlyph
charGlyph = do
cs <- manyTill (noneOf ['[',']','\n']) (char ']')
(case words cs of
[] -> mzero
[s] -> case M.lookup s characterCodeMap of
Nothing -> mzero
Just c -> return [c]
(s:ss) -> do
basechar <- case M.lookup cs characterCodeMap of
Nothing ->
case s of
[ch] | isAscii ch && isAlphaNum ch ->
return ch
_ -> mzero
Just c -> return c
let addAccents [] xs = return $ T.unpack .
Normalize.normalize Normalize.NFC .
T.pack $ reverse xs
addAccents (a:as) xs =
case M.lookup a combiningAccentsMap of
Just x -> addAccents as (x:xs)
Nothing -> mzero
addAccents ss [basechar])
<|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD"
unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']'
ucharCode = try $ do
char 'u'
cs <- many1 (satisfy isHexDigit)
let lcs = length cs
guard $ lcs >= 4 && lcs <= 6
case chr <$> safeRead ('0':'x':cs) of
Nothing -> mzero
Just c -> return c
-- \s-1 \s0 -- we ignore these
escFontSize :: PandocMonad m => ManLexer m String
escFontSize = do
pos <- getPosition
pm <- option "" $ count 1 (oneOf "+-")
ds <- many1 digit
report $ SkippedContent ("\\s" ++ pm ++ ds) pos
return mempty
escFont :: PandocMonad m => ManLexer m String
escFont = do
font <- choice
[ S.singleton <$> letterFontKind
, char '(' >> anyChar >> anyChar >> return (S.singleton Regular)
, char 'S' >> return (S.singleton Regular)
, try lettersFont
, digit >> return (S.singleton Regular)
]
modifyState (\r -> r {fontKind = font})
return mempty
lettersFont :: PandocMonad m => ManLexer m Font
lettersFont = do
char '['
fs <- many letterFontKind
skipMany letter
char ']'
return $ S.fromList fs
letterFontKind :: PandocMonad m => ManLexer m FontKind
letterFontKind = choice [
oneOf ['B','b'] >> return Bold
, oneOf ['I','i'] >> return Italic
, oneOf ['C','c'] >> return Monospace
, oneOf ['P','p','R','r'] >> return Regular
]
escUnknown :: PandocMonad m => String -> a -> ManLexer m a
escUnknown s x = do
pos <- getPosition
report $ SkippedContent ("Unknown escape sequence " ++ s) pos
return x
currentFont :: PandocMonad m => ManLexer m Font
currentFont = fontKind <$> getState
-- separate function from lexMacro since real man files sometimes do not follow the rules
lexComment :: PandocMonad m => ManLexer m ManTokens
lexComment = do
try $ string ".\\\""
many Parsec.space
skipMany $ noneOf "\n"
char '\n'
return mempty
lexMacro :: PandocMonad m => ManLexer m ManTokens
lexMacro = do
pos <- getPosition
char '.' <|> char '\''
many spacetab
macroName <- many (satisfy (not . isSpace))
case macroName of
"nop" -> return mempty
"ie" -> lexConditional
"if" -> lexConditional
"el" -> skipConditional
"TS" -> lexTable
_ -> do
args <- lexArgs
case macroName of
"" -> return mempty
"\\\"" -> return mempty
"\\#" -> return mempty
"de" -> lexMacroDef args
"de1" -> lexMacroDef args
"ds" -> lexStringDef args
"ds1" -> lexStringDef args
"sp" -> return $ singleTok MEmptyLine
"so" -> lexIncludeFile args
_ -> resolveMacro macroName args pos
-- | TODO placeholder
lexTable :: PandocMonad m => ManLexer m ManTokens
lexTable = do
pos <- getPosition
manyTill anyLine (try (string ".TE" >> many spacetab >> eofline))
report $ SkippedContent "table" pos
return mempty
-- We don't fully handle the conditional. But we do
-- include everything under '.ie n', which occurs commonly
-- in man pages. We always skip the '.el' part.
lexConditional :: PandocMonad m => ManLexer m ManTokens
lexConditional = do
skipMany spacetab
parseNCond <|> skipConditional
-- n means nroff mode
parseNCond :: PandocMonad m => ManLexer m ManTokens
parseNCond = do
char '\n'
many1 spacetab
lexGroup <|> manToken
lexGroup :: PandocMonad m => ManLexer m ManTokens
lexGroup = do
groupstart
mconcat <$> manyTill manToken groupend
where
groupstart = try $ string "\\{\\" >> newline
groupend = try $ string "\\}" >> eofline
skipConditional :: PandocMonad m => ManLexer m ManTokens
skipConditional = do
rest <- anyLine
when ("\\{\\" `isSuffixOf` rest) $
void $ manyTill anyChar (try (string "\\}"))
return mempty
lexIncludeFile :: PandocMonad m => [Arg] -> ManLexer m ManTokens
lexIncludeFile args = do
pos <- getPosition
case args of
(f:_) -> do
let fp = linePartsToString f
dirs <- getResourcePath
result <- readFileFromDirs dirs fp
case result of
Nothing -> report $ CouldNotLoadIncludeFile fp pos
Just s -> getInput >>= setInput . (s ++)
return mempty
[] -> return mempty
resolveMacro :: PandocMonad m
=> String -> [Arg] -> SourcePos -> ManLexer m ManTokens
resolveMacro macroName args pos = do
macros <- customMacros <$> getState
case M.lookup macroName macros of
Nothing -> return $ singleTok $ MMacro macroName args pos
Just ts -> do
let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
fillLP (MacroArg i) zs =
case drop (i - 1) args of
[] -> zs
(ys:_) -> ys ++ zs
let fillMacroArg (MLine lineparts) =
MLine (foldr fillLP [] lineparts)
fillMacroArg x = x
return $ ManTokens . fmap fillMacroArg . unManTokens $ ts
lexStringDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
lexStringDef args = do -- string definition
case args of
[] -> fail "No argument to .ds"
(x:ys) -> do
let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys)
let stringName = linePartsToString x
modifyState $ \st ->
st{ customMacros = M.insert stringName ts (customMacros st) }
return mempty
lexMacroDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
lexMacroDef args = do -- macro definition
(macroName, stopMacro) <-
case args of
(x : y : _) -> return (linePartsToString x, linePartsToString y)
-- optional second arg
(x:_) -> return (linePartsToString x, ".")
[] -> fail "No argument to .de"
let stop = try $ do
char '.' <|> char '\''
many spacetab
string stopMacro
_ <- lexArgs
return ()
ts <- mconcat <$> manyTill manToken stop
modifyState $ \st ->
st{ customMacros = M.insert macroName ts (customMacros st) }
return mempty
lexArgs :: PandocMonad m => ManLexer m [Arg]
lexArgs = do
args <- many $ try oneArg
skipMany spacetab
eofline
return args
where
oneArg :: PandocMonad m => ManLexer m [LinePart]
oneArg = do
skipMany $ try $ string "\\\n" -- continuation line
try quotedArg <|> plainArg
-- try, because there are some erroneous files, e.g. linux/bpf.2
plainArg :: PandocMonad m => ManLexer m [LinePart]
plainArg = do
skipMany spacetab
mconcat <$> many1
(macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar)
where
unescapedQuote = do
char '"'
fonts <- currentFont
return [RoffStr ("\"", fonts)]
quotedArg :: PandocMonad m => ManLexer m [LinePart]
quotedArg = do
skipMany spacetab
char '"'
xs <- mconcat <$>
many (macroArg <|> esc <|> escStar <|> regularText
<|> spaceTabChar <|> escapedQuote)
char '"'
return xs
where
escapedQuote = try $ do
char '"'
char '"'
fonts <- currentFont
return [RoffStr ("\"", fonts)]
escStar :: PandocMonad m => ManLexer m [LinePart]
escStar = try $ do
pos <- getPosition
char '\\'
char '*'
c <- anyChar
case c of
'(' -> do
cs <- count 2 anyChar
resolveString cs pos
'[' -> do
cs <- many (noneOf "\t\n\r ]")
char ']'
resolveString cs pos
'S' -> return mempty -- switch back to default font size
_ -> resolveString [c] pos
where
-- strings and macros share namespace
resolveString stringname pos = do
ManTokens ts <- resolveMacro stringname [] pos
case Foldable.toList ts of
[MLine xs] -> return xs
_ -> do
report $ SkippedContent ("unknown string " ++ stringname) pos
return mempty
lexLine :: PandocMonad m => ManLexer m ManTokens
lexLine = do
lnparts <- mconcat <$> many1 linePart
eofline
go lnparts
where -- return empty line if we only have empty strings;
-- this can happen if the line just contains \f[C], for example.
go [] = return mempty
go (RoffStr ("",_):xs) = go xs
go xs = return $ singleTok $ MLine xs
linePart :: PandocMonad m => ManLexer m [LinePart]
linePart = macroArg <|> esc <|> escStar <|>
regularText <|> quoteChar <|> spaceTabChar
macroArg :: PandocMonad m => ManLexer m [LinePart]
macroArg = try $ do
string "\\\\$"
x <- digit
return [MacroArg $ ord x - ord '0']
esc :: PandocMonad m => ManLexer m [LinePart]
esc = do
s <- escapeLexer
font <- currentFont
return [RoffStr (s, font)]
regularText :: PandocMonad m => ManLexer m [LinePart]
regularText = do
s <- many1 $ noneOf "\n\r\t \\\""
font <- currentFont
return [RoffStr (s, font)]
quoteChar :: PandocMonad m => ManLexer m [LinePart]
quoteChar = do
char '"'
font <- currentFont
return [RoffStr ("\"", font)]
spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
spaceTabChar = do
c <- spacetab
font <- currentFont
return [RoffStr ([c], font)]
lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
manToken :: PandocMonad m => ManLexer m ManTokens
manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
parseMan :: PandocMonad m => ManParser m Pandoc
parseMan = do
@ -656,12 +203,6 @@ linePartsToInlines = go
isItalic (RoffStr (_,f)) = Italic `S.member` f
isItalic _ = False
linePartsToString :: [LinePart] -> String
linePartsToString = mconcat . map go
where
go (RoffStr (s, _)) = s
go _ = mempty
parsePara :: PandocMonad m => ManParser m Blocks
parsePara = para . trimInlines <$> parseInlines