Added Text.Pandoc.Readers.Groff.
This is an internal module that exports a tokenizer for groff formats. Closes #4998.
This commit is contained in:
parent
123381765c
commit
6c71100fcf
3 changed files with 531 additions and 465 deletions
|
@ -534,6 +534,7 @@ library
|
|||
Text.Pandoc.Readers.Org.ParserState,
|
||||
Text.Pandoc.Readers.Org.Parsing,
|
||||
Text.Pandoc.Readers.Org.Shared,
|
||||
Text.Pandoc.Readers.Groff,
|
||||
Text.Pandoc.Writers.Groff,
|
||||
Text.Pandoc.Writers.Powerpoint.Presentation,
|
||||
Text.Pandoc.Writers.Powerpoint.Output,
|
||||
|
|
524
src/Text/Pandoc/Readers/Groff.hs
Normal file
524
src/Text/Pandoc/Readers/Groff.hs
Normal file
|
@ -0,0 +1,524 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||
{-
|
||||
Copyright (C) 2018 Yan Pashkovsky <yanp.bugz@gmail.com>
|
||||
and John MacFarlane
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
-}
|
||||
|
||||
{- |
|
||||
Module : Text.Pandoc.Readers.Groff
|
||||
Copyright : Copyright (C) 2018 Yan Pashkovsky and John MacFarlane
|
||||
License : GNU GPL, version 2 or above
|
||||
|
||||
Maintainer : Yan Pashkovsky <yanp.bugz@gmail.com>
|
||||
Stability : WIP
|
||||
Portability : portable
|
||||
|
||||
Tokenizer for groff formats (man, ms).
|
||||
-}
|
||||
module Text.Pandoc.Readers.Groff
|
||||
( FontKind(..)
|
||||
, Font
|
||||
, MacroKind
|
||||
, LinePart(..)
|
||||
, Arg
|
||||
, ManToken(..)
|
||||
, ManTokens(..)
|
||||
, singleTok
|
||||
, RoffState(..)
|
||||
, ManLexer
|
||||
, manToken
|
||||
, linePartsToString
|
||||
)
|
||||
where
|
||||
|
||||
import Prelude
|
||||
import Control.Monad (void, mzero, guard, when)
|
||||
import Text.Pandoc.Class
|
||||
(getResourcePath, readFileFromDirs, PandocMonad(..), report)
|
||||
import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
|
||||
import Data.Default (Default)
|
||||
import qualified Data.Map as M
|
||||
import Data.Set (Set)
|
||||
import qualified Data.Set as S
|
||||
import Data.List (intercalate, isSuffixOf)
|
||||
import qualified Data.Text as T
|
||||
import Text.Pandoc.Builder as B
|
||||
import Text.Pandoc.Logging (LogMessage(..))
|
||||
import Text.Pandoc.Options
|
||||
import Text.Pandoc.Parsing
|
||||
import Text.Pandoc.Shared (safeRead)
|
||||
import Text.Parsec hiding (tokenPrim)
|
||||
import qualified Text.Parsec as Parsec
|
||||
import Text.Pandoc.GroffChar (characterCodes, combiningAccents)
|
||||
import qualified Data.Sequence as Seq
|
||||
import qualified Data.Foldable as Foldable
|
||||
import qualified Data.Text.Normalize as Normalize
|
||||
|
||||
-- import Debug.Trace (traceShowId)
|
||||
|
||||
--
|
||||
-- Data Types
|
||||
--
|
||||
data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
|
||||
|
||||
type MacroKind = String
|
||||
|
||||
type Font = Set FontKind
|
||||
|
||||
data LinePart = RoffStr (String, Font)
|
||||
| MacroArg Int
|
||||
deriving Show
|
||||
|
||||
type Arg = [LinePart]
|
||||
|
||||
-- TODO parse tables (see man tbl)
|
||||
data ManToken = MLine [LinePart]
|
||||
| MEmptyLine
|
||||
| MMacro MacroKind [Arg] SourcePos
|
||||
| MTable [Alignment] ManTokens [ManTokens] [[ManTokens]]
|
||||
deriving Show
|
||||
|
||||
newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
|
||||
deriving (Show, Semigroup, Monoid)
|
||||
|
||||
singleTok :: ManToken -> ManTokens
|
||||
singleTok t = ManTokens (Seq.singleton t)
|
||||
|
||||
data RoffState = RoffState { fontKind :: Font
|
||||
, customMacros :: M.Map String ManTokens
|
||||
} deriving Show
|
||||
|
||||
instance Default RoffState where
|
||||
def = RoffState { customMacros = M.fromList
|
||||
$ map (\(n, s) ->
|
||||
(n, singleTok
|
||||
(MLine [RoffStr (s, mempty)])))
|
||||
[ ("Tm", "\x2122")
|
||||
, ("lq", "\x201C")
|
||||
, ("rq", "\x201D")
|
||||
, ("R", "\x00AE") ]
|
||||
, fontKind = S.singleton Regular }
|
||||
|
||||
type ManLexer m = ParserT [Char] RoffState m
|
||||
|
||||
--
|
||||
-- Lexer: String -> ManToken
|
||||
--
|
||||
|
||||
eofline :: Stream s m Char => ParsecT s u m ()
|
||||
eofline = void newline <|> eof
|
||||
|
||||
spacetab :: Stream s m Char => ParsecT s u m Char
|
||||
spacetab = char ' ' <|> char '\t'
|
||||
|
||||
characterCodeMap :: M.Map String Char
|
||||
characterCodeMap =
|
||||
M.fromList $ map (\(x,y) -> (y,x)) characterCodes
|
||||
|
||||
combiningAccentsMap :: M.Map String Char
|
||||
combiningAccentsMap =
|
||||
M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
|
||||
|
||||
escapeLexer :: PandocMonad m => ManLexer m String
|
||||
escapeLexer = try $ do
|
||||
char '\\'
|
||||
c <- noneOf ['*','$'] -- see escStar, macroArg
|
||||
case c of
|
||||
'(' -> twoCharGlyph
|
||||
'[' -> bracketedGlyph
|
||||
'f' -> escFont
|
||||
's' -> escFontSize
|
||||
'"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
|
||||
'#' -> mempty <$ manyTill anyChar newline
|
||||
'%' -> return mempty
|
||||
'{' -> return mempty
|
||||
'}' -> return mempty
|
||||
'&' -> return mempty
|
||||
'\n' -> return mempty
|
||||
':' -> return mempty
|
||||
'0' -> return mempty
|
||||
'c' -> return mempty
|
||||
'-' -> return "-"
|
||||
'_' -> return "_"
|
||||
' ' -> return " "
|
||||
'\\' -> return "\\"
|
||||
't' -> return "\t"
|
||||
'e' -> return "\\"
|
||||
'`' -> return "`"
|
||||
'^' -> return " "
|
||||
'|' -> return " "
|
||||
'\'' -> return "`"
|
||||
'.' -> return "`"
|
||||
'~' -> return "\160" -- nonbreaking space
|
||||
_ -> escUnknown ['\\',c] "\xFFFD"
|
||||
|
||||
where
|
||||
|
||||
twoCharGlyph = do
|
||||
cs <- count 2 anyChar
|
||||
case M.lookup cs characterCodeMap of
|
||||
Just c -> return [c]
|
||||
Nothing -> escUnknown ('\\':'(':cs) "\xFFFD"
|
||||
|
||||
bracketedGlyph = unicodeGlyph <|> charGlyph
|
||||
|
||||
charGlyph = do
|
||||
cs <- manyTill (noneOf ['[',']','\n']) (char ']')
|
||||
(case words cs of
|
||||
[] -> mzero
|
||||
[s] -> case M.lookup s characterCodeMap of
|
||||
Nothing -> mzero
|
||||
Just c -> return [c]
|
||||
(s:ss) -> do
|
||||
basechar <- case M.lookup cs characterCodeMap of
|
||||
Nothing ->
|
||||
case s of
|
||||
[ch] | isAscii ch && isAlphaNum ch ->
|
||||
return ch
|
||||
_ -> mzero
|
||||
Just c -> return c
|
||||
let addAccents [] xs = return $ T.unpack .
|
||||
Normalize.normalize Normalize.NFC .
|
||||
T.pack $ reverse xs
|
||||
addAccents (a:as) xs =
|
||||
case M.lookup a combiningAccentsMap of
|
||||
Just x -> addAccents as (x:xs)
|
||||
Nothing -> mzero
|
||||
addAccents ss [basechar])
|
||||
<|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD"
|
||||
|
||||
unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']'
|
||||
|
||||
ucharCode = try $ do
|
||||
char 'u'
|
||||
cs <- many1 (satisfy isHexDigit)
|
||||
let lcs = length cs
|
||||
guard $ lcs >= 4 && lcs <= 6
|
||||
case chr <$> safeRead ('0':'x':cs) of
|
||||
Nothing -> mzero
|
||||
Just c -> return c
|
||||
|
||||
-- \s-1 \s0 -- we ignore these
|
||||
escFontSize :: PandocMonad m => ManLexer m String
|
||||
escFontSize = do
|
||||
pos <- getPosition
|
||||
pm <- option "" $ count 1 (oneOf "+-")
|
||||
ds <- many1 digit
|
||||
report $ SkippedContent ("\\s" ++ pm ++ ds) pos
|
||||
return mempty
|
||||
|
||||
escFont :: PandocMonad m => ManLexer m String
|
||||
escFont = do
|
||||
font <- choice
|
||||
[ S.singleton <$> letterFontKind
|
||||
, char '(' >> anyChar >> anyChar >> return (S.singleton Regular)
|
||||
, char 'S' >> return (S.singleton Regular)
|
||||
, try lettersFont
|
||||
, digit >> return (S.singleton Regular)
|
||||
]
|
||||
modifyState (\r -> r {fontKind = font})
|
||||
return mempty
|
||||
|
||||
lettersFont :: PandocMonad m => ManLexer m Font
|
||||
lettersFont = do
|
||||
char '['
|
||||
fs <- many letterFontKind
|
||||
skipMany letter
|
||||
char ']'
|
||||
return $ S.fromList fs
|
||||
|
||||
letterFontKind :: PandocMonad m => ManLexer m FontKind
|
||||
letterFontKind = choice [
|
||||
oneOf ['B','b'] >> return Bold
|
||||
, oneOf ['I','i'] >> return Italic
|
||||
, oneOf ['C','c'] >> return Monospace
|
||||
, oneOf ['P','p','R','r'] >> return Regular
|
||||
]
|
||||
|
||||
escUnknown :: PandocMonad m => String -> a -> ManLexer m a
|
||||
escUnknown s x = do
|
||||
pos <- getPosition
|
||||
report $ SkippedContent ("Unknown escape sequence " ++ s) pos
|
||||
return x
|
||||
|
||||
currentFont :: PandocMonad m => ManLexer m Font
|
||||
currentFont = fontKind <$> getState
|
||||
|
||||
-- separate function from lexMacro since real man files sometimes do not follow the rules
|
||||
lexComment :: PandocMonad m => ManLexer m ManTokens
|
||||
lexComment = do
|
||||
try $ string ".\\\""
|
||||
many Parsec.space
|
||||
skipMany $ noneOf "\n"
|
||||
char '\n'
|
||||
return mempty
|
||||
|
||||
lexMacro :: PandocMonad m => ManLexer m ManTokens
|
||||
lexMacro = do
|
||||
pos <- getPosition
|
||||
char '.' <|> char '\''
|
||||
many spacetab
|
||||
macroName <- many (satisfy (not . isSpace))
|
||||
case macroName of
|
||||
"nop" -> return mempty
|
||||
"ie" -> lexConditional
|
||||
"if" -> lexConditional
|
||||
"el" -> skipConditional
|
||||
"TS" -> lexTable
|
||||
|
||||
_ -> do
|
||||
args <- lexArgs
|
||||
case macroName of
|
||||
"" -> return mempty
|
||||
"\\\"" -> return mempty
|
||||
"\\#" -> return mempty
|
||||
"de" -> lexMacroDef args
|
||||
"de1" -> lexMacroDef args
|
||||
"ds" -> lexStringDef args
|
||||
"ds1" -> lexStringDef args
|
||||
"sp" -> return $ singleTok MEmptyLine
|
||||
"so" -> lexIncludeFile args
|
||||
_ -> resolveMacro macroName args pos
|
||||
|
||||
-- | TODO placeholder
|
||||
lexTable :: PandocMonad m => ManLexer m ManTokens
|
||||
lexTable = do
|
||||
pos <- getPosition
|
||||
manyTill anyLine (try (string ".TE" >> many spacetab >> eofline))
|
||||
report $ SkippedContent "table" pos
|
||||
return mempty
|
||||
|
||||
-- We don't fully handle the conditional. But we do
|
||||
-- include everything under '.ie n', which occurs commonly
|
||||
-- in man pages. We always skip the '.el' part.
|
||||
lexConditional :: PandocMonad m => ManLexer m ManTokens
|
||||
lexConditional = do
|
||||
skipMany spacetab
|
||||
parseNCond <|> skipConditional
|
||||
|
||||
-- n means nroff mode
|
||||
parseNCond :: PandocMonad m => ManLexer m ManTokens
|
||||
parseNCond = do
|
||||
char '\n'
|
||||
many1 spacetab
|
||||
lexGroup <|> manToken
|
||||
|
||||
lexGroup :: PandocMonad m => ManLexer m ManTokens
|
||||
lexGroup = do
|
||||
groupstart
|
||||
mconcat <$> manyTill manToken groupend
|
||||
where
|
||||
groupstart = try $ string "\\{\\" >> newline
|
||||
groupend = try $ string "\\}" >> eofline
|
||||
|
||||
skipConditional :: PandocMonad m => ManLexer m ManTokens
|
||||
skipConditional = do
|
||||
rest <- anyLine
|
||||
when ("\\{\\" `isSuffixOf` rest) $
|
||||
void $ manyTill anyChar (try (string "\\}"))
|
||||
return mempty
|
||||
|
||||
lexIncludeFile :: PandocMonad m => [Arg] -> ManLexer m ManTokens
|
||||
lexIncludeFile args = do
|
||||
pos <- getPosition
|
||||
case args of
|
||||
(f:_) -> do
|
||||
let fp = linePartsToString f
|
||||
dirs <- getResourcePath
|
||||
result <- readFileFromDirs dirs fp
|
||||
case result of
|
||||
Nothing -> report $ CouldNotLoadIncludeFile fp pos
|
||||
Just s -> getInput >>= setInput . (s ++)
|
||||
return mempty
|
||||
[] -> return mempty
|
||||
|
||||
resolveMacro :: PandocMonad m
|
||||
=> String -> [Arg] -> SourcePos -> ManLexer m ManTokens
|
||||
resolveMacro macroName args pos = do
|
||||
macros <- customMacros <$> getState
|
||||
case M.lookup macroName macros of
|
||||
Nothing -> return $ singleTok $ MMacro macroName args pos
|
||||
Just ts -> do
|
||||
let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
|
||||
fillLP (MacroArg i) zs =
|
||||
case drop (i - 1) args of
|
||||
[] -> zs
|
||||
(ys:_) -> ys ++ zs
|
||||
let fillMacroArg (MLine lineparts) =
|
||||
MLine (foldr fillLP [] lineparts)
|
||||
fillMacroArg x = x
|
||||
return $ ManTokens . fmap fillMacroArg . unManTokens $ ts
|
||||
|
||||
lexStringDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
|
||||
lexStringDef args = do -- string definition
|
||||
case args of
|
||||
[] -> fail "No argument to .ds"
|
||||
(x:ys) -> do
|
||||
let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys)
|
||||
let stringName = linePartsToString x
|
||||
modifyState $ \st ->
|
||||
st{ customMacros = M.insert stringName ts (customMacros st) }
|
||||
return mempty
|
||||
|
||||
lexMacroDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
|
||||
lexMacroDef args = do -- macro definition
|
||||
(macroName, stopMacro) <-
|
||||
case args of
|
||||
(x : y : _) -> return (linePartsToString x, linePartsToString y)
|
||||
-- optional second arg
|
||||
(x:_) -> return (linePartsToString x, ".")
|
||||
[] -> fail "No argument to .de"
|
||||
let stop = try $ do
|
||||
char '.' <|> char '\''
|
||||
many spacetab
|
||||
string stopMacro
|
||||
_ <- lexArgs
|
||||
return ()
|
||||
ts <- mconcat <$> manyTill manToken stop
|
||||
modifyState $ \st ->
|
||||
st{ customMacros = M.insert macroName ts (customMacros st) }
|
||||
return mempty
|
||||
|
||||
lexArgs :: PandocMonad m => ManLexer m [Arg]
|
||||
lexArgs = do
|
||||
args <- many $ try oneArg
|
||||
skipMany spacetab
|
||||
eofline
|
||||
return args
|
||||
|
||||
where
|
||||
|
||||
oneArg :: PandocMonad m => ManLexer m [LinePart]
|
||||
oneArg = do
|
||||
skipMany $ try $ string "\\\n" -- continuation line
|
||||
try quotedArg <|> plainArg
|
||||
-- try, because there are some erroneous files, e.g. linux/bpf.2
|
||||
|
||||
plainArg :: PandocMonad m => ManLexer m [LinePart]
|
||||
plainArg = do
|
||||
skipMany spacetab
|
||||
mconcat <$> many1
|
||||
(macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar)
|
||||
where
|
||||
unescapedQuote = do
|
||||
char '"'
|
||||
fonts <- currentFont
|
||||
return [RoffStr ("\"", fonts)]
|
||||
|
||||
|
||||
quotedArg :: PandocMonad m => ManLexer m [LinePart]
|
||||
quotedArg = do
|
||||
skipMany spacetab
|
||||
char '"'
|
||||
xs <- mconcat <$>
|
||||
many (macroArg <|> esc <|> escStar <|> regularText
|
||||
<|> spaceTabChar <|> escapedQuote)
|
||||
char '"'
|
||||
return xs
|
||||
where
|
||||
escapedQuote = try $ do
|
||||
char '"'
|
||||
char '"'
|
||||
fonts <- currentFont
|
||||
return [RoffStr ("\"", fonts)]
|
||||
|
||||
escStar :: PandocMonad m => ManLexer m [LinePart]
|
||||
escStar = try $ do
|
||||
pos <- getPosition
|
||||
char '\\'
|
||||
char '*'
|
||||
c <- anyChar
|
||||
case c of
|
||||
'(' -> do
|
||||
cs <- count 2 anyChar
|
||||
resolveString cs pos
|
||||
'[' -> do
|
||||
cs <- many (noneOf "\t\n\r ]")
|
||||
char ']'
|
||||
resolveString cs pos
|
||||
'S' -> return mempty -- switch back to default font size
|
||||
_ -> resolveString [c] pos
|
||||
|
||||
where
|
||||
|
||||
-- strings and macros share namespace
|
||||
resolveString stringname pos = do
|
||||
ManTokens ts <- resolveMacro stringname [] pos
|
||||
case Foldable.toList ts of
|
||||
[MLine xs] -> return xs
|
||||
_ -> do
|
||||
report $ SkippedContent ("unknown string " ++ stringname) pos
|
||||
return mempty
|
||||
|
||||
lexLine :: PandocMonad m => ManLexer m ManTokens
|
||||
lexLine = do
|
||||
lnparts <- mconcat <$> many1 linePart
|
||||
eofline
|
||||
go lnparts
|
||||
where -- return empty line if we only have empty strings;
|
||||
-- this can happen if the line just contains \f[C], for example.
|
||||
go [] = return mempty
|
||||
go (RoffStr ("",_):xs) = go xs
|
||||
go xs = return $ singleTok $ MLine xs
|
||||
|
||||
linePart :: PandocMonad m => ManLexer m [LinePart]
|
||||
linePart = macroArg <|> esc <|> escStar <|>
|
||||
regularText <|> quoteChar <|> spaceTabChar
|
||||
|
||||
macroArg :: PandocMonad m => ManLexer m [LinePart]
|
||||
macroArg = try $ do
|
||||
string "\\\\$"
|
||||
x <- digit
|
||||
return [MacroArg $ ord x - ord '0']
|
||||
|
||||
esc :: PandocMonad m => ManLexer m [LinePart]
|
||||
esc = do
|
||||
s <- escapeLexer
|
||||
font <- currentFont
|
||||
return [RoffStr (s, font)]
|
||||
|
||||
regularText :: PandocMonad m => ManLexer m [LinePart]
|
||||
regularText = do
|
||||
s <- many1 $ noneOf "\n\r\t \\\""
|
||||
font <- currentFont
|
||||
return [RoffStr (s, font)]
|
||||
|
||||
quoteChar :: PandocMonad m => ManLexer m [LinePart]
|
||||
quoteChar = do
|
||||
char '"'
|
||||
font <- currentFont
|
||||
return [RoffStr ("\"", font)]
|
||||
|
||||
spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
|
||||
spaceTabChar = do
|
||||
c <- spacetab
|
||||
font <- currentFont
|
||||
return [RoffStr ([c], font)]
|
||||
|
||||
lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
|
||||
lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
|
||||
|
||||
manToken :: PandocMonad m => ManLexer m ManTokens
|
||||
manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
|
||||
|
||||
linePartsToString :: [LinePart] -> String
|
||||
linePartsToString = mconcat . map go
|
||||
where
|
||||
go (RoffStr (s, _)) = s
|
||||
go _ = mempty
|
|
@ -18,8 +18,6 @@ You should have received a copy of the GNU General Public License
|
|||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
|
||||
|
||||
-}
|
||||
|
||||
{- |
|
||||
|
@ -36,76 +34,25 @@ Conversion of man to 'Pandoc' document.
|
|||
module Text.Pandoc.Readers.Man (readMan) where
|
||||
|
||||
import Prelude
|
||||
import Control.Monad (liftM, void, mzero, guard, when)
|
||||
import Control.Monad.Except (throwError)
|
||||
import Text.Pandoc.Class
|
||||
(getResourcePath, readFileFromDirs, PandocMonad(..), report)
|
||||
import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
|
||||
import Data.Default (Default)
|
||||
import Control.Monad (liftM, mzero, guard)
|
||||
import Control.Monad.Except (throwError)
|
||||
import Text.Pandoc.Class (PandocMonad(..), report)
|
||||
import Data.Maybe (catMaybes)
|
||||
import qualified Data.Map as M
|
||||
import Data.Set (Set)
|
||||
import qualified Data.Set as S
|
||||
import Data.List (intersperse, intercalate, isSuffixOf)
|
||||
import Data.List (intersperse, intercalate)
|
||||
import qualified Data.Text as T
|
||||
import Text.Pandoc.Builder as B
|
||||
import Text.Pandoc.Error (PandocError (PandocParsecError))
|
||||
import Text.Pandoc.Logging (LogMessage(..))
|
||||
import Text.Pandoc.Options
|
||||
import Text.Pandoc.Parsing
|
||||
import Text.Pandoc.Shared (crFilter, safeRead)
|
||||
import Text.Pandoc.Shared (crFilter)
|
||||
import Text.Pandoc.Readers.Groff -- TODO explicit imports
|
||||
import Text.Parsec hiding (tokenPrim)
|
||||
import qualified Text.Parsec as Parsec
|
||||
import Text.Parsec.Pos (updatePosString)
|
||||
import Text.Pandoc.GroffChar (characterCodes, combiningAccents)
|
||||
import qualified Data.Sequence as Seq
|
||||
import qualified Data.Foldable as Foldable
|
||||
import qualified Data.Text.Normalize as Normalize
|
||||
|
||||
-- import Debug.Trace (traceShowId)
|
||||
|
||||
--
|
||||
-- Data Types
|
||||
--
|
||||
data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
|
||||
|
||||
type MacroKind = String
|
||||
|
||||
type Font = Set FontKind
|
||||
|
||||
data LinePart = RoffStr (String, Font)
|
||||
| MacroArg Int
|
||||
deriving Show
|
||||
|
||||
type Arg = [LinePart]
|
||||
|
||||
-- TODO parse tables (see man tbl)
|
||||
data ManToken = MLine [LinePart]
|
||||
| MEmptyLine
|
||||
| MMacro MacroKind [Arg] SourcePos
|
||||
| MTable [Alignment] ManTokens [ManTokens] [[ManTokens]]
|
||||
deriving Show
|
||||
|
||||
newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
|
||||
deriving (Show, Semigroup, Monoid)
|
||||
|
||||
singleTok :: ManToken -> ManTokens
|
||||
singleTok t = ManTokens (Seq.singleton t)
|
||||
|
||||
data RoffState = RoffState { fontKind :: Font
|
||||
, customMacros :: M.Map String ManTokens
|
||||
} deriving Show
|
||||
|
||||
instance Default RoffState where
|
||||
def = RoffState { customMacros = M.fromList
|
||||
$ map (\(n, s) ->
|
||||
(n, singleTok
|
||||
(MLine [RoffStr (s, mempty)])))
|
||||
[ ("Tm", "\x2122")
|
||||
, ("lq", "\x201C")
|
||||
, ("rq", "\x201D")
|
||||
, ("R", "\x00AE") ]
|
||||
, fontKind = S.singleton Regular }
|
||||
|
||||
data ManState = ManState { readerOptions :: ReaderOptions
|
||||
, metadata :: Meta
|
||||
|
@ -115,7 +62,6 @@ instance Default ManState where
|
|||
def = ManState { readerOptions = def
|
||||
, metadata = nullMeta }
|
||||
|
||||
type ManLexer m = ParserT [Char] RoffState m
|
||||
type ManParser m = ParserT [ManToken] ManState m
|
||||
|
||||
|
||||
|
@ -147,405 +93,6 @@ readMan opts txt = do
|
|||
mapLeft f (Left x) = Left $ f x
|
||||
mapLeft _ (Right r) = Right r
|
||||
|
||||
--
|
||||
-- Lexer: String -> ManToken
|
||||
--
|
||||
|
||||
eofline :: Stream s m Char => ParsecT s u m ()
|
||||
eofline = void newline <|> eof
|
||||
|
||||
spacetab :: Stream s m Char => ParsecT s u m Char
|
||||
spacetab = char ' ' <|> char '\t'
|
||||
|
||||
characterCodeMap :: M.Map String Char
|
||||
characterCodeMap =
|
||||
M.fromList $ map (\(x,y) -> (y,x)) characterCodes
|
||||
|
||||
combiningAccentsMap :: M.Map String Char
|
||||
combiningAccentsMap =
|
||||
M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
|
||||
|
||||
escapeLexer :: PandocMonad m => ManLexer m String
|
||||
escapeLexer = try $ do
|
||||
char '\\'
|
||||
c <- noneOf ['*','$'] -- see escStar, macroArg
|
||||
case c of
|
||||
'(' -> twoCharGlyph
|
||||
'[' -> bracketedGlyph
|
||||
'f' -> escFont
|
||||
's' -> escFontSize
|
||||
'"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
|
||||
'#' -> mempty <$ manyTill anyChar newline
|
||||
'%' -> return mempty
|
||||
'{' -> return mempty
|
||||
'}' -> return mempty
|
||||
'&' -> return mempty
|
||||
'\n' -> return mempty
|
||||
':' -> return mempty
|
||||
'0' -> return mempty
|
||||
'c' -> return mempty
|
||||
'-' -> return "-"
|
||||
'_' -> return "_"
|
||||
' ' -> return " "
|
||||
'\\' -> return "\\"
|
||||
't' -> return "\t"
|
||||
'e' -> return "\\"
|
||||
'`' -> return "`"
|
||||
'^' -> return " "
|
||||
'|' -> return " "
|
||||
'\'' -> return "`"
|
||||
'.' -> return "`"
|
||||
'~' -> return "\160" -- nonbreaking space
|
||||
_ -> escUnknown ['\\',c] "\xFFFD"
|
||||
|
||||
where
|
||||
|
||||
twoCharGlyph = do
|
||||
cs <- count 2 anyChar
|
||||
case M.lookup cs characterCodeMap of
|
||||
Just c -> return [c]
|
||||
Nothing -> escUnknown ('\\':'(':cs) "\xFFFD"
|
||||
|
||||
bracketedGlyph = unicodeGlyph <|> charGlyph
|
||||
|
||||
charGlyph = do
|
||||
cs <- manyTill (noneOf ['[',']','\n']) (char ']')
|
||||
(case words cs of
|
||||
[] -> mzero
|
||||
[s] -> case M.lookup s characterCodeMap of
|
||||
Nothing -> mzero
|
||||
Just c -> return [c]
|
||||
(s:ss) -> do
|
||||
basechar <- case M.lookup cs characterCodeMap of
|
||||
Nothing ->
|
||||
case s of
|
||||
[ch] | isAscii ch && isAlphaNum ch ->
|
||||
return ch
|
||||
_ -> mzero
|
||||
Just c -> return c
|
||||
let addAccents [] xs = return $ T.unpack .
|
||||
Normalize.normalize Normalize.NFC .
|
||||
T.pack $ reverse xs
|
||||
addAccents (a:as) xs =
|
||||
case M.lookup a combiningAccentsMap of
|
||||
Just x -> addAccents as (x:xs)
|
||||
Nothing -> mzero
|
||||
addAccents ss [basechar])
|
||||
<|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD"
|
||||
|
||||
unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']'
|
||||
|
||||
ucharCode = try $ do
|
||||
char 'u'
|
||||
cs <- many1 (satisfy isHexDigit)
|
||||
let lcs = length cs
|
||||
guard $ lcs >= 4 && lcs <= 6
|
||||
case chr <$> safeRead ('0':'x':cs) of
|
||||
Nothing -> mzero
|
||||
Just c -> return c
|
||||
|
||||
-- \s-1 \s0 -- we ignore these
|
||||
escFontSize :: PandocMonad m => ManLexer m String
|
||||
escFontSize = do
|
||||
pos <- getPosition
|
||||
pm <- option "" $ count 1 (oneOf "+-")
|
||||
ds <- many1 digit
|
||||
report $ SkippedContent ("\\s" ++ pm ++ ds) pos
|
||||
return mempty
|
||||
|
||||
escFont :: PandocMonad m => ManLexer m String
|
||||
escFont = do
|
||||
font <- choice
|
||||
[ S.singleton <$> letterFontKind
|
||||
, char '(' >> anyChar >> anyChar >> return (S.singleton Regular)
|
||||
, char 'S' >> return (S.singleton Regular)
|
||||
, try lettersFont
|
||||
, digit >> return (S.singleton Regular)
|
||||
]
|
||||
modifyState (\r -> r {fontKind = font})
|
||||
return mempty
|
||||
|
||||
lettersFont :: PandocMonad m => ManLexer m Font
|
||||
lettersFont = do
|
||||
char '['
|
||||
fs <- many letterFontKind
|
||||
skipMany letter
|
||||
char ']'
|
||||
return $ S.fromList fs
|
||||
|
||||
letterFontKind :: PandocMonad m => ManLexer m FontKind
|
||||
letterFontKind = choice [
|
||||
oneOf ['B','b'] >> return Bold
|
||||
, oneOf ['I','i'] >> return Italic
|
||||
, oneOf ['C','c'] >> return Monospace
|
||||
, oneOf ['P','p','R','r'] >> return Regular
|
||||
]
|
||||
|
||||
escUnknown :: PandocMonad m => String -> a -> ManLexer m a
|
||||
escUnknown s x = do
|
||||
pos <- getPosition
|
||||
report $ SkippedContent ("Unknown escape sequence " ++ s) pos
|
||||
return x
|
||||
|
||||
currentFont :: PandocMonad m => ManLexer m Font
|
||||
currentFont = fontKind <$> getState
|
||||
|
||||
-- separate function from lexMacro since real man files sometimes do not follow the rules
|
||||
lexComment :: PandocMonad m => ManLexer m ManTokens
|
||||
lexComment = do
|
||||
try $ string ".\\\""
|
||||
many Parsec.space
|
||||
skipMany $ noneOf "\n"
|
||||
char '\n'
|
||||
return mempty
|
||||
|
||||
lexMacro :: PandocMonad m => ManLexer m ManTokens
|
||||
lexMacro = do
|
||||
pos <- getPosition
|
||||
char '.' <|> char '\''
|
||||
many spacetab
|
||||
macroName <- many (satisfy (not . isSpace))
|
||||
case macroName of
|
||||
"nop" -> return mempty
|
||||
"ie" -> lexConditional
|
||||
"if" -> lexConditional
|
||||
"el" -> skipConditional
|
||||
"TS" -> lexTable
|
||||
|
||||
_ -> do
|
||||
args <- lexArgs
|
||||
case macroName of
|
||||
"" -> return mempty
|
||||
"\\\"" -> return mempty
|
||||
"\\#" -> return mempty
|
||||
"de" -> lexMacroDef args
|
||||
"de1" -> lexMacroDef args
|
||||
"ds" -> lexStringDef args
|
||||
"ds1" -> lexStringDef args
|
||||
"sp" -> return $ singleTok MEmptyLine
|
||||
"so" -> lexIncludeFile args
|
||||
_ -> resolveMacro macroName args pos
|
||||
|
||||
-- | TODO placeholder
|
||||
lexTable :: PandocMonad m => ManLexer m ManTokens
|
||||
lexTable = do
|
||||
pos <- getPosition
|
||||
manyTill anyLine (try (string ".TE" >> many spacetab >> eofline))
|
||||
report $ SkippedContent "table" pos
|
||||
return mempty
|
||||
|
||||
-- We don't fully handle the conditional. But we do
|
||||
-- include everything under '.ie n', which occurs commonly
|
||||
-- in man pages. We always skip the '.el' part.
|
||||
lexConditional :: PandocMonad m => ManLexer m ManTokens
|
||||
lexConditional = do
|
||||
skipMany spacetab
|
||||
parseNCond <|> skipConditional
|
||||
|
||||
-- n means nroff mode
|
||||
parseNCond :: PandocMonad m => ManLexer m ManTokens
|
||||
parseNCond = do
|
||||
char '\n'
|
||||
many1 spacetab
|
||||
lexGroup <|> manToken
|
||||
|
||||
lexGroup :: PandocMonad m => ManLexer m ManTokens
|
||||
lexGroup = do
|
||||
groupstart
|
||||
mconcat <$> manyTill manToken groupend
|
||||
where
|
||||
groupstart = try $ string "\\{\\" >> newline
|
||||
groupend = try $ string "\\}" >> eofline
|
||||
|
||||
skipConditional :: PandocMonad m => ManLexer m ManTokens
|
||||
skipConditional = do
|
||||
rest <- anyLine
|
||||
when ("\\{\\" `isSuffixOf` rest) $
|
||||
void $ manyTill anyChar (try (string "\\}"))
|
||||
return mempty
|
||||
|
||||
lexIncludeFile :: PandocMonad m => [Arg] -> ManLexer m ManTokens
|
||||
lexIncludeFile args = do
|
||||
pos <- getPosition
|
||||
case args of
|
||||
(f:_) -> do
|
||||
let fp = linePartsToString f
|
||||
dirs <- getResourcePath
|
||||
result <- readFileFromDirs dirs fp
|
||||
case result of
|
||||
Nothing -> report $ CouldNotLoadIncludeFile fp pos
|
||||
Just s -> getInput >>= setInput . (s ++)
|
||||
return mempty
|
||||
[] -> return mempty
|
||||
|
||||
resolveMacro :: PandocMonad m
|
||||
=> String -> [Arg] -> SourcePos -> ManLexer m ManTokens
|
||||
resolveMacro macroName args pos = do
|
||||
macros <- customMacros <$> getState
|
||||
case M.lookup macroName macros of
|
||||
Nothing -> return $ singleTok $ MMacro macroName args pos
|
||||
Just ts -> do
|
||||
let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
|
||||
fillLP (MacroArg i) zs =
|
||||
case drop (i - 1) args of
|
||||
[] -> zs
|
||||
(ys:_) -> ys ++ zs
|
||||
let fillMacroArg (MLine lineparts) =
|
||||
MLine (foldr fillLP [] lineparts)
|
||||
fillMacroArg x = x
|
||||
return $ ManTokens . fmap fillMacroArg . unManTokens $ ts
|
||||
|
||||
lexStringDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
|
||||
lexStringDef args = do -- string definition
|
||||
case args of
|
||||
[] -> fail "No argument to .ds"
|
||||
(x:ys) -> do
|
||||
let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys)
|
||||
let stringName = linePartsToString x
|
||||
modifyState $ \st ->
|
||||
st{ customMacros = M.insert stringName ts (customMacros st) }
|
||||
return mempty
|
||||
|
||||
lexMacroDef :: PandocMonad m => [Arg] -> ManLexer m ManTokens
|
||||
lexMacroDef args = do -- macro definition
|
||||
(macroName, stopMacro) <-
|
||||
case args of
|
||||
(x : y : _) -> return (linePartsToString x, linePartsToString y)
|
||||
-- optional second arg
|
||||
(x:_) -> return (linePartsToString x, ".")
|
||||
[] -> fail "No argument to .de"
|
||||
let stop = try $ do
|
||||
char '.' <|> char '\''
|
||||
many spacetab
|
||||
string stopMacro
|
||||
_ <- lexArgs
|
||||
return ()
|
||||
ts <- mconcat <$> manyTill manToken stop
|
||||
modifyState $ \st ->
|
||||
st{ customMacros = M.insert macroName ts (customMacros st) }
|
||||
return mempty
|
||||
|
||||
lexArgs :: PandocMonad m => ManLexer m [Arg]
|
||||
lexArgs = do
|
||||
args <- many $ try oneArg
|
||||
skipMany spacetab
|
||||
eofline
|
||||
return args
|
||||
|
||||
where
|
||||
|
||||
oneArg :: PandocMonad m => ManLexer m [LinePart]
|
||||
oneArg = do
|
||||
skipMany $ try $ string "\\\n" -- continuation line
|
||||
try quotedArg <|> plainArg
|
||||
-- try, because there are some erroneous files, e.g. linux/bpf.2
|
||||
|
||||
plainArg :: PandocMonad m => ManLexer m [LinePart]
|
||||
plainArg = do
|
||||
skipMany spacetab
|
||||
mconcat <$> many1
|
||||
(macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar)
|
||||
where
|
||||
unescapedQuote = do
|
||||
char '"'
|
||||
fonts <- currentFont
|
||||
return [RoffStr ("\"", fonts)]
|
||||
|
||||
|
||||
quotedArg :: PandocMonad m => ManLexer m [LinePart]
|
||||
quotedArg = do
|
||||
skipMany spacetab
|
||||
char '"'
|
||||
xs <- mconcat <$>
|
||||
many (macroArg <|> esc <|> escStar <|> regularText
|
||||
<|> spaceTabChar <|> escapedQuote)
|
||||
char '"'
|
||||
return xs
|
||||
where
|
||||
escapedQuote = try $ do
|
||||
char '"'
|
||||
char '"'
|
||||
fonts <- currentFont
|
||||
return [RoffStr ("\"", fonts)]
|
||||
|
||||
escStar :: PandocMonad m => ManLexer m [LinePart]
|
||||
escStar = try $ do
|
||||
pos <- getPosition
|
||||
char '\\'
|
||||
char '*'
|
||||
c <- anyChar
|
||||
case c of
|
||||
'(' -> do
|
||||
cs <- count 2 anyChar
|
||||
resolveString cs pos
|
||||
'[' -> do
|
||||
cs <- many (noneOf "\t\n\r ]")
|
||||
char ']'
|
||||
resolveString cs pos
|
||||
'S' -> return mempty -- switch back to default font size
|
||||
_ -> resolveString [c] pos
|
||||
|
||||
where
|
||||
|
||||
-- strings and macros share namespace
|
||||
resolveString stringname pos = do
|
||||
ManTokens ts <- resolveMacro stringname [] pos
|
||||
case Foldable.toList ts of
|
||||
[MLine xs] -> return xs
|
||||
_ -> do
|
||||
report $ SkippedContent ("unknown string " ++ stringname) pos
|
||||
return mempty
|
||||
|
||||
lexLine :: PandocMonad m => ManLexer m ManTokens
|
||||
lexLine = do
|
||||
lnparts <- mconcat <$> many1 linePart
|
||||
eofline
|
||||
go lnparts
|
||||
where -- return empty line if we only have empty strings;
|
||||
-- this can happen if the line just contains \f[C], for example.
|
||||
go [] = return mempty
|
||||
go (RoffStr ("",_):xs) = go xs
|
||||
go xs = return $ singleTok $ MLine xs
|
||||
|
||||
linePart :: PandocMonad m => ManLexer m [LinePart]
|
||||
linePart = macroArg <|> esc <|> escStar <|>
|
||||
regularText <|> quoteChar <|> spaceTabChar
|
||||
|
||||
macroArg :: PandocMonad m => ManLexer m [LinePart]
|
||||
macroArg = try $ do
|
||||
string "\\\\$"
|
||||
x <- digit
|
||||
return [MacroArg $ ord x - ord '0']
|
||||
|
||||
esc :: PandocMonad m => ManLexer m [LinePart]
|
||||
esc = do
|
||||
s <- escapeLexer
|
||||
font <- currentFont
|
||||
return [RoffStr (s, font)]
|
||||
|
||||
regularText :: PandocMonad m => ManLexer m [LinePart]
|
||||
regularText = do
|
||||
s <- many1 $ noneOf "\n\r\t \\\""
|
||||
font <- currentFont
|
||||
return [RoffStr (s, font)]
|
||||
|
||||
quoteChar :: PandocMonad m => ManLexer m [LinePart]
|
||||
quoteChar = do
|
||||
char '"'
|
||||
font <- currentFont
|
||||
return [RoffStr ("\"", font)]
|
||||
|
||||
spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
|
||||
spaceTabChar = do
|
||||
c <- spacetab
|
||||
font <- currentFont
|
||||
return [RoffStr ([c], font)]
|
||||
|
||||
lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
|
||||
lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
|
||||
|
||||
manToken :: PandocMonad m => ManLexer m ManTokens
|
||||
manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
|
||||
|
||||
parseMan :: PandocMonad m => ManParser m Pandoc
|
||||
parseMan = do
|
||||
|
@ -656,12 +203,6 @@ linePartsToInlines = go
|
|||
isItalic (RoffStr (_,f)) = Italic `S.member` f
|
||||
isItalic _ = False
|
||||
|
||||
linePartsToString :: [LinePart] -> String
|
||||
linePartsToString = mconcat . map go
|
||||
where
|
||||
go (RoffStr (s, _)) = s
|
||||
go _ = mempty
|
||||
|
||||
parsePara :: PandocMonad m => ManParser m Blocks
|
||||
parsePara = para . trimInlines <$> parseInlines
|
||||
|
||||
|
|
Loading…
Reference in a new issue