T.P.Readers.Groff: improve LinePart.
Separate font change and font size change tokens. With this change, emphasis no longer works. This needs to be implemented in the parser, not the lexer.
This commit is contained in:
parent
6c71100fcf
commit
e4726518af
2 changed files with 120 additions and 138 deletions
|
@ -33,7 +33,6 @@ Tokenizer for groff formats (man, ms).
|
|||
-}
|
||||
module Text.Pandoc.Readers.Groff
|
||||
( FontKind(..)
|
||||
, Font
|
||||
, MacroKind
|
||||
, LinePart(..)
|
||||
, Arg
|
||||
|
@ -54,8 +53,6 @@ import Text.Pandoc.Class
|
|||
import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
|
||||
import Data.Default (Default)
|
||||
import qualified Data.Map as M
|
||||
import Data.Set (Set)
|
||||
import qualified Data.Set as S
|
||||
import Data.List (intercalate, isSuffixOf)
|
||||
import qualified Data.Text as T
|
||||
import Text.Pandoc.Builder as B
|
||||
|
@ -79,9 +76,9 @@ data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
|
|||
|
||||
type MacroKind = String
|
||||
|
||||
type Font = Set FontKind
|
||||
|
||||
data LinePart = RoffStr (String, Font)
|
||||
data LinePart = RoffStr String
|
||||
| Font [FontKind]
|
||||
| FontSize Int
|
||||
| MacroArg Int
|
||||
deriving Show
|
||||
|
||||
|
@ -100,20 +97,19 @@ newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
|
|||
singleTok :: ManToken -> ManTokens
|
||||
singleTok t = ManTokens (Seq.singleton t)
|
||||
|
||||
data RoffState = RoffState { fontKind :: Font
|
||||
, customMacros :: M.Map String ManTokens
|
||||
data RoffState = RoffState { customMacros :: M.Map String ManTokens
|
||||
} deriving Show
|
||||
|
||||
instance Default RoffState where
|
||||
def = RoffState { customMacros = M.fromList
|
||||
$ map (\(n, s) ->
|
||||
(n, singleTok
|
||||
(MLine [RoffStr (s, mempty)])))
|
||||
(MLine [RoffStr s])))
|
||||
[ ("Tm", "\x2122")
|
||||
, ("lq", "\x201C")
|
||||
, ("rq", "\x201D")
|
||||
, ("R", "\x00AE") ]
|
||||
, fontKind = S.singleton Regular }
|
||||
}
|
||||
|
||||
type ManLexer m = ParserT [Char] RoffState m
|
||||
|
||||
|
@ -135,15 +131,16 @@ combiningAccentsMap :: M.Map String Char
|
|||
combiningAccentsMap =
|
||||
M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
|
||||
|
||||
escapeLexer :: PandocMonad m => ManLexer m String
|
||||
escapeLexer = try $ do
|
||||
escape :: PandocMonad m => ManLexer m [LinePart]
|
||||
escape = do
|
||||
char '\\'
|
||||
c <- noneOf ['*','$'] -- see escStar, macroArg
|
||||
c <- anyChar
|
||||
case c of
|
||||
'(' -> twoCharGlyph
|
||||
'[' -> bracketedGlyph
|
||||
'f' -> escFont
|
||||
's' -> escFontSize
|
||||
'*' -> escStar
|
||||
'(' -> twoCharGlyph
|
||||
'[' -> bracketedGlyph
|
||||
'"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
|
||||
'#' -> mempty <$ manyTill anyChar newline
|
||||
'%' -> return mempty
|
||||
|
@ -154,27 +151,27 @@ escapeLexer = try $ do
|
|||
':' -> return mempty
|
||||
'0' -> return mempty
|
||||
'c' -> return mempty
|
||||
'-' -> return "-"
|
||||
'_' -> return "_"
|
||||
' ' -> return " "
|
||||
'\\' -> return "\\"
|
||||
't' -> return "\t"
|
||||
'e' -> return "\\"
|
||||
'`' -> return "`"
|
||||
'^' -> return " "
|
||||
'|' -> return " "
|
||||
'\'' -> return "`"
|
||||
'.' -> return "`"
|
||||
'~' -> return "\160" -- nonbreaking space
|
||||
_ -> escUnknown ['\\',c] "\xFFFD"
|
||||
'-' -> return [RoffStr "-"]
|
||||
'_' -> return [RoffStr "_"]
|
||||
' ' -> return [RoffStr " "]
|
||||
'\\' -> return [RoffStr "\\"]
|
||||
't' -> return [RoffStr "\t"]
|
||||
'e' -> return [RoffStr "\\"]
|
||||
'`' -> return [RoffStr "`"]
|
||||
'^' -> return [RoffStr " "]
|
||||
'|' -> return [RoffStr " "]
|
||||
'\'' -> return [RoffStr "`"]
|
||||
'.' -> return [RoffStr "`"]
|
||||
'~' -> return [RoffStr "\160"] -- nonbreaking space
|
||||
_ -> escUnknown ['\\',c]
|
||||
|
||||
where
|
||||
|
||||
twoCharGlyph = do
|
||||
cs <- count 2 anyChar
|
||||
case M.lookup cs characterCodeMap of
|
||||
Just c -> return [c]
|
||||
Nothing -> escUnknown ('\\':'(':cs) "\xFFFD"
|
||||
Just c -> return [RoffStr [c]]
|
||||
Nothing -> escUnknown ('\\':'(':cs)
|
||||
|
||||
bracketedGlyph = unicodeGlyph <|> charGlyph
|
||||
|
||||
|
@ -184,7 +181,7 @@ escapeLexer = try $ do
|
|||
[] -> mzero
|
||||
[s] -> case M.lookup s characterCodeMap of
|
||||
Nothing -> mzero
|
||||
Just c -> return [c]
|
||||
Just c -> return [RoffStr [c]]
|
||||
(s:ss) -> do
|
||||
basechar <- case M.lookup cs characterCodeMap of
|
||||
Nothing ->
|
||||
|
@ -200,10 +197,12 @@ escapeLexer = try $ do
|
|||
case M.lookup a combiningAccentsMap of
|
||||
Just x -> addAccents as (x:xs)
|
||||
Nothing -> mzero
|
||||
addAccents ss [basechar])
|
||||
<|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD"
|
||||
addAccents ss [basechar] >>= \xs -> return [RoffStr xs])
|
||||
<|> escUnknown ("\\[" ++ cs ++ "]")
|
||||
|
||||
unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']'
|
||||
unicodeGlyph = try $ do
|
||||
xs <- ucharCode `sepBy1` (char '_') <* char ']'
|
||||
return [RoffStr xs]
|
||||
|
||||
ucharCode = try $ do
|
||||
char 'u'
|
||||
|
@ -214,53 +213,66 @@ escapeLexer = try $ do
|
|||
Nothing -> mzero
|
||||
Just c -> return c
|
||||
|
||||
-- \s-1 \s0 -- we ignore these
|
||||
escFontSize :: PandocMonad m => ManLexer m String
|
||||
escFontSize = do
|
||||
pos <- getPosition
|
||||
pm <- option "" $ count 1 (oneOf "+-")
|
||||
ds <- many1 digit
|
||||
report $ SkippedContent ("\\s" ++ pm ++ ds) pos
|
||||
return mempty
|
||||
|
||||
escFont :: PandocMonad m => ManLexer m String
|
||||
escFont = do
|
||||
font <- choice
|
||||
[ S.singleton <$> letterFontKind
|
||||
, char '(' >> anyChar >> anyChar >> return (S.singleton Regular)
|
||||
, char 'S' >> return (S.singleton Regular)
|
||||
, try lettersFont
|
||||
, digit >> return (S.singleton Regular)
|
||||
]
|
||||
modifyState (\r -> r {fontKind = font})
|
||||
return mempty
|
||||
|
||||
lettersFont :: PandocMonad m => ManLexer m Font
|
||||
lettersFont = do
|
||||
char '['
|
||||
fs <- many letterFontKind
|
||||
skipMany letter
|
||||
char ']'
|
||||
return $ S.fromList fs
|
||||
|
||||
letterFontKind :: PandocMonad m => ManLexer m FontKind
|
||||
letterFontKind = choice [
|
||||
oneOf ['B','b'] >> return Bold
|
||||
, oneOf ['I','i'] >> return Italic
|
||||
, oneOf ['C','c'] >> return Monospace
|
||||
, oneOf ['P','p','R','r'] >> return Regular
|
||||
]
|
||||
|
||||
escUnknown :: PandocMonad m => String -> a -> ManLexer m a
|
||||
escUnknown s x = do
|
||||
escUnknown :: PandocMonad m => String -> ManLexer m [LinePart]
|
||||
escUnknown s = do
|
||||
pos <- getPosition
|
||||
report $ SkippedContent ("Unknown escape sequence " ++ s) pos
|
||||
return x
|
||||
return [RoffStr "\xFFFD"]
|
||||
|
||||
currentFont :: PandocMonad m => ManLexer m Font
|
||||
currentFont = fontKind <$> getState
|
||||
-- \s-1 \s0
|
||||
escFontSize :: PandocMonad m => ManLexer m [LinePart]
|
||||
escFontSize = do
|
||||
let sign = option "" $ count 1 (oneOf "+-")
|
||||
let toFontSize xs =
|
||||
case safeRead xs of
|
||||
Nothing -> mzero
|
||||
Just n -> return [FontSize n]
|
||||
choice
|
||||
[ do char '('
|
||||
s <- sign
|
||||
ds <- count 2 digit
|
||||
toFontSize (s ++ ds)
|
||||
, do char '['
|
||||
s <- sign
|
||||
ds <- many1 digit
|
||||
char ']'
|
||||
toFontSize (s ++ ds)
|
||||
, do s <- sign
|
||||
ds <- count 1 digit
|
||||
toFontSize (s ++ ds)
|
||||
]
|
||||
|
||||
-- separate function from lexMacro since real man files sometimes do not follow the rules
|
||||
escFont :: PandocMonad m => ManLexer m [LinePart]
|
||||
escFont = do
|
||||
font <- choice
|
||||
[ char 'S' >> return [Regular]
|
||||
, digit >> return [Regular]
|
||||
, (:[]) <$> letterFontKind
|
||||
, char '(' >> anyChar >> anyChar >> return [Regular]
|
||||
, lettersFont
|
||||
, digit >> return [Regular]
|
||||
]
|
||||
return [Font font]
|
||||
|
||||
lettersFont :: PandocMonad m => ManLexer m [FontKind]
|
||||
lettersFont = try $ do
|
||||
char '['
|
||||
fs <- many letterFontKind
|
||||
skipMany letter
|
||||
char ']'
|
||||
return fs
|
||||
|
||||
letterFontKind :: PandocMonad m => ManLexer m FontKind
|
||||
letterFontKind = choice [
|
||||
oneOf ['B','b'] >> return Bold
|
||||
, oneOf ['I','i'] >> return Italic
|
||||
, oneOf ['C','c'] >> return Monospace
|
||||
, oneOf ['P','p','R','r'] >> return Regular
|
||||
]
|
||||
|
||||
|
||||
-- separate function from lexMacro since real man files sometimes do not
|
||||
-- follow the rules
|
||||
lexComment :: PandocMonad m => ManLexer m ManTokens
|
||||
lexComment = do
|
||||
try $ string ".\\\""
|
||||
|
@ -310,11 +322,11 @@ lexTable = do
|
|||
lexConditional :: PandocMonad m => ManLexer m ManTokens
|
||||
lexConditional = do
|
||||
skipMany spacetab
|
||||
parseNCond <|> skipConditional
|
||||
lexNCond <|> skipConditional
|
||||
|
||||
-- n means nroff mode
|
||||
parseNCond :: PandocMonad m => ManLexer m ManTokens
|
||||
parseNCond = do
|
||||
lexNCond :: PandocMonad m => ManLexer m ManTokens
|
||||
lexNCond = do
|
||||
char '\n'
|
||||
many1 spacetab
|
||||
lexGroup <|> manToken
|
||||
|
@ -355,11 +367,11 @@ resolveMacro macroName args pos = do
|
|||
case M.lookup macroName macros of
|
||||
Nothing -> return $ singleTok $ MMacro macroName args pos
|
||||
Just ts -> do
|
||||
let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs
|
||||
fillLP (MacroArg i) zs =
|
||||
let fillLP (MacroArg i) zs =
|
||||
case drop (i - 1) args of
|
||||
[] -> zs
|
||||
(ys:_) -> ys ++ zs
|
||||
fillLP z zs = z : zs
|
||||
let fillMacroArg (MLine lineparts) =
|
||||
MLine (foldr fillLP [] lineparts)
|
||||
fillMacroArg x = x
|
||||
|
@ -370,7 +382,7 @@ lexStringDef args = do -- string definition
|
|||
case args of
|
||||
[] -> fail "No argument to .ds"
|
||||
(x:ys) -> do
|
||||
let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys)
|
||||
let ts = singleTok $ MLine (intercalate [RoffStr " " ] ys)
|
||||
let stringName = linePartsToString x
|
||||
modifyState $ \st ->
|
||||
st{ customMacros = M.insert stringName ts (customMacros st) }
|
||||
|
@ -413,21 +425,16 @@ lexArgs = do
|
|||
plainArg :: PandocMonad m => ManLexer m [LinePart]
|
||||
plainArg = do
|
||||
skipMany spacetab
|
||||
mconcat <$> many1
|
||||
(macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar)
|
||||
mconcat <$> many1 (macroArg <|> escape <|> regularText <|> unescapedQuote)
|
||||
where
|
||||
unescapedQuote = do
|
||||
char '"'
|
||||
fonts <- currentFont
|
||||
return [RoffStr ("\"", fonts)]
|
||||
|
||||
unescapedQuote = char '"' >> return [RoffStr "\""]
|
||||
|
||||
quotedArg :: PandocMonad m => ManLexer m [LinePart]
|
||||
quotedArg = do
|
||||
skipMany spacetab
|
||||
char '"'
|
||||
xs <- mconcat <$>
|
||||
many (macroArg <|> esc <|> escStar <|> regularText
|
||||
many (macroArg <|> escape <|> regularText
|
||||
<|> spaceTabChar <|> escapedQuote)
|
||||
char '"'
|
||||
return xs
|
||||
|
@ -435,14 +442,11 @@ lexArgs = do
|
|||
escapedQuote = try $ do
|
||||
char '"'
|
||||
char '"'
|
||||
fonts <- currentFont
|
||||
return [RoffStr ("\"", fonts)]
|
||||
return [RoffStr "\""]
|
||||
|
||||
escStar :: PandocMonad m => ManLexer m [LinePart]
|
||||
escStar = try $ do
|
||||
pos <- getPosition
|
||||
char '\\'
|
||||
char '*'
|
||||
c <- anyChar
|
||||
case c of
|
||||
'(' -> do
|
||||
|
@ -474,11 +478,11 @@ lexLine = do
|
|||
where -- return empty line if we only have empty strings;
|
||||
-- this can happen if the line just contains \f[C], for example.
|
||||
go [] = return mempty
|
||||
go (RoffStr ("",_):xs) = go xs
|
||||
go (RoffStr "" : xs) = go xs
|
||||
go xs = return $ singleTok $ MLine xs
|
||||
|
||||
linePart :: PandocMonad m => ManLexer m [LinePart]
|
||||
linePart = macroArg <|> esc <|> escStar <|>
|
||||
linePart = macroArg <|> escape <|>
|
||||
regularText <|> quoteChar <|> spaceTabChar
|
||||
|
||||
macroArg :: PandocMonad m => ManLexer m [LinePart]
|
||||
|
@ -487,29 +491,20 @@ macroArg = try $ do
|
|||
x <- digit
|
||||
return [MacroArg $ ord x - ord '0']
|
||||
|
||||
esc :: PandocMonad m => ManLexer m [LinePart]
|
||||
esc = do
|
||||
s <- escapeLexer
|
||||
font <- currentFont
|
||||
return [RoffStr (s, font)]
|
||||
|
||||
regularText :: PandocMonad m => ManLexer m [LinePart]
|
||||
regularText = do
|
||||
s <- many1 $ noneOf "\n\r\t \\\""
|
||||
font <- currentFont
|
||||
return [RoffStr (s, font)]
|
||||
return [RoffStr s]
|
||||
|
||||
quoteChar :: PandocMonad m => ManLexer m [LinePart]
|
||||
quoteChar = do
|
||||
char '"'
|
||||
font <- currentFont
|
||||
return [RoffStr ("\"", font)]
|
||||
return [RoffStr "\""]
|
||||
|
||||
spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
|
||||
spaceTabChar = do
|
||||
c <- spacetab
|
||||
font <- currentFont
|
||||
return [RoffStr ([c], font)]
|
||||
return [RoffStr [c]]
|
||||
|
||||
lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
|
||||
lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
|
||||
|
@ -520,5 +515,5 @@ manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
|
|||
linePartsToString :: [LinePart] -> String
|
||||
linePartsToString = mconcat . map go
|
||||
where
|
||||
go (RoffStr (s, _)) = s
|
||||
go (RoffStr s) = s
|
||||
go _ = mempty
|
||||
|
|
|
@ -39,7 +39,6 @@ import Control.Monad (liftM, mzero, guard)
|
|||
import Control.Monad.Except (throwError)
|
||||
import Text.Pandoc.Class (PandocMonad(..), report)
|
||||
import Data.Maybe (catMaybes)
|
||||
import qualified Data.Set as S
|
||||
import Data.List (intersperse, intercalate)
|
||||
import qualified Data.Text as T
|
||||
import Text.Pandoc.Builder as B
|
||||
|
@ -174,34 +173,16 @@ parseTitle = do
|
|||
return mempty
|
||||
|
||||
linePartsToInlines :: [LinePart] -> Inlines
|
||||
linePartsToInlines = go
|
||||
linePartsToInlines = go []
|
||||
|
||||
where
|
||||
go [] = mempty
|
||||
go (MacroArg _:xs) = go xs -- shouldn't happen
|
||||
go xs@(RoffStr{} : _) =
|
||||
if lb > 0 && lb >= li
|
||||
then strong (go (removeFont Bold bolds)) <> go (drop lb xs)
|
||||
else if li > 0
|
||||
then emph (go (removeFont Italic italics)) <> go (drop li xs)
|
||||
else text (linePartsToString regulars) <> go (drop lr xs)
|
||||
|
||||
where
|
||||
(lb, li, lr) = (length bolds, length italics, length regulars)
|
||||
|
||||
removeFont font = map (removeFont' font)
|
||||
removeFont' font (RoffStr (s,f)) = RoffStr (s, S.delete font f)
|
||||
removeFont' _ x = x
|
||||
|
||||
bolds = takeWhile isBold xs
|
||||
italics = takeWhile isItalic xs
|
||||
regulars = takeWhile (\x -> not (isBold x || isItalic x)) xs
|
||||
|
||||
isBold (RoffStr (_,f)) = Bold `S.member` f
|
||||
isBold _ = False
|
||||
|
||||
isItalic (RoffStr (_,f)) = Italic `S.member` f
|
||||
isItalic _ = False
|
||||
go :: [[FontKind]] -> [LinePart] -> Inlines
|
||||
go _ [] = mempty
|
||||
go fs (MacroArg _:xs) = go fs xs -- shouldn't happen
|
||||
go fs (RoffStr s : xs) = text s <> go fs xs
|
||||
go (_:fs) (Font [] : xs) = go fs xs -- return to previous font
|
||||
go fs (Font _newfonts : xs) = go fs xs
|
||||
go fonts (FontSize _fs : xs) = go fonts xs
|
||||
|
||||
parsePara :: PandocMonad m => ManParser m Blocks
|
||||
parsePara = para . trimInlines <$> parseInlines
|
||||
|
@ -289,7 +270,13 @@ parseCodeBlock = try $ do
|
|||
where
|
||||
|
||||
extractText :: ManToken -> Maybe String
|
||||
extractText (MLine ss) = Just $ linePartsToString ss
|
||||
extractText (MLine ss)
|
||||
| not (null ss)
|
||||
, all isFontToken ss = Nothing
|
||||
| otherwise = Just $ linePartsToString ss
|
||||
where isFontToken (FontSize{}) = True
|
||||
isFontToken (Font{}) = True
|
||||
isFontToken _ = False
|
||||
extractText MEmptyLine = Just ""
|
||||
-- string are intercalated with '\n', this prevents double '\n'
|
||||
extractText _ = Nothing
|
||||
|
|
Loading…
Reference in a new issue