T.P.Readers.Groff: improve LinePart.

Separate font change and font size change tokens.

With this change, emphasis no longer works. This needs to
be implemented in the parser, not the lexer.
This commit is contained in:
John MacFarlane 2018-10-24 22:04:15 -07:00
parent 6c71100fcf
commit e4726518af
2 changed files with 120 additions and 138 deletions

View file

@ -33,7 +33,6 @@ Tokenizer for groff formats (man, ms).
-} -}
module Text.Pandoc.Readers.Groff module Text.Pandoc.Readers.Groff
( FontKind(..) ( FontKind(..)
, Font
, MacroKind , MacroKind
, LinePart(..) , LinePart(..)
, Arg , Arg
@ -54,8 +53,6 @@ import Text.Pandoc.Class
import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace) import Data.Char (isHexDigit, chr, ord, isAscii, isAlphaNum, isSpace)
import Data.Default (Default) import Data.Default (Default)
import qualified Data.Map as M import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.List (intercalate, isSuffixOf) import Data.List (intercalate, isSuffixOf)
import qualified Data.Text as T import qualified Data.Text as T
import Text.Pandoc.Builder as B import Text.Pandoc.Builder as B
@ -79,9 +76,9 @@ data FontKind = Bold | Italic | Monospace | Regular deriving (Show, Eq, Ord)
type MacroKind = String type MacroKind = String
type Font = Set FontKind data LinePart = RoffStr String
| Font [FontKind]
data LinePart = RoffStr (String, Font) | FontSize Int
| MacroArg Int | MacroArg Int
deriving Show deriving Show
@ -100,20 +97,19 @@ newtype ManTokens = ManTokens { unManTokens :: Seq.Seq ManToken }
singleTok :: ManToken -> ManTokens singleTok :: ManToken -> ManTokens
singleTok t = ManTokens (Seq.singleton t) singleTok t = ManTokens (Seq.singleton t)
data RoffState = RoffState { fontKind :: Font data RoffState = RoffState { customMacros :: M.Map String ManTokens
, customMacros :: M.Map String ManTokens
} deriving Show } deriving Show
instance Default RoffState where instance Default RoffState where
def = RoffState { customMacros = M.fromList def = RoffState { customMacros = M.fromList
$ map (\(n, s) -> $ map (\(n, s) ->
(n, singleTok (n, singleTok
(MLine [RoffStr (s, mempty)]))) (MLine [RoffStr s])))
[ ("Tm", "\x2122") [ ("Tm", "\x2122")
, ("lq", "\x201C") , ("lq", "\x201C")
, ("rq", "\x201D") , ("rq", "\x201D")
, ("R", "\x00AE") ] , ("R", "\x00AE") ]
, fontKind = S.singleton Regular } }
type ManLexer m = ParserT [Char] RoffState m type ManLexer m = ParserT [Char] RoffState m
@ -135,15 +131,16 @@ combiningAccentsMap :: M.Map String Char
combiningAccentsMap = combiningAccentsMap =
M.fromList $ map (\(x,y) -> (y,x)) combiningAccents M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
escapeLexer :: PandocMonad m => ManLexer m String escape :: PandocMonad m => ManLexer m [LinePart]
escapeLexer = try $ do escape = do
char '\\' char '\\'
c <- noneOf ['*','$'] -- see escStar, macroArg c <- anyChar
case c of case c of
'(' -> twoCharGlyph
'[' -> bracketedGlyph
'f' -> escFont 'f' -> escFont
's' -> escFontSize 's' -> escFontSize
'*' -> escStar
'(' -> twoCharGlyph
'[' -> bracketedGlyph
'"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
'#' -> mempty <$ manyTill anyChar newline '#' -> mempty <$ manyTill anyChar newline
'%' -> return mempty '%' -> return mempty
@ -154,27 +151,27 @@ escapeLexer = try $ do
':' -> return mempty ':' -> return mempty
'0' -> return mempty '0' -> return mempty
'c' -> return mempty 'c' -> return mempty
'-' -> return "-" '-' -> return [RoffStr "-"]
'_' -> return "_" '_' -> return [RoffStr "_"]
' ' -> return " " ' ' -> return [RoffStr " "]
'\\' -> return "\\" '\\' -> return [RoffStr "\\"]
't' -> return "\t" 't' -> return [RoffStr "\t"]
'e' -> return "\\" 'e' -> return [RoffStr "\\"]
'`' -> return "`" '`' -> return [RoffStr "`"]
'^' -> return " " '^' -> return [RoffStr " "]
'|' -> return " " '|' -> return [RoffStr " "]
'\'' -> return "`" '\'' -> return [RoffStr "`"]
'.' -> return "`" '.' -> return [RoffStr "`"]
'~' -> return "\160" -- nonbreaking space '~' -> return [RoffStr "\160"] -- nonbreaking space
_ -> escUnknown ['\\',c] "\xFFFD" _ -> escUnknown ['\\',c]
where where
twoCharGlyph = do twoCharGlyph = do
cs <- count 2 anyChar cs <- count 2 anyChar
case M.lookup cs characterCodeMap of case M.lookup cs characterCodeMap of
Just c -> return [c] Just c -> return [RoffStr [c]]
Nothing -> escUnknown ('\\':'(':cs) "\xFFFD" Nothing -> escUnknown ('\\':'(':cs)
bracketedGlyph = unicodeGlyph <|> charGlyph bracketedGlyph = unicodeGlyph <|> charGlyph
@ -184,7 +181,7 @@ escapeLexer = try $ do
[] -> mzero [] -> mzero
[s] -> case M.lookup s characterCodeMap of [s] -> case M.lookup s characterCodeMap of
Nothing -> mzero Nothing -> mzero
Just c -> return [c] Just c -> return [RoffStr [c]]
(s:ss) -> do (s:ss) -> do
basechar <- case M.lookup cs characterCodeMap of basechar <- case M.lookup cs characterCodeMap of
Nothing -> Nothing ->
@ -200,10 +197,12 @@ escapeLexer = try $ do
case M.lookup a combiningAccentsMap of case M.lookup a combiningAccentsMap of
Just x -> addAccents as (x:xs) Just x -> addAccents as (x:xs)
Nothing -> mzero Nothing -> mzero
addAccents ss [basechar]) addAccents ss [basechar] >>= \xs -> return [RoffStr xs])
<|> escUnknown ("\\[" ++ cs ++ "]") "\xFFFD" <|> escUnknown ("\\[" ++ cs ++ "]")
unicodeGlyph = try $ ucharCode `sepBy1` (char '_') <* char ']' unicodeGlyph = try $ do
xs <- ucharCode `sepBy1` (char '_') <* char ']'
return [RoffStr xs]
ucharCode = try $ do ucharCode = try $ do
char 'u' char 'u'
@ -214,53 +213,66 @@ escapeLexer = try $ do
Nothing -> mzero Nothing -> mzero
Just c -> return c Just c -> return c
-- \s-1 \s0 -- we ignore these escUnknown :: PandocMonad m => String -> ManLexer m [LinePart]
escFontSize :: PandocMonad m => ManLexer m String escUnknown s = do
escFontSize = do
pos <- getPosition
pm <- option "" $ count 1 (oneOf "+-")
ds <- many1 digit
report $ SkippedContent ("\\s" ++ pm ++ ds) pos
return mempty
escFont :: PandocMonad m => ManLexer m String
escFont = do
font <- choice
[ S.singleton <$> letterFontKind
, char '(' >> anyChar >> anyChar >> return (S.singleton Regular)
, char 'S' >> return (S.singleton Regular)
, try lettersFont
, digit >> return (S.singleton Regular)
]
modifyState (\r -> r {fontKind = font})
return mempty
lettersFont :: PandocMonad m => ManLexer m Font
lettersFont = do
char '['
fs <- many letterFontKind
skipMany letter
char ']'
return $ S.fromList fs
letterFontKind :: PandocMonad m => ManLexer m FontKind
letterFontKind = choice [
oneOf ['B','b'] >> return Bold
, oneOf ['I','i'] >> return Italic
, oneOf ['C','c'] >> return Monospace
, oneOf ['P','p','R','r'] >> return Regular
]
escUnknown :: PandocMonad m => String -> a -> ManLexer m a
escUnknown s x = do
pos <- getPosition pos <- getPosition
report $ SkippedContent ("Unknown escape sequence " ++ s) pos report $ SkippedContent ("Unknown escape sequence " ++ s) pos
return x return [RoffStr "\xFFFD"]
currentFont :: PandocMonad m => ManLexer m Font -- \s-1 \s0
currentFont = fontKind <$> getState escFontSize :: PandocMonad m => ManLexer m [LinePart]
escFontSize = do
let sign = option "" $ count 1 (oneOf "+-")
let toFontSize xs =
case safeRead xs of
Nothing -> mzero
Just n -> return [FontSize n]
choice
[ do char '('
s <- sign
ds <- count 2 digit
toFontSize (s ++ ds)
, do char '['
s <- sign
ds <- many1 digit
char ']'
toFontSize (s ++ ds)
, do s <- sign
ds <- count 1 digit
toFontSize (s ++ ds)
]
-- separate function from lexMacro since real man files sometimes do not follow the rules escFont :: PandocMonad m => ManLexer m [LinePart]
escFont = do
font <- choice
[ char 'S' >> return [Regular]
, digit >> return [Regular]
, (:[]) <$> letterFontKind
, char '(' >> anyChar >> anyChar >> return [Regular]
, lettersFont
, digit >> return [Regular]
]
return [Font font]
lettersFont :: PandocMonad m => ManLexer m [FontKind]
lettersFont = try $ do
char '['
fs <- many letterFontKind
skipMany letter
char ']'
return fs
letterFontKind :: PandocMonad m => ManLexer m FontKind
letterFontKind = choice [
oneOf ['B','b'] >> return Bold
, oneOf ['I','i'] >> return Italic
, oneOf ['C','c'] >> return Monospace
, oneOf ['P','p','R','r'] >> return Regular
]
-- separate function from lexMacro since real man files sometimes do not
-- follow the rules
lexComment :: PandocMonad m => ManLexer m ManTokens lexComment :: PandocMonad m => ManLexer m ManTokens
lexComment = do lexComment = do
try $ string ".\\\"" try $ string ".\\\""
@ -310,11 +322,11 @@ lexTable = do
lexConditional :: PandocMonad m => ManLexer m ManTokens lexConditional :: PandocMonad m => ManLexer m ManTokens
lexConditional = do lexConditional = do
skipMany spacetab skipMany spacetab
parseNCond <|> skipConditional lexNCond <|> skipConditional
-- n means nroff mode -- n means nroff mode
parseNCond :: PandocMonad m => ManLexer m ManTokens lexNCond :: PandocMonad m => ManLexer m ManTokens
parseNCond = do lexNCond = do
char '\n' char '\n'
many1 spacetab many1 spacetab
lexGroup <|> manToken lexGroup <|> manToken
@ -355,11 +367,11 @@ resolveMacro macroName args pos = do
case M.lookup macroName macros of case M.lookup macroName macros of
Nothing -> return $ singleTok $ MMacro macroName args pos Nothing -> return $ singleTok $ MMacro macroName args pos
Just ts -> do Just ts -> do
let fillLP (RoffStr (x,y)) zs = RoffStr (x,y) : zs let fillLP (MacroArg i) zs =
fillLP (MacroArg i) zs =
case drop (i - 1) args of case drop (i - 1) args of
[] -> zs [] -> zs
(ys:_) -> ys ++ zs (ys:_) -> ys ++ zs
fillLP z zs = z : zs
let fillMacroArg (MLine lineparts) = let fillMacroArg (MLine lineparts) =
MLine (foldr fillLP [] lineparts) MLine (foldr fillLP [] lineparts)
fillMacroArg x = x fillMacroArg x = x
@ -370,7 +382,7 @@ lexStringDef args = do -- string definition
case args of case args of
[] -> fail "No argument to .ds" [] -> fail "No argument to .ds"
(x:ys) -> do (x:ys) -> do
let ts = singleTok $ MLine (intercalate [RoffStr (" ", mempty)] ys) let ts = singleTok $ MLine (intercalate [RoffStr " " ] ys)
let stringName = linePartsToString x let stringName = linePartsToString x
modifyState $ \st -> modifyState $ \st ->
st{ customMacros = M.insert stringName ts (customMacros st) } st{ customMacros = M.insert stringName ts (customMacros st) }
@ -413,21 +425,16 @@ lexArgs = do
plainArg :: PandocMonad m => ManLexer m [LinePart] plainArg :: PandocMonad m => ManLexer m [LinePart]
plainArg = do plainArg = do
skipMany spacetab skipMany spacetab
mconcat <$> many1 mconcat <$> many1 (macroArg <|> escape <|> regularText <|> unescapedQuote)
(macroArg <|> esc <|> regularText <|> unescapedQuote <|> escStar)
where where
unescapedQuote = do unescapedQuote = char '"' >> return [RoffStr "\""]
char '"'
fonts <- currentFont
return [RoffStr ("\"", fonts)]
quotedArg :: PandocMonad m => ManLexer m [LinePart] quotedArg :: PandocMonad m => ManLexer m [LinePart]
quotedArg = do quotedArg = do
skipMany spacetab skipMany spacetab
char '"' char '"'
xs <- mconcat <$> xs <- mconcat <$>
many (macroArg <|> esc <|> escStar <|> regularText many (macroArg <|> escape <|> regularText
<|> spaceTabChar <|> escapedQuote) <|> spaceTabChar <|> escapedQuote)
char '"' char '"'
return xs return xs
@ -435,14 +442,11 @@ lexArgs = do
escapedQuote = try $ do escapedQuote = try $ do
char '"' char '"'
char '"' char '"'
fonts <- currentFont return [RoffStr "\""]
return [RoffStr ("\"", fonts)]
escStar :: PandocMonad m => ManLexer m [LinePart] escStar :: PandocMonad m => ManLexer m [LinePart]
escStar = try $ do escStar = try $ do
pos <- getPosition pos <- getPosition
char '\\'
char '*'
c <- anyChar c <- anyChar
case c of case c of
'(' -> do '(' -> do
@ -474,11 +478,11 @@ lexLine = do
where -- return empty line if we only have empty strings; where -- return empty line if we only have empty strings;
-- this can happen if the line just contains \f[C], for example. -- this can happen if the line just contains \f[C], for example.
go [] = return mempty go [] = return mempty
go (RoffStr ("",_):xs) = go xs go (RoffStr "" : xs) = go xs
go xs = return $ singleTok $ MLine xs go xs = return $ singleTok $ MLine xs
linePart :: PandocMonad m => ManLexer m [LinePart] linePart :: PandocMonad m => ManLexer m [LinePart]
linePart = macroArg <|> esc <|> escStar <|> linePart = macroArg <|> escape <|>
regularText <|> quoteChar <|> spaceTabChar regularText <|> quoteChar <|> spaceTabChar
macroArg :: PandocMonad m => ManLexer m [LinePart] macroArg :: PandocMonad m => ManLexer m [LinePart]
@ -487,29 +491,20 @@ macroArg = try $ do
x <- digit x <- digit
return [MacroArg $ ord x - ord '0'] return [MacroArg $ ord x - ord '0']
esc :: PandocMonad m => ManLexer m [LinePart]
esc = do
s <- escapeLexer
font <- currentFont
return [RoffStr (s, font)]
regularText :: PandocMonad m => ManLexer m [LinePart] regularText :: PandocMonad m => ManLexer m [LinePart]
regularText = do regularText = do
s <- many1 $ noneOf "\n\r\t \\\"" s <- many1 $ noneOf "\n\r\t \\\""
font <- currentFont return [RoffStr s]
return [RoffStr (s, font)]
quoteChar :: PandocMonad m => ManLexer m [LinePart] quoteChar :: PandocMonad m => ManLexer m [LinePart]
quoteChar = do quoteChar = do
char '"' char '"'
font <- currentFont return [RoffStr "\""]
return [RoffStr ("\"", font)]
spaceTabChar :: PandocMonad m => ManLexer m [LinePart] spaceTabChar :: PandocMonad m => ManLexer m [LinePart]
spaceTabChar = do spaceTabChar = do
c <- spacetab c <- spacetab
font <- currentFont return [RoffStr [c]]
return [RoffStr ([c], font)]
lexEmptyLine :: PandocMonad m => ManLexer m ManTokens lexEmptyLine :: PandocMonad m => ManLexer m ManTokens
lexEmptyLine = char '\n' >> return (singleTok MEmptyLine) lexEmptyLine = char '\n' >> return (singleTok MEmptyLine)
@ -520,5 +515,5 @@ manToken = lexComment <|> lexMacro <|> lexLine <|> lexEmptyLine
linePartsToString :: [LinePart] -> String linePartsToString :: [LinePart] -> String
linePartsToString = mconcat . map go linePartsToString = mconcat . map go
where where
go (RoffStr (s, _)) = s go (RoffStr s) = s
go _ = mempty go _ = mempty

View file

@ -39,7 +39,6 @@ import Control.Monad (liftM, mzero, guard)
import Control.Monad.Except (throwError) import Control.Monad.Except (throwError)
import Text.Pandoc.Class (PandocMonad(..), report) import Text.Pandoc.Class (PandocMonad(..), report)
import Data.Maybe (catMaybes) import Data.Maybe (catMaybes)
import qualified Data.Set as S
import Data.List (intersperse, intercalate) import Data.List (intersperse, intercalate)
import qualified Data.Text as T import qualified Data.Text as T
import Text.Pandoc.Builder as B import Text.Pandoc.Builder as B
@ -174,34 +173,16 @@ parseTitle = do
return mempty return mempty
linePartsToInlines :: [LinePart] -> Inlines linePartsToInlines :: [LinePart] -> Inlines
linePartsToInlines = go linePartsToInlines = go []
where where
go [] = mempty go :: [[FontKind]] -> [LinePart] -> Inlines
go (MacroArg _:xs) = go xs -- shouldn't happen go _ [] = mempty
go xs@(RoffStr{} : _) = go fs (MacroArg _:xs) = go fs xs -- shouldn't happen
if lb > 0 && lb >= li go fs (RoffStr s : xs) = text s <> go fs xs
then strong (go (removeFont Bold bolds)) <> go (drop lb xs) go (_:fs) (Font [] : xs) = go fs xs -- return to previous font
else if li > 0 go fs (Font _newfonts : xs) = go fs xs
then emph (go (removeFont Italic italics)) <> go (drop li xs) go fonts (FontSize _fs : xs) = go fonts xs
else text (linePartsToString regulars) <> go (drop lr xs)
where
(lb, li, lr) = (length bolds, length italics, length regulars)
removeFont font = map (removeFont' font)
removeFont' font (RoffStr (s,f)) = RoffStr (s, S.delete font f)
removeFont' _ x = x
bolds = takeWhile isBold xs
italics = takeWhile isItalic xs
regulars = takeWhile (\x -> not (isBold x || isItalic x)) xs
isBold (RoffStr (_,f)) = Bold `S.member` f
isBold _ = False
isItalic (RoffStr (_,f)) = Italic `S.member` f
isItalic _ = False
parsePara :: PandocMonad m => ManParser m Blocks parsePara :: PandocMonad m => ManParser m Blocks
parsePara = para . trimInlines <$> parseInlines parsePara = para . trimInlines <$> parseInlines
@ -289,7 +270,13 @@ parseCodeBlock = try $ do
where where
extractText :: ManToken -> Maybe String extractText :: ManToken -> Maybe String
extractText (MLine ss) = Just $ linePartsToString ss extractText (MLine ss)
| not (null ss)
, all isFontToken ss = Nothing
| otherwise = Just $ linePartsToString ss
where isFontToken (FontSize{}) = True
isFontToken (Font{}) = True
isFontToken _ = False
extractText MEmptyLine = Just "" extractText MEmptyLine = Just ""
-- string are intercalated with '\n', this prevents double '\n' -- string are intercalated with '\n', this prevents double '\n'
extractText _ = Nothing extractText _ = Nothing