Fix loose parser not making sure endOfInput is reached; add two families of operators and simplify the «Show» instance with a dedicated function to allow deleting lines of uninteresting code

This commit is contained in:
Tissevert 2020-02-06 16:54:27 +01:00
parent 3f6b0651f3
commit 57996749c6

View file

@ -13,7 +13,8 @@ import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (MonadState, evalStateT, get, modify, put) import Control.Monad.State (MonadState, evalStateT, get, modify, put)
import Data.Attoparsec.ByteString.Char8 (choice, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import Data.ByteString.Char8 (pack) import Data.ByteString.Char8 (pack, unpack)
import Data.Char (toLower)
import Data.Map ((!), (!?), Map) import Data.Map ((!), (!?), Map)
import qualified Data.Map as Map (fromList) import qualified Data.Map as Map (fromList)
import PDF.Font (Font, FontSet, emptyFont) import PDF.Font (Font, FontSet, emptyFont)
@ -21,22 +22,43 @@ import PDF.Object (
DirectObject(..), StringObject(..) DirectObject(..), StringObject(..)
, array, blank, name, regular, stringObject, toByteString , array, blank, name, regular, stringObject, toByteString
) )
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll1) import PDF.Parser (MonadParser(..), (<?>), Parser, evalParser)
data StateOperator = data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
deriving (Bounded, Enum) deriving (Bounded, Enum, Show)
data PathOperator =
M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
| S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
| W | Wstar -- clipping path
deriving (Bounded, Enum, Show)
data ColorOperator =
CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
deriving (Bounded, Enum, Show)
data TextOperator = data TextOperator =
Td | TD | Tm | Tstar -- text positioning Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing | TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum) deriving (Bounded, Enum, Show)
data Argument = Raw ByteString | Typed DirectObject deriving Show data Argument = Raw ByteString | Typed DirectObject deriving Show
type Call a = (a, [Argument]) type Call a = (a, [Argument])
type Operator a = (Bounded a, Enum a, Show a)
code :: Operator a => a -> ByteString
code = pack . expand . show
where
expand "" = ""
expand (c:'_':s) = toLower c : expand s
expand ('s':'t':'a':'r':s) = '*' : expand s
expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
expand (c:s) = c : expand s
{-
instance Show StateOperator where instance Show StateOperator where
show Cm = "cm" show Cm = "cm"
show W = "w" show W_ = "w"
show J = "J" show J = "J"
show J_ = "j" show J_ = "j"
show M = "M" show M = "M"
@ -45,6 +67,35 @@ instance Show StateOperator where
show I = "i" show I = "i"
show Gs = "gs" show Gs = "gs"
instance Show PathOperator where
show M_ = "m"
show L_ = "l"
show C_ = "c"
show V_ = "v"
show Y_ = "y"
show H_
("m", (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
, ("l", (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
, ("c", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("v", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("y", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("h", (L_, \l -> case l of [] -> True ; _ -> False))
, ("re", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
, ("S", (L_, \l -> case l of [] -> True ; _ -> False))
, ("s", (L_, \l -> case l of [] -> True ; _ -> False))
, ("f", (L_, \l -> case l of [] -> True ; _ -> False))
, ("F", (L_, \l -> case l of [] -> True ; _ -> False))
, ("F*", (L_, \l -> case l of [] -> True ; _ -> False))
, ("B", (L_, \l -> case l of [] -> True ; _ -> False))
, ("B*", (L_, \l -> case l of [] -> True ; _ -> False))
, ("b", (L_, \l -> case l of [] -> True ; _ -> False))
, ("b*", (L_, \l -> case l of [] -> True ; _ -> False))
, ("n", (L_, \l -> case l of [] -> True ; _ -> False))
, ("W", (L_, \l -> case l of [] -> True ; _ -> False))
, ("W*", (L_, \l -> case l of [] -> True ; _ -> False))
instance Show ColorOperator where
instance Show TextOperator where instance Show TextOperator where
show Td = "Td" show Td = "Td"
show TD = "TD" show TD = "TD"
@ -61,18 +112,58 @@ instance Show TextOperator where
show Tf = "Tf" show Tf = "Tf"
show Tr = "Tr" show Tr = "Tr"
show Ts = "Ts" show Ts = "Ts"
-}
stateOperator :: OperatorTable StateOperator stateOperator :: OperatorTable StateOperator
stateOperator = Map.fromList [ stateOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
("cm", (Cm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)) (C_m, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, ("w", (W, \l -> case l of [Raw _] -> True ; _ -> False)) , (W_, \l -> case l of [Raw _] -> True ; _ -> False)
, ("J", (J, \l -> case l of [Raw _] -> True ; _ -> False)) , (J, \l -> case l of [Raw _] -> True ; _ -> False)
, ("j", (J_, \l -> case l of [Raw _] -> True ; _ -> False)) , (J_, \l -> case l of [Raw _] -> True ; _ -> False)
, ("M", (M, \l -> case l of [Raw _] -> True ; _ -> False)) , (M, \l -> case l of [Raw _] -> True ; _ -> False)
, ("d", (D, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)) , (D_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, ("ri", (Ri, \l -> case l of [Raw _] -> True ; _ -> False)) , (R_i, \l -> case l of [Raw _] -> True ; _ -> False)
, ("i", (I, \l -> case l of [Raw _] -> True ; _ -> False)) , (I_, \l -> case l of [Raw _] -> True ; _ -> False)
, ("gs", (Gs, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)) , (G_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
]
pathOperator :: OperatorTable PathOperator
pathOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
(M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (C_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (V_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (Y_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (H_, \l -> case l of [] -> True ; _ -> False)
, (R_e, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (S, \l -> case l of [] -> True ; _ -> False)
, (S_, \l -> case l of [] -> True ; _ -> False)
, (F_, \l -> case l of [] -> True ; _ -> False)
, (F, \l -> case l of [] -> True ; _ -> False)
, (Fstar, \l -> case l of [] -> True ; _ -> False)
, (B, \l -> case l of [] -> True ; _ -> False)
, (Bstar, \l -> case l of [] -> True ; _ -> False)
, (B_, \l -> case l of [] -> True ; _ -> False)
, (B_star, \l -> case l of [] -> True ; _ -> False)
, (N_, \l -> case l of [] -> True ; _ -> False)
, (W, \l -> case l of [] -> True ; _ -> False)
, (Wstar, \l -> case l of [] -> True ; _ -> False)
]
colorOperator :: OperatorTable ColorOperator
colorOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
(CS, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
, (C_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
, (SC, \_ -> True)
, (SCN, \_ -> True)
, (S_c, \_ -> True)
, (S_cn, \_ -> True)
, (G, \l -> case l of [Raw _] -> True ; _ -> False)
, (G_, \l -> case l of [Raw _] -> True ; _ -> False)
, (RG, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
, (R_g, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
, (K, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
, (K_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
] ]
{- {-
@ -89,7 +180,7 @@ stateOperator _ = False
-} -}
textOperator :: OperatorTable TextOperator textOperator :: OperatorTable TextOperator
textOperator = Map.fromList $ (\(op, checker) -> (pack $ show op, (op, checker))) <$> [ textOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
(Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False) (Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False) , (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
, (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False) , (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
@ -127,7 +218,7 @@ textOperator _ = False
-} -}
type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m) type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
type Operator a = (Bounded a, Enum a, Show a) --type Operator a = (Bounded a, Enum a, Show a)
type OperatorTable a = Map ByteString (TypeChecker a) type OperatorTable a = Map ByteString (TypeChecker a)
type TypeChecker a = (a, [Argument] -> Bool) type TypeChecker a = (a, [Argument] -> Bool)
@ -146,14 +237,14 @@ callChunk table =
Nothing -> return . Right $ Raw chunk Nothing -> return . Right $ Raw chunk
Just typeChecker -> return $ Left typeChecker Just typeChecker -> return $ Left typeChecker
stackParser :: (ArgumentStackParser m, Show a) => OperatorTable a -> m (Call a) stackParser :: (ArgumentStackParser m, Operator a) => OperatorTable a -> m (Call a)
stackParser table = either popCall push =<< (callChunk table) stackParser table = either popCall push =<< (callChunk table)
where where
push arg = modify (arg:) >> stackParser table push arg = modify (arg:) >> stackParser table
popCall (operator, predicate) = do popCall (operator, predicate) = do
arguments <- reverse <$> get arguments <- reverse <$> get
let call = (operator, arguments) let call = (operator, arguments)
if predicate arguments then return call else fail (show call) if predicate arguments then return call else fail (unpack $ code operator)
a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a) a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
a table = evalStateT (stackParser table) [] a table = evalStateT (stackParser table) []
@ -173,24 +264,29 @@ stringArg = Typed . StringObject <$> stringObject <* blank
type ParserWithFont = ReaderT FontSet (Parser Font) type ParserWithFont = ReaderT FontSet (Parser Font)
pageContents :: FontSet -> ByteString -> Either String [ByteString] pageContents :: FontSet -> ByteString -> Either String [ByteString]
pageContents font input = pageContents fontSet input =
evalParser (runReaderT page font) emptyFont input evalParser (runReaderT (page) fontSet) emptyFont input
several :: MonadParser m => m [a] -> m [a]
several p = concat <$> (p `sepBy` blank)
page :: ParserWithFont [ByteString] page :: ParserWithFont [ByteString]
page = graphicState <|> text <?> "Text page contents" page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
graphicState :: ParserWithFont [ByteString] graphicState :: ParserWithFont [ByteString]
graphicState = graphicState =
(string "q" *> blank *> insideQ <* blank <* string "Q") <?> "Graphic state" string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
where where
insideQ = concat <$> ((command <|> page) `sepBy` blank ) insideQ = several (command <|> graphicState <|> text)
command = a stateOperator *> return [] ignore x = a x *> return []
command =
ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
text :: ParserWithFont [ByteString] text :: ParserWithFont [ByteString]
text = text =
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators" string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
where where
commands = concat <$> (a textOperator >>= runOperator) `sepBy` blank commands = several (a textOperator >>= runOperator)
runOperator :: Call TextOperator -> ParserWithFont [ByteString] runOperator :: Call TextOperator -> ParserWithFont [ByteString]
runOperator (Tf, [Typed (NameObject fontName), _]) = runOperator (Tf, [Typed (NameObject fontName), _]) =