Fix loose parser not making sure endOfInput is reached; add two families of operators and simplify the «Show» instance with a dedicated function to allow deleting lines of uninteresting code
This commit is contained in:
parent
3f6b0651f3
commit
57996749c6
1 changed files with 123 additions and 27 deletions
150
src/PDF/Text.hs
150
src/PDF/Text.hs
|
@ -13,7 +13,8 @@ import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
||||||
import Control.Monad.State (MonadState, evalStateT, get, modify, put)
|
import Control.Monad.State (MonadState, evalStateT, get, modify, put)
|
||||||
import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
|
import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import Data.ByteString.Char8 (pack)
|
import Data.ByteString.Char8 (pack, unpack)
|
||||||
|
import Data.Char (toLower)
|
||||||
import Data.Map ((!), (!?), Map)
|
import Data.Map ((!), (!?), Map)
|
||||||
import qualified Data.Map as Map (fromList)
|
import qualified Data.Map as Map (fromList)
|
||||||
import PDF.Font (Font, FontSet, emptyFont)
|
import PDF.Font (Font, FontSet, emptyFont)
|
||||||
|
@ -21,22 +22,43 @@ import PDF.Object (
|
||||||
DirectObject(..), StringObject(..)
|
DirectObject(..), StringObject(..)
|
||||||
, array, blank, name, regular, stringObject, toByteString
|
, array, blank, name, regular, stringObject, toByteString
|
||||||
)
|
)
|
||||||
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll1)
|
import PDF.Parser (MonadParser(..), (<?>), Parser, evalParser)
|
||||||
|
|
||||||
data StateOperator =
|
data StateOperator =
|
||||||
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state
|
||||||
deriving (Bounded, Enum)
|
deriving (Bounded, Enum, Show)
|
||||||
|
data PathOperator =
|
||||||
|
M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction
|
||||||
|
| S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting
|
||||||
|
| W | Wstar -- clipping path
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
|
data ColorOperator =
|
||||||
|
CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_
|
||||||
|
deriving (Bounded, Enum, Show)
|
||||||
data TextOperator =
|
data TextOperator =
|
||||||
Td | TD | Tm | Tstar -- text positioning
|
Td | TD | Tm | Tstar -- text positioning
|
||||||
| TJ | Tj | Quote | DQuote -- text showing
|
| TJ | Tj | Quote | DQuote -- text showing
|
||||||
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
||||||
deriving (Bounded, Enum)
|
deriving (Bounded, Enum, Show)
|
||||||
data Argument = Raw ByteString | Typed DirectObject deriving Show
|
data Argument = Raw ByteString | Typed DirectObject deriving Show
|
||||||
type Call a = (a, [Argument])
|
type Call a = (a, [Argument])
|
||||||
|
|
||||||
|
type Operator a = (Bounded a, Enum a, Show a)
|
||||||
|
|
||||||
|
code :: Operator a => a -> ByteString
|
||||||
|
code = pack . expand . show
|
||||||
|
where
|
||||||
|
expand "" = ""
|
||||||
|
expand (c:'_':s) = toLower c : expand s
|
||||||
|
expand ('s':'t':'a':'r':s) = '*' : expand s
|
||||||
|
expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s
|
||||||
|
expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s
|
||||||
|
expand (c:s) = c : expand s
|
||||||
|
|
||||||
|
{-
|
||||||
instance Show StateOperator where
|
instance Show StateOperator where
|
||||||
show Cm = "cm"
|
show Cm = "cm"
|
||||||
show W = "w"
|
show W_ = "w"
|
||||||
show J = "J"
|
show J = "J"
|
||||||
show J_ = "j"
|
show J_ = "j"
|
||||||
show M = "M"
|
show M = "M"
|
||||||
|
@ -45,6 +67,35 @@ instance Show StateOperator where
|
||||||
show I = "i"
|
show I = "i"
|
||||||
show Gs = "gs"
|
show Gs = "gs"
|
||||||
|
|
||||||
|
instance Show PathOperator where
|
||||||
|
show M_ = "m"
|
||||||
|
show L_ = "l"
|
||||||
|
show C_ = "c"
|
||||||
|
show V_ = "v"
|
||||||
|
show Y_ = "y"
|
||||||
|
show H_
|
||||||
|
("m", (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("l", (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("c", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("v", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("y", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("h", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("re", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
||||||
|
, ("S", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("s", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("f", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("F", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("F*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("B", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("B*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("b", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("b*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("n", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("W", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
, ("W*", (L_, \l -> case l of [] -> True ; _ -> False))
|
||||||
|
|
||||||
|
instance Show ColorOperator where
|
||||||
|
|
||||||
instance Show TextOperator where
|
instance Show TextOperator where
|
||||||
show Td = "Td"
|
show Td = "Td"
|
||||||
show TD = "TD"
|
show TD = "TD"
|
||||||
|
@ -61,18 +112,58 @@ instance Show TextOperator where
|
||||||
show Tf = "Tf"
|
show Tf = "Tf"
|
||||||
show Tr = "Tr"
|
show Tr = "Tr"
|
||||||
show Ts = "Ts"
|
show Ts = "Ts"
|
||||||
|
-}
|
||||||
|
|
||||||
stateOperator :: OperatorTable StateOperator
|
stateOperator :: OperatorTable StateOperator
|
||||||
stateOperator = Map.fromList [
|
stateOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||||
("cm", (Cm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
(C_m, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
, ("w", (W, \l -> case l of [Raw _] -> True ; _ -> False))
|
, (W_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
, ("J", (J, \l -> case l of [Raw _] -> True ; _ -> False))
|
, (J, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
, ("j", (J_, \l -> case l of [Raw _] -> True ; _ -> False))
|
, (J_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
, ("M", (M, \l -> case l of [Raw _] -> True ; _ -> False))
|
, (M, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
, ("d", (D, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
|
, (D_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||||
, ("ri", (Ri, \l -> case l of [Raw _] -> True ; _ -> False))
|
, (R_i, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
, ("i", (I, \l -> case l of [Raw _] -> True ; _ -> False))
|
, (I_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
, ("gs", (Gs, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False))
|
, (G_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
|
||||||
|
]
|
||||||
|
|
||||||
|
pathOperator :: OperatorTable PathOperator
|
||||||
|
pathOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||||
|
(M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (C_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (V_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (Y_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (H_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (R_e, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (S, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (S_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (F_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (F, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Fstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Bstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (B_star, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (N_, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (W, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
, (Wstar, \l -> case l of [] -> True ; _ -> False)
|
||||||
|
]
|
||||||
|
|
||||||
|
colorOperator :: OperatorTable ColorOperator
|
||||||
|
colorOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||||
|
(CS, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
|
||||||
|
, (C_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False)
|
||||||
|
, (SC, \_ -> True)
|
||||||
|
, (SCN, \_ -> True)
|
||||||
|
, (S_c, \_ -> True)
|
||||||
|
, (S_cn, \_ -> True)
|
||||||
|
, (G, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (G_, \l -> case l of [Raw _] -> True ; _ -> False)
|
||||||
|
, (RG, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (R_g, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (K, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
, (K_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
]
|
]
|
||||||
|
|
||||||
{-
|
{-
|
||||||
|
@ -89,7 +180,7 @@ stateOperator _ = False
|
||||||
-}
|
-}
|
||||||
|
|
||||||
textOperator :: OperatorTable TextOperator
|
textOperator :: OperatorTable TextOperator
|
||||||
textOperator = Map.fromList $ (\(op, checker) -> (pack $ show op, (op, checker))) <$> [
|
textOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [
|
||||||
(Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
(Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||||
, (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
, (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
||||||
, (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
, (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
||||||
|
@ -127,7 +218,7 @@ textOperator _ = False
|
||||||
-}
|
-}
|
||||||
|
|
||||||
type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
|
type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
|
||||||
type Operator a = (Bounded a, Enum a, Show a)
|
--type Operator a = (Bounded a, Enum a, Show a)
|
||||||
type OperatorTable a = Map ByteString (TypeChecker a)
|
type OperatorTable a = Map ByteString (TypeChecker a)
|
||||||
type TypeChecker a = (a, [Argument] -> Bool)
|
type TypeChecker a = (a, [Argument] -> Bool)
|
||||||
|
|
||||||
|
@ -146,14 +237,14 @@ callChunk table =
|
||||||
Nothing -> return . Right $ Raw chunk
|
Nothing -> return . Right $ Raw chunk
|
||||||
Just typeChecker -> return $ Left typeChecker
|
Just typeChecker -> return $ Left typeChecker
|
||||||
|
|
||||||
stackParser :: (ArgumentStackParser m, Show a) => OperatorTable a -> m (Call a)
|
stackParser :: (ArgumentStackParser m, Operator a) => OperatorTable a -> m (Call a)
|
||||||
stackParser table = either popCall push =<< (callChunk table)
|
stackParser table = either popCall push =<< (callChunk table)
|
||||||
where
|
where
|
||||||
push arg = modify (arg:) >> stackParser table
|
push arg = modify (arg:) >> stackParser table
|
||||||
popCall (operator, predicate) = do
|
popCall (operator, predicate) = do
|
||||||
arguments <- reverse <$> get
|
arguments <- reverse <$> get
|
||||||
let call = (operator, arguments)
|
let call = (operator, arguments)
|
||||||
if predicate arguments then return call else fail (show call)
|
if predicate arguments then return call else fail (unpack $ code operator)
|
||||||
|
|
||||||
a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
|
a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
|
||||||
a table = evalStateT (stackParser table) []
|
a table = evalStateT (stackParser table) []
|
||||||
|
@ -173,24 +264,29 @@ stringArg = Typed . StringObject <$> stringObject <* blank
|
||||||
type ParserWithFont = ReaderT FontSet (Parser Font)
|
type ParserWithFont = ReaderT FontSet (Parser Font)
|
||||||
|
|
||||||
pageContents :: FontSet -> ByteString -> Either String [ByteString]
|
pageContents :: FontSet -> ByteString -> Either String [ByteString]
|
||||||
pageContents font input =
|
pageContents fontSet input =
|
||||||
evalParser (runReaderT page font) emptyFont input
|
evalParser (runReaderT (page) fontSet) emptyFont input
|
||||||
|
|
||||||
|
several :: MonadParser m => m [a] -> m [a]
|
||||||
|
several p = concat <$> (p `sepBy` blank)
|
||||||
|
|
||||||
page :: ParserWithFont [ByteString]
|
page :: ParserWithFont [ByteString]
|
||||||
page = graphicState <|> text <?> "Text page contents"
|
page = several (graphicState <|> text) <* blank <* endOfInput <?> "Text page contents"
|
||||||
|
|
||||||
graphicState :: ParserWithFont [ByteString]
|
graphicState :: ParserWithFont [ByteString]
|
||||||
graphicState =
|
graphicState =
|
||||||
(string "q" *> blank *> insideQ <* blank <* string "Q") <?> "Graphic state"
|
string "q" *> blank *> insideQ <* blank <* string "Q" <?> "Graphic state"
|
||||||
where
|
where
|
||||||
insideQ = concat <$> ((command <|> page) `sepBy` blank )
|
insideQ = several (command <|> graphicState <|> text)
|
||||||
command = a stateOperator *> return []
|
ignore x = a x *> return []
|
||||||
|
command =
|
||||||
|
ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator
|
||||||
|
|
||||||
text :: ParserWithFont [ByteString]
|
text :: ParserWithFont [ByteString]
|
||||||
text =
|
text =
|
||||||
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
|
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
|
||||||
where
|
where
|
||||||
commands = concat <$> (a textOperator >>= runOperator) `sepBy` blank
|
commands = several (a textOperator >>= runOperator)
|
||||||
|
|
||||||
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
|
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
|
||||||
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
||||||
|
|
Loading…
Reference in a new issue