Finish implementing the new stack-based call parser

This commit is contained in:
Tissevert 2019-11-30 12:39:40 +01:00
parent 8373bd1ea0
commit afbbcbffc5

View file

@ -1,17 +1,19 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
module PDF.Text ( {-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.Text {-(
pageContents pageContents
) where )-} where
import Control.Applicative ((<|>)) import Control.Applicative ((<|>))
import Control.Monad (foldM) import Control.Monad (foldM)
import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.Reader (ReaderT, runReaderT, asks)
import Control.Monad.State (get, put) import Control.Monad.State (MonadState, evalStateT, get, modify, put)
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy) import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (drop, null, take) import qualified Data.ByteString as BS (drop, null, take)
import Data.ByteString.Char8 (unpack) import Data.ByteString.Char8 (pack, unpack)
import Data.Map ((!)) import Data.Map ((!))
import qualified Data.Map as Map (lookup, toList) import qualified Data.Map as Map (lookup, toList)
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap) import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
@ -19,64 +21,103 @@ import PDF.Object (
DirectObject(..), StringObject(..) DirectObject(..), StringObject(..)
, array, blank, name, regular, stringObject, toByteString , array, blank, name, regular, stringObject, toByteString
) )
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll) import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll1)
data StateOperator = data StateOperator =
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
deriving (Bounded, Enum, Show) deriving (Bounded, Enum)
data TextOperator = data TextOperator =
Td | TD | Tm | Tstar -- text positioning Td | TD | Tm | Tstar -- text positioning
| TJ | Tj | Quote | DQuote -- text showing | TJ | Tj | Quote | DQuote -- text showing
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
deriving (Bounded, Enum, Show) deriving (Bounded, Enum)
data Argument = Raw ByteString | Typed DirectObject deriving Show data Argument = Raw ByteString | Typed DirectObject deriving Show
type Call a = (a, [Argument]) type Call a = (a, [Argument])
stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator) instance Show StateOperator where
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm" show Cm = "cm"
stateOperator W = (,) W <$> count 1 argument <* string "w" show W = "w"
stateOperator J = (,) J <$> count 1 argument <* string "J" show J = "J"
stateOperator J_ = (,) J_ <$> count 1 argument <* string "j" show J_ = "j"
stateOperator M = (,) M <$> count 1 argument <* string "M" show M = "M"
stateOperator D = (,) D <$> count 2 argument <* string "d" show D = "d"
stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri" show Ri = "ri"
stateOperator I = (,) I <$> count 1 argument <* string "i" show I = "i"
stateOperator Gs = (,) Gs <$> count 1 nameArg <* string "gs" show Gs = "gs"
{- instance Show TextOperator where
textOperator :: MonadParser m => TextOperator -> m (Call TextOperator) show Td = "Td"
textOperator Td = (,) Td <$> count 2 argument <* string "Td" show TD = "TD"
textOperator TD = (,) TD <$> count 2 argument <* string "TD" show Tm = "Tm"
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm" show Tstar = "T*"
textOperator Tstar = (,) Td <$> return [] <* string "T*" show TJ = "TJ"
textOperator TJ = (,) TJ <$> count 1 arrayArg <* string "TJ" show Tj = "Tj"
textOperator Tj = (,) Tj <$> count 1 stringArg <* string "Tj" show Quote = "'"
textOperator Quote = (,) Quote <$> count 1 stringArg <* string "'" show DQuote = "\""
textOperator DQuote = (,) DQuote <$> count 1 stringArg <* string "\"" show Tc = "Tc"
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc" show Tw = "Tw"
textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw" show Tz = "Tz"
textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz" show TL = "TL"
textOperator TL = (,) TL <$> count 1 argument <* string "TL" show Tf = "Tf"
textOperator Tf = (,) Tf <$> sequence [nameArg, argument] <* string "Tf" show Tr = "Tr"
textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr" show Ts = "Ts"
textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
-}
textOperator :: MonadParser m => TextOperator -> m TextOperator stateOperator :: TypeChecker StateOperator
textOperator textOp = string (show textOp) *> return textOp stateOperator (Cm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
stateOperator (W, [Raw _]) = True
stateOperator (J, [Raw _]) = True
stateOperator (J_, [Raw _]) = True
stateOperator (M, [Raw _]) = True
stateOperator (D, [Raw _, Raw _]) = True
stateOperator (Ri, [Raw _]) = True
stateOperator (I, [Raw _]) = True
stateOperator (Gs, [Typed (NameObject _)]) = True
stateOperator _ = False
textChunk :: MonadParser m => m (Either TextOperator Argument) textOperator :: TypeChecker TextOperator
textChunk = textOperator (Td, [Raw _, Raw _]) = True
choice $ Left . textOperator <$> [minBound .. maxBound] textOperator (TD, [Raw _, Raw _]) = True
<|> choice $ Right <$> [stringArg, nameArg, arrayArg, argument] textOperator (Tm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
textOperator (Tstar, []) = True
textOperator (TJ, [Typed (Array _)]) = True
textOperator (Tj, [Typed (StringObject _)]) = True
textOperator (Quote, [Typed (StringObject _)]) = True
textOperator (DQuote, [Typed (StringObject _)]) = True
textOperator (Tc, [Raw _]) = True
textOperator (Tw, [Raw _]) = True
textOperator (Tz, [Raw _]) = True
textOperator (TL, [Raw _]) = True
textOperator (Tf, [Typed (NameObject _), Raw _]) = True
textOperator (Tr, [Raw _]) = True
textOperator (Ts, [Raw _]) = True
textOperator _ = False
aTextOperator :: MonadParser m => m (Call TextOperator) type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
type Operator a = (Bounded a, Enum a, Show a)
type TypeChecker a = Call a -> Bool
a :: (Bounded o, Enum o, MonadParser m) => (o -> m (Call o)) -> m (Call o) parseShowable :: (Show a, MonadParser m) => a -> m a
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound] parseShowable textOp = string (pack $ show textOp) *> return textOp
callChunk :: (MonadParser m, Operator a) => m (Either a Argument)
callChunk =
Left <$> choice (parseShowable <$> [minBound .. maxBound])
<|> Right <$> choice [stringArg, nameArg, arrayArg, argument]
<?> "call chunk"
stackParser :: (ArgumentStackParser m, Operator a) => TypeChecker a -> m (Call a)
stackParser typeChecker = either popCall push =<< callChunk
where
push arg = modify (arg:) >> stackParser typeChecker
popCall operator = do
call <- (,) operator . reverse <$> get
if typeChecker call then return call else fail (show call)
a :: (Operator a, MonadParser m) => TypeChecker a -> m (Call a)
a typeChecker = evalStateT (stackParser typeChecker) []
argument :: MonadParser m => m Argument argument :: MonadParser m => m Argument
argument = Raw <$> takeAll regular <* blank argument = Raw <$> takeAll1 regular <* blank
arrayArg :: MonadParser m => m Argument arrayArg :: MonadParser m => m Argument
arrayArg = Typed . Array <$> array <* blank arrayArg = Typed . Array <$> array <* blank
@ -107,7 +148,7 @@ text :: ParserWithFont [ByteString]
text = text =
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators" string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
where where
commands = concat <$> (aTextOperator >>= runOperator) `sepBy` blank commands = concat <$> (a textOperator >>= runOperator) `sepBy` blank
runOperator :: Call TextOperator -> ParserWithFont [ByteString] runOperator :: Call TextOperator -> ParserWithFont [ByteString]
runOperator (Tf, [Typed (NameObject fontName), _]) = runOperator (Tf, [Typed (NameObject fontName), _]) =