{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE FlexibleContexts #-} module PDF.Text {-( pageContents )-} where import Control.Applicative ((<|>)) import Control.Monad (foldM) import Control.Monad.Fail (MonadFail) import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.State (MonadState, evalStateT, get, modify, put) import Data.Attoparsec.ByteString.Char8 (choice, sepBy) import Data.ByteString (ByteString) import Data.ByteString.Char8 (pack, unpack) import Data.Char (toLower) import Data.Map ((!), (!?), Map) import qualified Data.Map as Map (fromList) import Data.Text (Text) import PDF.Font (Font, FontSet, emptyFont) import PDF.Object ( DirectObject(..), StringObject(..) , array, blank, name, regular, stringObject, toByteString ) import PDF.Parser (MonadParser(..), (), Parser, evalParser) data StateOperator = C_m | W_ | J | J_ | M | D_ | R_i | I_ | G_s -- general graphic state deriving (Bounded, Enum, Show) data PathOperator = M_ | L_ | C_ | V_ | Y_ | H_ | R_e -- path construction | S | S_ | F_ | F | Fstar | B | Bstar | B_ | B_star | N_ -- path painting | W | Wstar -- clipping path deriving (Bounded, Enum, Show) data ColorOperator = CS | C_s | SC | SCN | S_c | S_cn | G | G_ | RG | R_g | K | K_ deriving (Bounded, Enum, Show) data TextOperator = Td | TD | Tm | Tstar -- text positioning | TJ | Tj | Quote | DQuote -- text showing | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state deriving (Bounded, Enum, Show) data Argument = Raw ByteString | Typed DirectObject deriving Show type Call a = (a, [Argument]) type Operator a = (Bounded a, Enum a, Show a) code :: Operator a => a -> ByteString code = pack . expand . show where expand "" = "" expand (c:'_':s) = toLower c : expand s expand ('s':'t':'a':'r':s) = '*' : expand s expand ('Q':'u':'o':'t':'e':s) = '\'' : expand s expand ('D':'Q':'u':'o':'t':'e':s) = '"' : expand s expand (c:s) = c : expand s {- instance Show StateOperator where show Cm = "cm" show W_ = "w" show J = "J" show J_ = "j" show M = "M" show D = "d" show Ri = "ri" show I = "i" show Gs = "gs" instance Show PathOperator where show M_ = "m" show L_ = "l" show C_ = "c" show V_ = "v" show Y_ = "y" show H_ ("m", (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)) , ("l", (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)) , ("c", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)) , ("v", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)) , ("y", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)) , ("h", (L_, \l -> case l of [] -> True ; _ -> False)) , ("re", (L_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)) , ("S", (L_, \l -> case l of [] -> True ; _ -> False)) , ("s", (L_, \l -> case l of [] -> True ; _ -> False)) , ("f", (L_, \l -> case l of [] -> True ; _ -> False)) , ("F", (L_, \l -> case l of [] -> True ; _ -> False)) , ("F*", (L_, \l -> case l of [] -> True ; _ -> False)) , ("B", (L_, \l -> case l of [] -> True ; _ -> False)) , ("B*", (L_, \l -> case l of [] -> True ; _ -> False)) , ("b", (L_, \l -> case l of [] -> True ; _ -> False)) , ("b*", (L_, \l -> case l of [] -> True ; _ -> False)) , ("n", (L_, \l -> case l of [] -> True ; _ -> False)) , ("W", (L_, \l -> case l of [] -> True ; _ -> False)) , ("W*", (L_, \l -> case l of [] -> True ; _ -> False)) instance Show ColorOperator where instance Show TextOperator where show Td = "Td" show TD = "TD" show Tm = "Tm" show Tstar = "T*" show TJ = "TJ" show Tj = "Tj" show Quote = "'" show DQuote = "\"" show Tc = "Tc" show Tw = "Tw" show Tz = "Tz" show TL = "TL" show Tf = "Tf" show Tr = "Tr" show Ts = "Ts" -} stateOperator :: OperatorTable StateOperator stateOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [ (C_m, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False) , (W_, \l -> case l of [Raw _] -> True ; _ -> False) , (J, \l -> case l of [Raw _] -> True ; _ -> False) , (J_, \l -> case l of [Raw _] -> True ; _ -> False) , (M, \l -> case l of [Raw _] -> True ; _ -> False) , (D_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False) , (R_i, \l -> case l of [Raw _] -> True ; _ -> False) , (I_, \l -> case l of [Raw _] -> True ; _ -> False) , (G_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False) ] pathOperator :: OperatorTable PathOperator pathOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [ (M_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False) , (L_, \l -> case l of [Raw _, Raw _] -> True ; _ -> False) , (C_, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False) , (V_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False) , (Y_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False) , (H_, \l -> case l of [] -> True ; _ -> False) , (R_e, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False) , (S, \l -> case l of [] -> True ; _ -> False) , (S_, \l -> case l of [] -> True ; _ -> False) , (F_, \l -> case l of [] -> True ; _ -> False) , (F, \l -> case l of [] -> True ; _ -> False) , (Fstar, \l -> case l of [] -> True ; _ -> False) , (B, \l -> case l of [] -> True ; _ -> False) , (Bstar, \l -> case l of [] -> True ; _ -> False) , (B_, \l -> case l of [] -> True ; _ -> False) , (B_star, \l -> case l of [] -> True ; _ -> False) , (N_, \l -> case l of [] -> True ; _ -> False) , (W, \l -> case l of [] -> True ; _ -> False) , (Wstar, \l -> case l of [] -> True ; _ -> False) ] colorOperator :: OperatorTable ColorOperator colorOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [ (CS, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False) , (C_s, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False) , (SC, \_ -> True) , (SCN, \_ -> True) , (S_c, \_ -> True) , (S_cn, \_ -> True) , (G, \l -> case l of [Raw _] -> True ; _ -> False) , (G_, \l -> case l of [Raw _] -> True ; _ -> False) , (RG, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False) , (R_g, \l -> case l of [Raw _, Raw _, Raw _] -> True ; _ -> False) , (K, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False) , (K_, \l -> case l of [Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False) ] {- stateOperator (Cm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True stateOperator (W, [Raw _]) = True stateOperator (J, [Raw _]) = True stateOperator (J_, [Raw _]) = True stateOperator (M, [Raw _]) = True stateOperator (D, [Raw _, Raw _]) = True stateOperator (Ri, [Raw _]) = True stateOperator (I, [Raw _]) = True stateOperator (Gs, [Typed (NameObject _)]) = True stateOperator _ = False -} textOperator :: OperatorTable TextOperator textOperator = Map.fromList $ (\(op, checker) -> (code op, (op, checker))) <$> [ (Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False) , (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False) , (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False) , (Tstar, \l -> case l of [] -> True ; _ -> False) , (TJ, \l -> case l of [Typed (Array _)] -> True ; _ -> False) , (Tj, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False) , (Quote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False) , (DQuote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False) , (Tc, \l -> case l of [Raw _] -> True ; _ -> False) , (Tw, \l -> case l of [Raw _] -> True ; _ -> False) , (Tz, \l -> case l of [Raw _] -> True ; _ -> False) , (TL, \l -> case l of [Raw _] -> True ; _ -> False) , (Tf, \l -> case l of [Typed (NameObject _), Raw _] -> True ; _ -> False) , (Tr, \l -> case l of [Raw _] -> True ; _ -> False) , (Ts, \l -> case l of [Raw _] -> True ; _ -> False) ] {- textOperator (Td, [Raw _, Raw _]) = True textOperator (TD, [Raw _, Raw _]) = True textOperator (Tm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True textOperator (Tstar, []) = True textOperator (TJ, [Typed (Array _)]) = True textOperator (Tj, [Typed (StringObject _)]) = True textOperator (Quote, [Typed (StringObject _)]) = True textOperator (DQuote, [Typed (StringObject _)]) = True textOperator (Tc, [Raw _]) = True textOperator (Tw, [Raw _]) = True textOperator (Tz, [Raw _]) = True textOperator (TL, [Raw _]) = True textOperator (Tf, [Typed (NameObject _), Raw _]) = True textOperator (Tr, [Raw _]) = True textOperator (Ts, [Raw _]) = True textOperator _ = False -} type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m) --type Operator a = (Bounded a, Enum a, Show a) type OperatorTable a = Map ByteString (TypeChecker a) type TypeChecker a = (a, [Argument] -> Bool) parseShowable :: (Show a, MonadParser m) => a -> m a parseShowable textOp = string (pack $ show textOp) *> return textOp callChunk :: MonadParser m => OperatorTable a -> m (Either (TypeChecker a) Argument) callChunk table = (Right <$> choice [stringArg, nameArg, arrayArg]) <|> operatorOrRawArg "call chunk" where operatorOrRawArg = do chunk <- takeAll1 regular <* blank case table !? chunk of Nothing -> return . Right $ Raw chunk Just typeChecker -> return $ Left typeChecker stackParser :: (ArgumentStackParser m, Operator a) => OperatorTable a -> m (Call a) stackParser table = either popCall push =<< (callChunk table) where push arg = modify (arg:) >> stackParser table popCall (operator, predicate) = do arguments <- reverse <$> get let call = (operator, arguments) if predicate arguments then return call else fail (unpack $ code operator) a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a) a table = evalStateT (stackParser table) [] argument :: MonadParser m => m Argument argument = Raw <$> takeAll1 regular <* blank arrayArg :: MonadParser m => m Argument arrayArg = Typed . Array <$> array <* blank nameArg :: MonadParser m => m Argument nameArg = Typed . NameObject <$> name <* blank stringArg :: MonadParser m => m Argument stringArg = Typed . StringObject <$> stringObject <* blank type ParserWithFont = ReaderT FontSet (Parser Font) pageContents :: FontSet -> ByteString -> Either String [Text] pageContents fontSet input = evalParser (runReaderT (page) fontSet) emptyFont input several :: MonadParser m => m [a] -> m [a] several p = concat <$> (p `sepBy` blank) page :: ParserWithFont [Text] page = several (graphicState <|> text) <* blank <* endOfInput "Text page contents" graphicState :: ParserWithFont [Text] graphicState = string "q" *> blank *> insideQ <* blank <* string "Q" "Graphic state" where insideQ = several (command <|> graphicState <|> text) ignore x = a x *> return [] command = ignore stateOperator <|> ignore pathOperator <|> ignore colorOperator text :: ParserWithFont [Text] text = string "BT" *> blank *> commands <* blank <* string "ET" "Text operators" where commands = several (a textOperator >>= runOperator) runOperator :: Call TextOperator -> ParserWithFont [Text] runOperator (Tf, [Typed (NameObject fontName), _]) = asks (! fontName) >>= put >> return [] runOperator (Tstar, []) = return ["\n"] runOperator (TJ, [Typed (Array arrayObject)]) = replicate 1 <$> foldM appendText "" arrayObject where appendText bs (StringObject outputString) = mappend bs <$> decodeString outputString appendText bs _ = return bs runOperator (Tj, [Typed (StringObject outputString)]) = replicate 1 <$> decodeString outputString runOperator (Quote, [Typed (StringObject outputString)]) = (\bs -> ["\n", bs]) <$> decodeString outputString runOperator (DQuote, [Typed (StringObject outputString)]) = (\bs -> ["\n", bs]) <$> decodeString outputString runOperator _ = return [] decodeString :: (MonadFail m, MonadState Font m) => StringObject -> m Text decodeString input = do font <- get either fail return . font $ toByteString input