2019-10-03 07:59:09 +02:00
|
|
|
{-# LANGUAGE NamedFieldPuns #-}
|
2019-09-23 23:19:27 +02:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
2019-11-30 12:39:40 +01:00
|
|
|
{-# LANGUAGE ConstraintKinds #-}
|
|
|
|
{-# LANGUAGE FlexibleContexts #-}
|
|
|
|
module PDF.Text {-(
|
2019-11-29 11:48:28 +01:00
|
|
|
pageContents
|
2019-11-30 12:39:40 +01:00
|
|
|
)-} where
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-27 18:16:12 +02:00
|
|
|
import Control.Applicative ((<|>))
|
2019-09-28 09:25:59 +02:00
|
|
|
import Control.Monad (foldM)
|
2019-09-25 18:42:34 +02:00
|
|
|
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
2019-11-30 12:39:40 +01:00
|
|
|
import Control.Monad.State (MonadState, evalStateT, get, modify, put)
|
|
|
|
import Data.Attoparsec.ByteString.Char8 (choice, sepBy)
|
2019-09-27 18:16:12 +02:00
|
|
|
import Data.ByteString (ByteString)
|
2019-10-03 07:59:09 +02:00
|
|
|
import qualified Data.ByteString as BS (drop, null, take)
|
2019-11-30 12:39:40 +01:00
|
|
|
import Data.ByteString.Char8 (pack, unpack)
|
2019-11-30 21:46:22 +01:00
|
|
|
import Data.Map ((!), (!?), Map)
|
|
|
|
import qualified Data.Map as Map (fromList, lookup, toList)
|
2019-10-03 07:59:09 +02:00
|
|
|
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
|
2019-09-25 18:42:34 +02:00
|
|
|
import PDF.Object (
|
2019-09-27 18:16:12 +02:00
|
|
|
DirectObject(..), StringObject(..)
|
2019-10-04 18:46:07 +02:00
|
|
|
, array, blank, name, regular, stringObject, toByteString
|
2019-09-25 18:42:34 +02:00
|
|
|
)
|
2019-11-30 12:39:40 +01:00
|
|
|
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll1)
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
data StateOperator =
|
|
|
|
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
2019-11-30 12:39:40 +01:00
|
|
|
deriving (Bounded, Enum)
|
2019-09-25 18:42:34 +02:00
|
|
|
data TextOperator =
|
|
|
|
Td | TD | Tm | Tstar -- text positioning
|
|
|
|
| TJ | Tj | Quote | DQuote -- text showing
|
|
|
|
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
2019-11-30 12:39:40 +01:00
|
|
|
deriving (Bounded, Enum)
|
2019-09-30 14:13:12 +02:00
|
|
|
data Argument = Raw ByteString | Typed DirectObject deriving Show
|
2019-09-25 18:42:34 +02:00
|
|
|
type Call a = (a, [Argument])
|
|
|
|
|
2019-11-30 12:39:40 +01:00
|
|
|
instance Show StateOperator where
|
|
|
|
show Cm = "cm"
|
|
|
|
show W = "w"
|
|
|
|
show J = "J"
|
|
|
|
show J_ = "j"
|
|
|
|
show M = "M"
|
|
|
|
show D = "d"
|
|
|
|
show Ri = "ri"
|
|
|
|
show I = "i"
|
|
|
|
show Gs = "gs"
|
|
|
|
|
|
|
|
instance Show TextOperator where
|
|
|
|
show Td = "Td"
|
|
|
|
show TD = "TD"
|
|
|
|
show Tm = "Tm"
|
|
|
|
show Tstar = "T*"
|
|
|
|
show TJ = "TJ"
|
|
|
|
show Tj = "Tj"
|
|
|
|
show Quote = "'"
|
|
|
|
show DQuote = "\""
|
|
|
|
show Tc = "Tc"
|
|
|
|
show Tw = "Tw"
|
|
|
|
show Tz = "Tz"
|
|
|
|
show TL = "TL"
|
|
|
|
show Tf = "Tf"
|
|
|
|
show Tr = "Tr"
|
|
|
|
show Ts = "Ts"
|
|
|
|
|
2019-11-30 21:46:22 +01:00
|
|
|
stateOperator :: OperatorTable StateOperator
|
|
|
|
stateOperator = Map.fromList [
|
|
|
|
("cm", (Cm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False))
|
|
|
|
, ("w", (W, \l -> case l of [Raw _] -> True ; _ -> False))
|
|
|
|
, ("J", (J, \l -> case l of [Raw _] -> True ; _ -> False))
|
|
|
|
, ("j", (J_, \l -> case l of [Raw _] -> True ; _ -> False))
|
|
|
|
, ("M", (M, \l -> case l of [Raw _] -> True ; _ -> False))
|
|
|
|
, ("d", (D, \l -> case l of [Raw _, Raw _] -> True ; _ -> False))
|
|
|
|
, ("ri", (Ri, \l -> case l of [Raw _] -> True ; _ -> False))
|
|
|
|
, ("i", (I, \l -> case l of [Raw _] -> True ; _ -> False))
|
|
|
|
, ("gs", (Gs, \l -> case l of [Typed (NameObject _)] -> True ; _ -> False))
|
|
|
|
]
|
|
|
|
|
|
|
|
{-
|
2019-11-30 12:39:40 +01:00
|
|
|
stateOperator (Cm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
|
|
|
|
stateOperator (W, [Raw _]) = True
|
|
|
|
stateOperator (J, [Raw _]) = True
|
|
|
|
stateOperator (J_, [Raw _]) = True
|
|
|
|
stateOperator (M, [Raw _]) = True
|
|
|
|
stateOperator (D, [Raw _, Raw _]) = True
|
|
|
|
stateOperator (Ri, [Raw _]) = True
|
|
|
|
stateOperator (I, [Raw _]) = True
|
|
|
|
stateOperator (Gs, [Typed (NameObject _)]) = True
|
|
|
|
stateOperator _ = False
|
2019-11-30 21:46:22 +01:00
|
|
|
-}
|
|
|
|
|
|
|
|
textOperator :: OperatorTable TextOperator
|
|
|
|
textOperator = Map.fromList $ (\(op, checker) -> (pack $ show op, (op, checker))) <$> [
|
|
|
|
(Td, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
|
|
|
, (TD, \l -> case l of [Raw _, Raw _] -> True ; _ -> False)
|
|
|
|
, (Tm, \l -> case l of [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _] -> True ; _ -> False)
|
|
|
|
, (Tstar, \l -> case l of [] -> True ; _ -> False)
|
|
|
|
, (TJ, \l -> case l of [Typed (Array _)] -> True ; _ -> False)
|
|
|
|
, (Tj, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
|
|
|
|
, (Quote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
|
|
|
|
, (DQuote, \l -> case l of [Typed (StringObject _)] -> True ; _ -> False)
|
|
|
|
, (Tc, \l -> case l of [Raw _] -> True ; _ -> False)
|
|
|
|
, (Tw, \l -> case l of [Raw _] -> True ; _ -> False)
|
|
|
|
, (Tz, \l -> case l of [Raw _] -> True ; _ -> False)
|
|
|
|
, (TL, \l -> case l of [Raw _] -> True ; _ -> False)
|
|
|
|
, (Tf, \l -> case l of [Typed (NameObject _), Raw _] -> True ; _ -> False)
|
|
|
|
, (Tr, \l -> case l of [Raw _] -> True ; _ -> False)
|
|
|
|
, (Ts, \l -> case l of [Raw _] -> True ; _ -> False)
|
|
|
|
]
|
|
|
|
|
|
|
|
{-
|
2019-11-30 12:39:40 +01:00
|
|
|
textOperator (Td, [Raw _, Raw _]) = True
|
|
|
|
textOperator (TD, [Raw _, Raw _]) = True
|
|
|
|
textOperator (Tm, [Raw _, Raw _, Raw _, Raw _, Raw _, Raw _]) = True
|
|
|
|
textOperator (Tstar, []) = True
|
|
|
|
textOperator (TJ, [Typed (Array _)]) = True
|
|
|
|
textOperator (Tj, [Typed (StringObject _)]) = True
|
|
|
|
textOperator (Quote, [Typed (StringObject _)]) = True
|
|
|
|
textOperator (DQuote, [Typed (StringObject _)]) = True
|
|
|
|
textOperator (Tc, [Raw _]) = True
|
|
|
|
textOperator (Tw, [Raw _]) = True
|
|
|
|
textOperator (Tz, [Raw _]) = True
|
|
|
|
textOperator (TL, [Raw _]) = True
|
|
|
|
textOperator (Tf, [Typed (NameObject _), Raw _]) = True
|
|
|
|
textOperator (Tr, [Raw _]) = True
|
|
|
|
textOperator (Ts, [Raw _]) = True
|
|
|
|
textOperator _ = False
|
2019-11-30 21:46:22 +01:00
|
|
|
-}
|
2019-11-30 12:39:40 +01:00
|
|
|
|
|
|
|
type ArgumentStackParser m = (MonadState [Argument] m, MonadParser m)
|
|
|
|
type Operator a = (Bounded a, Enum a, Show a)
|
2019-11-30 21:46:22 +01:00
|
|
|
type OperatorTable a = Map ByteString (TypeChecker a)
|
|
|
|
type TypeChecker a = (a, [Argument] -> Bool)
|
2019-11-30 12:39:40 +01:00
|
|
|
|
|
|
|
parseShowable :: (Show a, MonadParser m) => a -> m a
|
|
|
|
parseShowable textOp = string (pack $ show textOp) *> return textOp
|
|
|
|
|
2019-11-30 21:46:22 +01:00
|
|
|
callChunk :: MonadParser m => OperatorTable a -> m (Either (TypeChecker a) Argument)
|
|
|
|
callChunk table =
|
|
|
|
(Right <$> choice [stringArg, nameArg, arrayArg])
|
|
|
|
<|> operatorOrRawArg
|
2019-11-30 12:39:40 +01:00
|
|
|
<?> "call chunk"
|
|
|
|
where
|
2019-11-30 21:46:22 +01:00
|
|
|
operatorOrRawArg = do
|
|
|
|
chunk <- takeAll1 regular <* blank
|
|
|
|
case table !? chunk of
|
|
|
|
Nothing -> return . Right $ Raw chunk
|
|
|
|
Just typeChecker -> return $ Left typeChecker
|
|
|
|
|
|
|
|
stackParser :: (ArgumentStackParser m, Show a) => OperatorTable a -> m (Call a)
|
|
|
|
stackParser table = either popCall push =<< (callChunk table)
|
|
|
|
where
|
|
|
|
push arg = modify (arg:) >> stackParser table
|
|
|
|
popCall (operator, predicate) = do
|
|
|
|
arguments <- reverse <$> get
|
|
|
|
let call = (operator, arguments)
|
|
|
|
if predicate arguments then return call else fail (show call)
|
|
|
|
|
|
|
|
a :: (Operator a, MonadParser m) => OperatorTable a -> m (Call a)
|
|
|
|
a table = evalStateT (stackParser table) []
|
2019-09-25 18:42:34 +02:00
|
|
|
|
2019-09-27 12:21:06 +02:00
|
|
|
argument :: MonadParser m => m Argument
|
2019-11-30 12:39:40 +01:00
|
|
|
argument = Raw <$> takeAll1 regular <* blank
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-28 09:25:59 +02:00
|
|
|
arrayArg :: MonadParser m => m Argument
|
|
|
|
arrayArg = Typed . Array <$> array <* blank
|
|
|
|
|
|
|
|
nameArg :: MonadParser m => m Argument
|
|
|
|
nameArg = Typed . NameObject <$> name <* blank
|
|
|
|
|
|
|
|
stringArg :: MonadParser m => m Argument
|
|
|
|
stringArg = Typed . StringObject <$> stringObject <* blank
|
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
type ParserWithFont = ReaderT CMappers (Parser CMap)
|
|
|
|
|
2019-11-29 11:48:28 +01:00
|
|
|
pageContents :: CMappers -> ByteString -> Either String [ByteString]
|
2019-09-24 18:38:12 +02:00
|
|
|
pageContents font input =
|
2019-11-29 11:48:28 +01:00
|
|
|
evalParser (runReaderT page font) emptyCMap input
|
2019-09-24 18:38:12 +02:00
|
|
|
|
|
|
|
page :: ParserWithFont [ByteString]
|
2019-09-28 09:25:59 +02:00
|
|
|
page = graphicState <|> text <?> "Text page contents"
|
2019-09-24 18:38:12 +02:00
|
|
|
|
|
|
|
graphicState :: ParserWithFont [ByteString]
|
2019-09-23 23:19:27 +02:00
|
|
|
graphicState =
|
2019-09-28 09:25:59 +02:00
|
|
|
(string "q" *> blank *> insideQ <* blank <* string "Q") <?> "Graphic state"
|
2019-09-23 23:19:27 +02:00
|
|
|
where
|
2019-09-28 09:25:59 +02:00
|
|
|
insideQ = concat <$> ((command <|> page) `sepBy` blank )
|
2019-09-25 18:42:34 +02:00
|
|
|
command = a stateOperator *> return []
|
2019-09-23 23:19:27 +02:00
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
text :: ParserWithFont [ByteString]
|
2019-09-25 18:42:34 +02:00
|
|
|
text =
|
2019-09-28 09:25:59 +02:00
|
|
|
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
|
2019-09-25 18:42:34 +02:00
|
|
|
where
|
2019-11-30 12:39:40 +01:00
|
|
|
commands = concat <$> (a textOperator >>= runOperator) `sepBy` blank
|
2019-09-25 18:42:34 +02:00
|
|
|
|
|
|
|
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
|
|
|
|
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
|
|
|
asks (! fontName) >>= put >> return []
|
|
|
|
|
|
|
|
runOperator (Tstar, []) = return ["\n"]
|
|
|
|
|
|
|
|
runOperator (TJ, [Typed (Array arrayObject)]) =
|
|
|
|
replicate 1 <$> foldM appendText "" arrayObject
|
|
|
|
where
|
|
|
|
appendText bs (StringObject outputString) =
|
|
|
|
mappend bs <$> decodeString outputString
|
|
|
|
appendText bs _ = return bs
|
2019-09-30 14:13:12 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
runOperator (Tj, [Typed (StringObject outputString)]) =
|
|
|
|
replicate 1 <$> decodeString outputString
|
|
|
|
|
|
|
|
runOperator (Quote, [Typed (StringObject outputString)]) =
|
|
|
|
(\bs -> ["\n", bs]) <$> decodeString outputString
|
|
|
|
|
|
|
|
runOperator (DQuote, [Typed (StringObject outputString)]) =
|
|
|
|
(\bs -> ["\n", bs]) <$> decodeString outputString
|
2019-09-24 18:38:12 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
runOperator _ = return []
|
2019-09-23 23:19:27 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
decodeString :: StringObject -> ParserWithFont ByteString
|
2019-10-04 18:46:07 +02:00
|
|
|
decodeString = decode . toByteString
|
2019-10-03 07:59:09 +02:00
|
|
|
where
|
2019-10-04 18:46:07 +02:00
|
|
|
decode input
|
2019-10-03 07:59:09 +02:00
|
|
|
| BS.null input = return ""
|
|
|
|
| otherwise = do
|
2019-10-04 18:46:07 +02:00
|
|
|
(output, remainingInput) <- trySizes input =<< Map.toList <$> get
|
|
|
|
mappend output <$> decode remainingInput
|
|
|
|
trySizes :: ByteString -> [(Int, [CRange])] -> ParserWithFont (ByteString, ByteString)
|
|
|
|
trySizes s [] = fail $ "No matching code found in font for " ++ unpack s
|
|
|
|
trySizes s ((size, cRanges):others) =
|
2019-10-03 07:59:09 +02:00
|
|
|
let prefix = BS.take size s in
|
2019-10-04 18:46:07 +02:00
|
|
|
case tryRanges prefix cRanges of
|
|
|
|
Nothing -> trySizes s others
|
2019-10-03 07:59:09 +02:00
|
|
|
Just outputSequence -> return (outputSequence, BS.drop size s)
|
2019-10-04 18:46:07 +02:00
|
|
|
tryRanges :: ByteString -> [CRange] -> Maybe ByteString
|
|
|
|
tryRanges _ [] = Nothing
|
|
|
|
tryRanges prefix ((CRange {mapping}):cRanges) =
|
2019-10-03 07:59:09 +02:00
|
|
|
case Map.lookup prefix mapping of
|
2019-10-04 18:46:07 +02:00
|
|
|
Nothing -> tryRanges prefix cRanges
|
2019-10-03 07:59:09 +02:00
|
|
|
outputSequence -> outputSequence
|