2019-10-03 07:59:09 +02:00
|
|
|
{-# LANGUAGE NamedFieldPuns #-}
|
2019-09-23 23:19:27 +02:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
2019-09-27 18:38:03 +02:00
|
|
|
module PDF.Text {-(
|
2019-09-27 18:16:12 +02:00
|
|
|
PageContents(..)
|
2019-09-23 18:00:47 +02:00
|
|
|
, pageContents
|
2019-09-27 18:38:03 +02:00
|
|
|
)-} where
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-27 18:16:12 +02:00
|
|
|
import Control.Applicative ((<|>))
|
2019-09-28 09:25:59 +02:00
|
|
|
import Control.Monad (foldM)
|
2019-09-25 18:42:34 +02:00
|
|
|
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
2019-09-27 12:21:06 +02:00
|
|
|
import Control.Monad.State (get, put)
|
2019-09-27 18:16:12 +02:00
|
|
|
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
|
|
|
|
import Data.ByteString (ByteString)
|
2019-10-03 07:59:09 +02:00
|
|
|
import qualified Data.ByteString as BS (drop, null, take)
|
2019-09-30 14:13:12 +02:00
|
|
|
import Data.ByteString.Char8.Util (decodeHex)
|
2019-10-03 07:59:09 +02:00
|
|
|
import Data.List (find)
|
2019-09-27 18:16:12 +02:00
|
|
|
import Data.Map ((!))
|
2019-10-03 07:59:09 +02:00
|
|
|
import qualified Data.Map as Map (lookup, toList)
|
|
|
|
import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap)
|
2019-09-25 18:42:34 +02:00
|
|
|
import PDF.Object (
|
2019-09-27 18:16:12 +02:00
|
|
|
DirectObject(..), StringObject(..)
|
|
|
|
, array, blank, name, parseBytes, regular, stringObject
|
2019-09-25 18:42:34 +02:00
|
|
|
)
|
2019-09-28 09:25:59 +02:00
|
|
|
import PDF.Parser (MonadParser, (<?>), Parser, evalParser, string, takeAll)
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
data StateOperator =
|
|
|
|
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
2019-09-30 14:13:12 +02:00
|
|
|
deriving (Bounded, Enum, Show)
|
2019-09-25 18:42:34 +02:00
|
|
|
data TextOperator =
|
|
|
|
Td | TD | Tm | Tstar -- text positioning
|
|
|
|
| TJ | Tj | Quote | DQuote -- text showing
|
|
|
|
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
2019-09-30 14:13:12 +02:00
|
|
|
deriving (Bounded, Enum, Show)
|
|
|
|
data Argument = Raw ByteString | Typed DirectObject deriving Show
|
2019-09-25 18:42:34 +02:00
|
|
|
type Call a = (a, [Argument])
|
|
|
|
|
2019-09-27 12:21:06 +02:00
|
|
|
stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
|
2019-09-25 18:42:34 +02:00
|
|
|
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
|
|
|
|
stateOperator W = (,) W <$> count 1 argument <* string "w"
|
|
|
|
stateOperator J = (,) J <$> count 1 argument <* string "J"
|
|
|
|
stateOperator J_ = (,) J_ <$> count 1 argument <* string "j"
|
|
|
|
stateOperator M = (,) M <$> count 1 argument <* string "M"
|
|
|
|
stateOperator D = (,) D <$> count 2 argument <* string "d"
|
|
|
|
stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
|
|
|
|
stateOperator I = (,) I <$> count 1 argument <* string "i"
|
2019-09-28 09:25:59 +02:00
|
|
|
stateOperator Gs = (,) Gs <$> count 1 nameArg <* string "gs"
|
2019-09-25 18:42:34 +02:00
|
|
|
|
2019-09-27 12:21:06 +02:00
|
|
|
textOperator :: MonadParser m => TextOperator -> m (Call TextOperator)
|
2019-09-25 18:42:34 +02:00
|
|
|
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
|
|
|
|
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
|
|
|
|
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
|
|
|
|
textOperator Tstar = (,) Td <$> return [] <* string "T*"
|
2019-09-28 09:25:59 +02:00
|
|
|
textOperator TJ = (,) TJ <$> count 1 arrayArg <* string "TJ"
|
|
|
|
textOperator Tj = (,) Tj <$> count 1 stringArg <* string "Tj"
|
|
|
|
textOperator Quote = (,) Quote <$> count 1 stringArg <* string "'"
|
|
|
|
textOperator DQuote = (,) DQuote <$> count 1 stringArg <* string "\""
|
2019-09-25 18:42:34 +02:00
|
|
|
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
|
|
|
|
textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw"
|
|
|
|
textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz"
|
|
|
|
textOperator TL = (,) TL <$> count 1 argument <* string "TL"
|
2019-09-28 09:25:59 +02:00
|
|
|
textOperator Tf = (,) Tf <$> sequence [nameArg, argument] <* string "Tf"
|
2019-09-25 18:42:34 +02:00
|
|
|
textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr"
|
|
|
|
textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
|
2019-09-23 23:19:27 +02:00
|
|
|
|
2019-09-27 18:38:03 +02:00
|
|
|
a :: (Bounded o, Enum o, MonadParser m) => (o -> m (Call o)) -> m (Call o)
|
2019-09-25 18:42:34 +02:00
|
|
|
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
|
|
|
|
|
2019-09-27 12:21:06 +02:00
|
|
|
argument :: MonadParser m => m Argument
|
2019-09-25 18:42:34 +02:00
|
|
|
argument = Raw <$> takeAll regular <* blank
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-28 09:25:59 +02:00
|
|
|
arrayArg :: MonadParser m => m Argument
|
|
|
|
arrayArg = Typed . Array <$> array <* blank
|
|
|
|
|
|
|
|
nameArg :: MonadParser m => m Argument
|
|
|
|
nameArg = Typed . NameObject <$> name <* blank
|
|
|
|
|
|
|
|
stringArg :: MonadParser m => m Argument
|
|
|
|
stringArg = Typed . StringObject <$> stringObject <* blank
|
|
|
|
|
2019-09-23 18:00:47 +02:00
|
|
|
data PageContents = PageContents {
|
|
|
|
chunks :: [ByteString]
|
|
|
|
}
|
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
type ParserWithFont = ReaderT CMappers (Parser CMap)
|
|
|
|
|
|
|
|
pageContents :: CMappers -> ByteString -> Either String PageContents
|
|
|
|
pageContents font input =
|
|
|
|
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
|
|
|
|
|
|
|
|
page :: ParserWithFont [ByteString]
|
2019-09-28 09:25:59 +02:00
|
|
|
page = graphicState <|> text <?> "Text page contents"
|
2019-09-24 18:38:12 +02:00
|
|
|
|
|
|
|
graphicState :: ParserWithFont [ByteString]
|
2019-09-23 23:19:27 +02:00
|
|
|
graphicState =
|
2019-09-28 09:25:59 +02:00
|
|
|
(string "q" *> blank *> insideQ <* blank <* string "Q") <?> "Graphic state"
|
2019-09-23 23:19:27 +02:00
|
|
|
where
|
2019-09-28 09:25:59 +02:00
|
|
|
insideQ = concat <$> ((command <|> page) `sepBy` blank )
|
2019-09-25 18:42:34 +02:00
|
|
|
command = a stateOperator *> return []
|
2019-09-23 23:19:27 +02:00
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
text :: ParserWithFont [ByteString]
|
2019-09-25 18:42:34 +02:00
|
|
|
text =
|
2019-09-28 09:25:59 +02:00
|
|
|
string "BT" *> blank *> commands <* blank <* string "ET" <?> "Text operators"
|
2019-09-25 18:42:34 +02:00
|
|
|
where
|
2019-09-28 09:25:59 +02:00
|
|
|
commands = concat <$> (a textOperator >>= runOperator) `sepBy` blank
|
2019-09-25 18:42:34 +02:00
|
|
|
|
|
|
|
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
|
|
|
|
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
|
|
|
asks (! fontName) >>= put >> return []
|
|
|
|
|
|
|
|
runOperator (Tstar, []) = return ["\n"]
|
|
|
|
|
|
|
|
runOperator (TJ, [Typed (Array arrayObject)]) =
|
|
|
|
replicate 1 <$> foldM appendText "" arrayObject
|
|
|
|
where
|
|
|
|
appendText bs (StringObject outputString) =
|
|
|
|
mappend bs <$> decodeString outputString
|
|
|
|
appendText bs _ = return bs
|
2019-09-30 14:13:12 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
runOperator (Tj, [Typed (StringObject outputString)]) =
|
|
|
|
replicate 1 <$> decodeString outputString
|
|
|
|
|
|
|
|
runOperator (Quote, [Typed (StringObject outputString)]) =
|
|
|
|
(\bs -> ["\n", bs]) <$> decodeString outputString
|
|
|
|
|
|
|
|
runOperator (DQuote, [Typed (StringObject outputString)]) =
|
|
|
|
(\bs -> ["\n", bs]) <$> decodeString outputString
|
2019-09-24 18:38:12 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
runOperator _ = return []
|
2019-09-23 23:19:27 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
decodeString :: StringObject -> ParserWithFont ByteString
|
2019-09-30 14:13:12 +02:00
|
|
|
decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h))
|
2019-10-03 07:59:09 +02:00
|
|
|
decodeString (Literal litString) = do
|
|
|
|
cRangesBySize <- Map.toList <$> get
|
|
|
|
f cRangesBySize litString
|
|
|
|
where
|
|
|
|
f :: [(Int, [CRange])] -> ByteString -> ParserWithFont ByteString
|
|
|
|
f cRangesBySize input
|
|
|
|
| BS.null input = return ""
|
|
|
|
| otherwise = do
|
|
|
|
(output, newInput) <- g cRangesBySize input
|
|
|
|
mappend output <$> f cRangesBySize newInput
|
|
|
|
g :: [(Int, [CRange])] -> ByteString -> ParserWithFont (ByteString, ByteString)
|
|
|
|
g [] _ = fail "No matching code found in font"
|
|
|
|
g ((size, cRanges):others) s =
|
|
|
|
let prefix = BS.take size s in
|
|
|
|
case h prefix cRanges of
|
|
|
|
Nothing -> g others s
|
|
|
|
Just outputSequence -> return (outputSequence, BS.drop size s)
|
|
|
|
h :: ByteString -> [CRange] -> Maybe ByteString
|
|
|
|
h prefix [] = Nothing
|
|
|
|
h prefix ((CRange {mapping}):cRanges) =
|
|
|
|
case Map.lookup prefix mapping of
|
|
|
|
Nothing -> h prefix cRanges
|
|
|
|
outputSequence -> outputSequence
|
|
|
|
|
|
|
|
{-
|
2019-09-30 14:13:12 +02:00
|
|
|
get >>= convertBytes litString
|
2019-09-26 15:51:41 +02:00
|
|
|
where
|
2019-09-27 12:21:06 +02:00
|
|
|
convertBytes :: String -> CMap -> ParserWithFont ByteString
|
|
|
|
convertBytes [] _ = return ""
|
|
|
|
convertBytes (c:cs) someCMap = do
|
|
|
|
convertBytesAux (fromEnum c) 1 cs someCMap
|
|
|
|
convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString
|
|
|
|
convertBytesAux code size s someCMap
|
|
|
|
| size > 4 = fail "Could not match any input code smaller than an int"
|
|
|
|
| otherwise =
|
|
|
|
case (Map.lookup code someCMap, s) of
|
|
|
|
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
|
|
|
|
(Nothing, []) -> fail "No character left to read but no code recognized"
|
2019-09-27 18:16:12 +02:00
|
|
|
(Just outputText, _) -> mappend outputText <$> convertBytes s someCMap
|
2019-10-03 07:59:09 +02:00
|
|
|
-}
|