2019-09-23 23:19:27 +02:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
2019-09-27 18:38:03 +02:00
|
|
|
module PDF.Text {-(
|
2019-09-27 18:16:12 +02:00
|
|
|
PageContents(..)
|
2019-09-23 18:00:47 +02:00
|
|
|
, pageContents
|
2019-09-27 18:38:03 +02:00
|
|
|
)-} where
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-27 18:16:12 +02:00
|
|
|
import Control.Applicative ((<|>))
|
2019-09-25 18:42:34 +02:00
|
|
|
import Control.Monad (foldM, join)
|
|
|
|
import Control.Monad.Reader (ReaderT, runReaderT, asks)
|
2019-09-27 12:21:06 +02:00
|
|
|
import Control.Monad.State (get, put)
|
2019-09-27 18:16:12 +02:00
|
|
|
import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy)
|
|
|
|
import Data.ByteString (ByteString)
|
|
|
|
import qualified Data.ByteString.Char8 as Char8 (unpack)
|
|
|
|
import Data.Map ((!))
|
|
|
|
import qualified Data.Map as Map (lookup)
|
|
|
|
import PDF.CMap (CMappers, CMap, emptyCMap)
|
2019-09-25 18:42:34 +02:00
|
|
|
import PDF.Object (
|
2019-09-27 18:16:12 +02:00
|
|
|
DirectObject(..), StringObject(..)
|
|
|
|
, array, blank, name, parseBytes, regular, stringObject
|
2019-09-25 18:42:34 +02:00
|
|
|
)
|
2019-09-27 12:21:06 +02:00
|
|
|
import PDF.Parser (MonadParser, Parser, evalParser, string, takeAll)
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
data StateOperator =
|
|
|
|
Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state
|
|
|
|
deriving (Bounded, Enum)
|
|
|
|
data TextOperator =
|
|
|
|
Td | TD | Tm | Tstar -- text positioning
|
|
|
|
| TJ | Tj | Quote | DQuote -- text showing
|
|
|
|
| Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state
|
|
|
|
deriving (Bounded, Enum)
|
|
|
|
data Argument = Raw ByteString | Typed DirectObject
|
|
|
|
type Call a = (a, [Argument])
|
|
|
|
|
2019-09-27 12:21:06 +02:00
|
|
|
stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator)
|
2019-09-25 18:42:34 +02:00
|
|
|
stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm"
|
|
|
|
stateOperator W = (,) W <$> count 1 argument <* string "w"
|
|
|
|
stateOperator J = (,) J <$> count 1 argument <* string "J"
|
|
|
|
stateOperator J_ = (,) J_ <$> count 1 argument <* string "j"
|
|
|
|
stateOperator M = (,) M <$> count 1 argument <* string "M"
|
|
|
|
stateOperator D = (,) D <$> count 2 argument <* string "d"
|
|
|
|
stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri"
|
|
|
|
stateOperator I = (,) I <$> count 1 argument <* string "i"
|
|
|
|
stateOperator Gs = (,) Gs <$> count 1 argument <* string "gs"
|
|
|
|
|
2019-09-27 12:21:06 +02:00
|
|
|
textOperator :: MonadParser m => TextOperator -> m (Call TextOperator)
|
2019-09-25 18:42:34 +02:00
|
|
|
textOperator Td = (,) Td <$> count 2 argument <* string "Td"
|
|
|
|
textOperator TD = (,) TD <$> count 2 argument <* string "TD"
|
|
|
|
textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm"
|
|
|
|
textOperator Tstar = (,) Td <$> return [] <* string "T*"
|
|
|
|
textOperator TJ =
|
2019-09-27 12:21:06 +02:00
|
|
|
(,) TJ <$> sequence [Typed . Array <$> array] <* blank <* string "TJ"
|
2019-09-25 18:42:34 +02:00
|
|
|
textOperator Tj =
|
2019-09-27 12:21:06 +02:00
|
|
|
(,) Tj <$> sequence [Typed . StringObject <$> stringObject] <* blank <* string "Tj"
|
2019-09-25 18:42:34 +02:00
|
|
|
textOperator Quote = (,) Quote <$> count 1 argument <* string "'"
|
|
|
|
textOperator DQuote = (,) DQuote <$> count 1 argument <* string "\""
|
|
|
|
textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc"
|
|
|
|
textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw"
|
|
|
|
textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz"
|
|
|
|
textOperator TL = (,) TL <$> count 1 argument <* string "TL"
|
|
|
|
textOperator Tf = (,) Tf <$> sequence [Typed . NameObject <$> name, argument] <* string "Tf"
|
|
|
|
textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr"
|
|
|
|
textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts"
|
2019-09-23 23:19:27 +02:00
|
|
|
|
2019-09-27 18:38:03 +02:00
|
|
|
a :: (Bounded o, Enum o, MonadParser m) => (o -> m (Call o)) -> m (Call o)
|
2019-09-25 18:42:34 +02:00
|
|
|
a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound]
|
|
|
|
|
2019-09-27 12:21:06 +02:00
|
|
|
argument :: MonadParser m => m Argument
|
2019-09-25 18:42:34 +02:00
|
|
|
argument = Raw <$> takeAll regular <* blank
|
2019-09-23 18:00:47 +02:00
|
|
|
|
|
|
|
data PageContents = PageContents {
|
|
|
|
chunks :: [ByteString]
|
|
|
|
}
|
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
type ParserWithFont = ReaderT CMappers (Parser CMap)
|
|
|
|
|
|
|
|
pageContents :: CMappers -> ByteString -> Either String PageContents
|
|
|
|
pageContents font input =
|
|
|
|
evalParser (runReaderT (PageContents <$> page) font) emptyCMap input
|
|
|
|
|
|
|
|
page :: ParserWithFont [ByteString]
|
|
|
|
page = graphicState <|> text
|
|
|
|
|
|
|
|
graphicState :: ParserWithFont [ByteString]
|
2019-09-23 23:19:27 +02:00
|
|
|
graphicState =
|
2019-09-24 18:38:12 +02:00
|
|
|
string "q" *> blank *> insideQ <* string "Q"
|
2019-09-23 23:19:27 +02:00
|
|
|
where
|
2019-09-27 18:38:03 +02:00
|
|
|
insideQ = join <$> ((command <|> page) `sepBy` blank )
|
2019-09-25 18:42:34 +02:00
|
|
|
command = a stateOperator *> return []
|
2019-09-23 23:19:27 +02:00
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
text :: ParserWithFont [ByteString]
|
2019-09-25 18:42:34 +02:00
|
|
|
text =
|
|
|
|
string "BT" *> blank *> commands <* blank <* string "ET"
|
|
|
|
where
|
|
|
|
commands = join <$> (a textOperator >>= runOperator) `sepBy` blank
|
|
|
|
|
|
|
|
runOperator :: Call TextOperator -> ParserWithFont [ByteString]
|
|
|
|
runOperator (Tf, [Typed (NameObject fontName), _]) =
|
|
|
|
asks (! fontName) >>= put >> return []
|
|
|
|
|
|
|
|
runOperator (Tstar, []) = return ["\n"]
|
|
|
|
|
|
|
|
runOperator (TJ, [Typed (Array arrayObject)]) =
|
|
|
|
replicate 1 <$> foldM appendText "" arrayObject
|
|
|
|
where
|
|
|
|
appendText bs (StringObject outputString) =
|
|
|
|
mappend bs <$> decodeString outputString
|
|
|
|
appendText bs _ = return bs
|
|
|
|
|
|
|
|
runOperator (Tj, [Typed (StringObject outputString)]) =
|
|
|
|
replicate 1 <$> decodeString outputString
|
|
|
|
|
|
|
|
runOperator (Quote, [Typed (StringObject outputString)]) =
|
|
|
|
(\bs -> ["\n", bs]) <$> decodeString outputString
|
|
|
|
|
|
|
|
runOperator (DQuote, [Typed (StringObject outputString)]) =
|
|
|
|
(\bs -> ["\n", bs]) <$> decodeString outputString
|
2019-09-24 18:38:12 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
runOperator _ = return []
|
2019-09-23 23:19:27 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
decodeString :: StringObject -> ParserWithFont ByteString
|
2019-09-27 18:16:12 +02:00
|
|
|
decodeString (Hexadecimal h) = decodeString (Literal (Char8.unpack $ parseBytes h))
|
2019-09-27 12:21:06 +02:00
|
|
|
decodeString (Literal litString) = get >>= convertBytes litString
|
2019-09-26 15:51:41 +02:00
|
|
|
where
|
2019-09-27 12:21:06 +02:00
|
|
|
convertBytes :: String -> CMap -> ParserWithFont ByteString
|
|
|
|
convertBytes [] _ = return ""
|
|
|
|
convertBytes (c:cs) someCMap = do
|
|
|
|
convertBytesAux (fromEnum c) 1 cs someCMap
|
|
|
|
convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString
|
|
|
|
convertBytesAux code size s someCMap
|
|
|
|
| size > 4 = fail "Could not match any input code smaller than an int"
|
|
|
|
| otherwise =
|
|
|
|
case (Map.lookup code someCMap, s) of
|
|
|
|
(Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap
|
|
|
|
(Nothing, []) -> fail "No character left to read but no code recognized"
|
2019-09-27 18:16:12 +02:00
|
|
|
(Just outputText, _) -> mappend outputText <$> convertBytes s someCMap
|