{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} module PDF.Text {-( PageContents(..) , pageContents )-} where import Control.Applicative ((<|>)) import Control.Monad (foldM) import Control.Monad.Reader (ReaderT, runReaderT, asks) import Control.Monad.State (get, put) import Data.Attoparsec.ByteString.Char8 (choice, count, sepBy) import Data.ByteString (ByteString) import qualified Data.ByteString as BS (drop, null, take) import Data.ByteString.Char8.Util (decodeHex) import Data.List (find) import Data.Map ((!)) import qualified Data.Map as Map (lookup, toList) import PDF.CMap (CMappers, CMap, CRange(..), emptyCMap) import PDF.Object ( DirectObject(..), StringObject(..) , array, blank, name, parseBytes, regular, stringObject ) import PDF.Parser (MonadParser, (), Parser, evalParser, string, takeAll) data StateOperator = Cm | W | J | J_ | M | D | Ri | I | Gs -- general graphic state deriving (Bounded, Enum, Show) data TextOperator = Td | TD | Tm | Tstar -- text positioning | TJ | Tj | Quote | DQuote -- text showing | Tc | Tw | Tz | TL | Tf | Tr | Ts -- text state deriving (Bounded, Enum, Show) data Argument = Raw ByteString | Typed DirectObject deriving Show type Call a = (a, [Argument]) stateOperator :: MonadParser m => StateOperator -> m (Call StateOperator) stateOperator Cm = (,) Cm <$> count 6 argument <* string "cm" stateOperator W = (,) W <$> count 1 argument <* string "w" stateOperator J = (,) J <$> count 1 argument <* string "J" stateOperator J_ = (,) J_ <$> count 1 argument <* string "j" stateOperator M = (,) M <$> count 1 argument <* string "M" stateOperator D = (,) D <$> count 2 argument <* string "d" stateOperator Ri = (,) Ri <$> count 1 argument <* string "ri" stateOperator I = (,) I <$> count 1 argument <* string "i" stateOperator Gs = (,) Gs <$> count 1 nameArg <* string "gs" textOperator :: MonadParser m => TextOperator -> m (Call TextOperator) textOperator Td = (,) Td <$> count 2 argument <* string "Td" textOperator TD = (,) TD <$> count 2 argument <* string "TD" textOperator Tm = (,) Tm <$> count 6 argument <* string "Tm" textOperator Tstar = (,) Td <$> return [] <* string "T*" textOperator TJ = (,) TJ <$> count 1 arrayArg <* string "TJ" textOperator Tj = (,) Tj <$> count 1 stringArg <* string "Tj" textOperator Quote = (,) Quote <$> count 1 stringArg <* string "'" textOperator DQuote = (,) DQuote <$> count 1 stringArg <* string "\"" textOperator Tc = (,) Tc <$> count 1 argument <* string "Tc" textOperator Tw = (,) Tw <$> count 1 argument <* string "Tw" textOperator Tz = (,) Tz <$> count 1 argument <* string "Tz" textOperator TL = (,) TL <$> count 1 argument <* string "TL" textOperator Tf = (,) Tf <$> sequence [nameArg, argument] <* string "Tf" textOperator Tr = (,) Tr <$> count 1 argument <* string "Tr" textOperator Ts = (,) Ts <$> count 1 argument <* string "Ts" a :: (Bounded o, Enum o, MonadParser m) => (o -> m (Call o)) -> m (Call o) a parserGenerator = choice $ parserGenerator <$> [minBound .. maxBound] argument :: MonadParser m => m Argument argument = Raw <$> takeAll regular <* blank arrayArg :: MonadParser m => m Argument arrayArg = Typed . Array <$> array <* blank nameArg :: MonadParser m => m Argument nameArg = Typed . NameObject <$> name <* blank stringArg :: MonadParser m => m Argument stringArg = Typed . StringObject <$> stringObject <* blank data PageContents = PageContents { chunks :: [ByteString] } type ParserWithFont = ReaderT CMappers (Parser CMap) pageContents :: CMappers -> ByteString -> Either String PageContents pageContents font input = evalParser (runReaderT (PageContents <$> page) font) emptyCMap input page :: ParserWithFont [ByteString] page = graphicState <|> text "Text page contents" graphicState :: ParserWithFont [ByteString] graphicState = (string "q" *> blank *> insideQ <* blank <* string "Q") "Graphic state" where insideQ = concat <$> ((command <|> page) `sepBy` blank ) command = a stateOperator *> return [] text :: ParserWithFont [ByteString] text = string "BT" *> blank *> commands <* blank <* string "ET" "Text operators" where commands = concat <$> (a textOperator >>= runOperator) `sepBy` blank runOperator :: Call TextOperator -> ParserWithFont [ByteString] runOperator (Tf, [Typed (NameObject fontName), _]) = asks (! fontName) >>= put >> return [] runOperator (Tstar, []) = return ["\n"] runOperator (TJ, [Typed (Array arrayObject)]) = replicate 1 <$> foldM appendText "" arrayObject where appendText bs (StringObject outputString) = mappend bs <$> decodeString outputString appendText bs _ = return bs runOperator (Tj, [Typed (StringObject outputString)]) = replicate 1 <$> decodeString outputString runOperator (Quote, [Typed (StringObject outputString)]) = (\bs -> ["\n", bs]) <$> decodeString outputString runOperator (DQuote, [Typed (StringObject outputString)]) = (\bs -> ["\n", bs]) <$> decodeString outputString runOperator _ = return [] decodeString :: StringObject -> ParserWithFont ByteString decodeString (Hexadecimal h) = decodeString (Literal (decodeHex h)) decodeString (Literal litString) = do cRangesBySize <- Map.toList <$> get f cRangesBySize litString where f :: [(Int, [CRange])] -> ByteString -> ParserWithFont ByteString f cRangesBySize input | BS.null input = return "" | otherwise = do (output, newInput) <- g cRangesBySize input mappend output <$> f cRangesBySize newInput g :: [(Int, [CRange])] -> ByteString -> ParserWithFont (ByteString, ByteString) g [] _ = fail "No matching code found in font" g ((size, cRanges):others) s = let prefix = BS.take size s in case h prefix cRanges of Nothing -> g others s Just outputSequence -> return (outputSequence, BS.drop size s) h :: ByteString -> [CRange] -> Maybe ByteString h prefix [] = Nothing h prefix ((CRange {mapping}):cRanges) = case Map.lookup prefix mapping of Nothing -> h prefix cRanges outputSequence -> outputSequence {- get >>= convertBytes litString where convertBytes :: String -> CMap -> ParserWithFont ByteString convertBytes [] _ = return "" convertBytes (c:cs) someCMap = do convertBytesAux (fromEnum c) 1 cs someCMap convertBytesAux :: Int -> Int -> String -> CMap -> ParserWithFont ByteString convertBytesAux code size s someCMap | size > 4 = fail "Could not match any input code smaller than an int" | otherwise = case (Map.lookup code someCMap, s) of (Nothing, (c:cs)) -> convertBytesAux (code * 256 + fromEnum c) (size + 1) cs someCMap (Nothing, []) -> fail "No character left to read but no code recognized" (Just outputText, _) -> mappend outputText <$> convertBytes s someCMap -}