{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} import Codec.Compression.Zlib (decompress) import Control.Monad.Reader (ask) import Control.Monad.RWS (RWST(..), evalRWST) import Control.Monad.Writer (tell) import Data.ByteString.Lazy.Char8 (ByteString) import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict) import Data.Map ((!)) import qualified Data.Map as Map (empty, lookup) import PDF (Document(..), parseDocument) import qualified PDF.EOL as EOL (Style) import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Object(..), Name(..), Structure(..) ,) import PDF.Output (ObjectId(..)) import qualified PDF.Output as Output (render) import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents) import PDF.Update (unify) import System.Environment (getArgs) import System.IO (hPutStrLn, stderr) --type T a = ReaderT Content [] a type T a = RWST Content [ByteString] CMappers [] a list :: [a] -> T a list l = RWST (\_ s -> fillContext s <$> l) where fillContext s a = (a, s, []) extractText :: Object -> T () extractText object = do pageDict <- dict object pageContents <- follow =<< get "Contents" pageDict case pageContents of (Stream {header, streamContent}) -> do font <- getFont pageDict storeDecodedText font . clear header $ Lazy.fromStrict streamContent _ -> return () clear :: Dictionary -> ByteString -> ByteString clear header streamContent = case Map.lookup (Name "Filter") header of Just (NameObject (Name "FlateDecode")) -> decompress streamContent _ -> streamContent storeDecodedText :: Dictionary -> ByteString -> T () storeDecodedText font page = tell . chunks =<< pageContents font page getFont :: Dictionary -> T Dictionary getFont pageDict = get "Resources" pageDict >>= dict . Direct >>= get "Font" >>= follow >>= dict getObject :: ObjectId -> T Object getObject objectId = do content <- ask return (objects content ! objectId) get :: String -> Dictionary -> T DirectObject get key dictionary = case Map.lookup (Name key) dictionary of Just obj -> return obj _ -> list [] follow :: DirectObject -> T Object follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId follow _ = list [] {- obj <- getObject objectId case obj of Direct directObj -> return directObj _ -> list [] -} {- key dictionary = case Map.lookup (Name key) dictionary of Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId _ -> list [] -} dict :: Object -> T Dictionary dict (Direct (Dictionary dictionary)) = return dictionary dict _ = list [] pagesList :: T ObjectId pagesList = do root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask pages <- dict =<< follow =<< get "Pages" root case Map.lookup (Name "Kids") pages of Just (Array kids) -> list $ filterObjectIds kids _ -> list [] filterObjectIds :: [DirectObject] -> [ObjectId] filterObjectIds objects = do object <- objects case object of Reference (IndirectObjCoordinates {objectId}) -> [objectId] _ -> [] listTextObjects :: Document -> [ByteString] listTextObjects (Document {updates}) = snd =<< evalRWST rwsMain (unify updates) Map.empty where rwsMain = --Lazy.pack . show <$> (getObject =<< pagesList) pagesList >>= getObject >>= extractText main :: IO () main = do [inputFile] <- getArgs result <- parseDocument <$> BS.readFile inputFile case result of Left parseError -> hPutStrLn stderr $ show parseError Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc