{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} import Codec.Compression.Zlib (decompress) import Control.Monad (foldM) import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify) import Control.Monad.Writer (tell) import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, insert, lookup, toList) import PDF (Document(..), parseDocument) import PDF.CMap (CMap, CMappers, cMap, emptyCMap) import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Object(..), Name(..), Structure(..) ,) import PDF.Output (ObjectId(..)) import PDF.Text (PageContents(..), pageContents) import PDF.Update (unify) import System.Environment (getArgs) import System.IO (hPutStrLn, stderr) import Text.Printf (printf) type CachedCMaps = Map ObjectId CMap type T a = RWST Content [ByteString] CachedCMaps [] a list :: [a] -> T a list l = RWST (\_ s -> fillContext s <$> l) where fillContext s a = (a, s, []) handleError :: ObjectId -> a -> String -> T a handleError objectId defaultValue s = (tell . replicate 1 $ BS.pack message) >> return defaultValue where message = printf "Object #%d : %s" (getObjectId objectId) s extractText :: Object -> T () extractText object = do pageDict <- dict object cMappers <- loadCMappers =<< getFont pageDict contentsId <- target =<< key "Contents" pageDict contents <- stream =<< getObject contentsId either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents) stream :: Object -> T ByteString stream (Stream {header, streamContent}) = return $ case Map.lookup (Name "Filter") header of Just (NameObject (Name "FlateDecode")) -> Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent _ -> streamContent stream _ = list [] getFont :: Dictionary -> T Dictionary getFont pageDict = key "Resources" pageDict >>= dict . Direct >>= key "Font" >>= follow >>= dict cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap cache loader objectId = do loaded <- get case Map.lookup objectId loaded of Just value -> return value Nothing -> do value <- loader objectId modify (Map.insert objectId value) >> return value loadFont :: ObjectId -> T CMap loadFont objectId = getObject objectId >>= dict >>= key "ToUnicode" >>= follow >>= stream >>= either (handleError objectId emptyCMap) return . cMap loadCMappers :: Dictionary -> T CMappers loadCMappers = foldM loadCMapper Map.empty . Map.toList where loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do flip (Map.insert name) output <$> cache loadFont objectId loadCMapper output _ = return output getObject :: ObjectId -> T Object getObject objectId = do content <- ask return (objects content ! objectId) key :: String -> Dictionary -> T DirectObject key keyName dictionary = case Map.lookup (Name keyName) dictionary of Just obj -> return obj _ -> list [] target :: DirectObject -> T ObjectId target (Reference (IndirectObjCoordinates {objectId})) = return objectId target _ = list [] follow :: DirectObject -> T Object follow directObject = target directObject >>= getObject dict :: Object -> T Dictionary dict (Direct (Dictionary dictionary)) = return dictionary dict _ = list [] pagesList :: T ObjectId pagesList = do root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask pages <- dict =<< follow =<< key "Pages" root case Map.lookup (Name "Kids") pages of Just (Array kids) -> list $ filterObjectIds kids _ -> list [] filterObjectIds :: [DirectObject] -> [ObjectId] filterObjectIds objects = do object <- objects case object of Reference (IndirectObjCoordinates {objectId}) -> [objectId] _ -> [] listTextObjects :: Document -> [ByteString] listTextObjects (Document {updates}) = snd =<< evalRWST rwsMain (unify updates) Map.empty where rwsMain = pagesList >>= getObject >>= extractText main :: IO () main = do [inputFile] <- getArgs result <- parseDocument <$> BS.readFile inputFile case result of Left parseError -> hPutStrLn stderr $ show parseError Right doc -> mapM_ BS.putStrLn $ listTextObjects doc