{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} import Codec.Compression.Zlib (decompress) import Control.Monad (foldM) import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify) import Control.Monad.Writer (tell) import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, insert, lookup, toList) import PDF (Document(..), parseDocument) import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Object(..), Name(..), Structure(..) ,) import PDF.Output (ObjectId) import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents) import PDF.Update (unify) import System.Environment (getArgs) import System.IO (hPutStrLn, stderr) type CachedCMaps = Map ObjectId CMap type T a = RWST Content [ByteString] CachedCMaps [] a list :: [a] -> T a list l = RWST (\_ s -> fillContext s <$> l) where fillContext s a = (a, s, []) handleError :: a -> String -> T a handleError defaultValue s = (tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue extractText :: Object -> T () extractText object = do pageDict <- dict object cMappers <- loadCMappers =<< getFont pageDict contents <- stream =<< follow =<< key "Contents" pageDict either (handleError ()) (tell . chunks) (pageContents cMappers contents) stream :: Object -> T ByteString stream (Stream {header, streamContent}) = return $ case Map.lookup (Name "Filter") header of Just (NameObject (Name "FlateDecode")) -> Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent _ -> streamContent stream _ = list [] getFont :: Dictionary -> T Dictionary getFont pageDict = key "Resources" pageDict >>= dict . Direct >>= key "Font" >>= follow >>= dict cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap cache loader objectId = do loaded <- get case Map.lookup objectId loaded of Just value -> return value Nothing -> do value <- loader objectId modify (Map.insert objectId value) >> return value loadFont :: ObjectId -> T CMap loadFont objectId = getObject objectId >>= dict >>= key "ToUnicode" >>= follow >>= stream >>= either (handleError emptyCMap) return . cMap loadCMappers :: Dictionary -> T CMappers loadCMappers = foldM loadCMapper Map.empty . Map.toList where loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do flip (Map.insert name) output <$> cache loadFont objectId loadCMapper output _ = return output getObject :: ObjectId -> T Object getObject objectId = do content <- ask return (objects content ! objectId) key :: String -> Dictionary -> T DirectObject key keyName dictionary = case Map.lookup (Name keyName) dictionary of Just obj -> return obj _ -> list [] follow :: DirectObject -> T Object follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId follow _ = list [] dict :: Object -> T Dictionary dict (Direct (Dictionary dictionary)) = return dictionary dict _ = list [] pagesList :: T ObjectId pagesList = do root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask pages <- dict =<< follow =<< key "Pages" root case Map.lookup (Name "Kids") pages of Just (Array kids) -> list $ filterObjectIds kids _ -> list [] filterObjectIds :: [DirectObject] -> [ObjectId] filterObjectIds objects = do object <- objects case object of Reference (IndirectObjCoordinates {objectId}) -> [objectId] _ -> [] listTextObjects :: Document -> [ByteString] listTextObjects (Document {updates}) = snd =<< evalRWST rwsMain (unify updates) Map.empty where rwsMain = --Lazy.pack . show <$> (getObject =<< pagesList) pagesList >>= getObject >>= extractText main :: IO () main = do [inputFile] <- getArgs result <- parseDocument <$> BS.readFile inputFile case result of Left parseError -> hPutStrLn stderr $ show parseError Right doc -> mapM_ BS.putStrLn $ listTextObjects doc