{-# LANGUAGE NamedFieldPuns #-} module PDF.Pages ( Page(..) , get , getAll ) where import Codec.Compression.Zlib (decompress) import Control.Monad (foldM) import Control.Monad.RWS (RWST(..), ask, evalRWST, mapRWST, modify) import qualified Control.Monad.RWS as RWS (get) import Data.ByteString (ByteString) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import PDF.CMap (CMap, CMappers, cMap) import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Object(..), Name(..), Structure(..) ,) import PDF.Output (ObjectId(..)) import PDF.Text (pageContents) import Prelude hiding (fail) import Text.Printf (printf) type CachedCMaps = Map ObjectId CMap type T = RWST Content () CachedCMaps (Either String) data Page = Page { contents :: [ByteString] , source :: ObjectId } infixl 1 \\= (\\=) :: T a -> (a -> Either String b) -> T b x \\= f = mapRWST ((\(a, s, w) -> (\b -> (b, s, w)) <$> f a) =<<) x infixl 1 //= (//=) :: Either String a -> (a -> T b) -> T b (//=) (Left e) _ = RWST (\_ _ -> Left e) (//=) (Right a) f = f a lift :: Either String a -> T a lift x = x //= return expected :: Show a => String -> a -> Either String b expected name = Left . printf "Not a %s: %s" name . show stream :: Object -> Either String ByteString stream (Stream {header, streamContent}) = Right $ case Map.lookup (Name "Filter") header of Just (NameObject (Name "FlateDecode")) -> Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent _ -> streamContent stream obj = expected "stream" obj getResource :: DirectObject -> T Dictionary getResource (Dictionary dictionary) = return dictionary getResource (Reference (IndirectObjCoordinates {objectId})) = getObject objectId \\= dict getResource directObject = lift $ expected "resource (dictionary or reference)" directObject getFont :: Dictionary -> T Dictionary getFont pageDict = key "Resources" pageDict //= getResource \\= key "Font" >>= getResource cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap cache loader objectId = (maybe load return . Map.lookup objectId) =<< RWS.get where load = do value <- loader objectId modify $ Map.insert objectId value return value loadFont :: ObjectId -> T CMap loadFont objectId = (getObject objectId \\= dict \\= key "ToUnicode" >>= follow) \\= stream \\= cMap loadCMappers :: Dictionary -> T CMappers loadCMappers = foldM loadCMapper Map.empty . Map.toList where loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = flip (Map.insert name) output <$> cache loadFont objectId --maybe output (flip (Map.insert name) output) <$> cache loadFont objectId loadCMapper output _ = return output getObject :: ObjectId -> T Object getObject objectId = do content <- ask return (objects content ! objectId) key :: String -> Dictionary -> Either String DirectObject key keyName dictionary = maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary) where errorMessage = printf "Key %s not found in dictionary %s" keyName (show dictionary) target :: DirectObject -> Either String ObjectId target (Reference (IndirectObjCoordinates {objectId})) = Right objectId target directObject = expected "reference" directObject many :: DirectObject -> [DirectObject] many (Array l) = l many directObject = [directObject] follow :: DirectObject -> T Object follow directObject = target directObject //= getObject dict :: Object -> Either String Dictionary dict (Direct (Dictionary dictionary)) = Right dictionary dict obj = expected "dictionary" obj dictObject :: String -> Dictionary -> T Dictionary dictObject keyName dictionary = key keyName dictionary //= follow \\= dict pagesList :: T [ObjectId] pagesList = do root <- dictObject "Root" . trailer . docStructure =<< ask pages <- dictObject "Pages" root case Map.lookup (Name "Kids") pages of Just (Array kids) -> return $ getReferences kids _ -> return [] getReferences :: [DirectObject] -> [ObjectId] getReferences objects = do object <- objects case object of Reference (IndirectObjCoordinates {objectId}) -> [objectId] _ -> [] extractText :: Object -> T [ByteString] extractText object = do pageDict <- lift $ dict object cMappers <- loadCMappers =<< getFont pageDict let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject]) concat <$> (objects //= (mapM $ loadContent cMappers)) where loadContent :: CMappers -> DirectObject -> T [ByteString] loadContent cMappers directObject = follow directObject \\= stream \\= pageContents cMappers loadPage :: ObjectId -> T Page loadPage source = (\contents -> Page {contents, source}) <$> (extractText =<< getObject source) getAll :: Content -> Either String (Map Int Page) getAll content = fst <$> evalRWST getPages content Map.empty where numbered = Map.fromList . zip [1..] getPages = numbered <$> (mapM loadPage =<< pagesList) get :: Content -> Int -> Either String Page get content pageNumber | pageNumber < 1 = Left "Pages start at 1" | otherwise = fst <$> evalRWST getPage content Map.empty where firstPage [] = lift $ Left "Page is out of bounds" firstPage (p:_) = loadPage p getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage