{-# LANGUAGE NamedFieldPuns #-} module PDF.Pages ( Page(..) , get , getAll ) where import Codec.Compression.Zlib (decompress) import Control.Applicative ((<|>)) import Control.Monad (foldM) import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify) import qualified Control.Monad.RWS as RWS (get) import Data.ByteString (ByteString) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import Data.Map (Map, (!)) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import Data.Text (Text) import PDF.CMap (cMap) import PDF.Encoding (encoding) import PDF.Font (Font, FontSet) import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Object(..), Name(..), Structure(..) ,) import PDF.Output (ObjectId(..)) import PDF.Text (pageContents) import Text.Printf (printf) type CachedFonts = Map ObjectId Font type T = RWST Content () CachedFonts (Either String) data Page = Page { contents :: [Text] , source :: ObjectId } infixl 1 \\= (\\=) :: T a -> (a -> Either String b) -> T b x \\= f = x >>= lift . f infixl 1 //= (//=) :: Either String a -> (a -> T b) -> T b x //= f = lift x >>= f expected :: Show a => String -> a -> Either String b expected name = Left . printf "Not a %s: %s" name . show stream :: Object -> Either String ByteString stream (Stream {header, streamContent}) = Right $ case Map.lookup (Name "Filter") header of Just (NameObject (Name "FlateDecode")) -> Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent _ -> streamContent stream obj = expected "stream" obj getResource :: DirectObject -> T Dictionary getResource (Dictionary dictionary) = return dictionary getResource (Reference (IndirectObjCoordinates {objectId})) = getObject objectId \\= dict getResource directObject = lift $ expected "resource (dictionary or reference)" directObject getFontDictionary :: Dictionary -> T Dictionary getFontDictionary pageDict = key "Resources" pageDict //= getResource \\= key "Font" >>= getResource cache :: (ObjectId -> T Font) -> ObjectId -> T Font cache loader objectId = (maybe load return . Map.lookup objectId) =<< RWS.get where load = do value <- loader objectId modify $ Map.insert objectId value return value loadFont :: ObjectId -> T Font loadFont objectId = getObject objectId \\= dict >>= tryMappings where tryMappings dictionary = loadCMap dictionary <|> lift (key "Encoding" dictionary >>= loadEncoding) <|> lift (Left $ unknownFormat (show objectId) (show dictionary)) unknownFormat = printf "Unknown font format for object #%s : %s" loadCMap dictionary = key "ToUnicode" dictionary //= follow \\= stream \\= cMap loadEncoding (NameObject (Name name)) = encoding name loadEncoding directObject = Left . printf "Encoding must be a name, not that : %s" $ show directObject loadFonts :: Dictionary -> T FontSet loadFonts = foldM addFont Map.empty . Map.toList where addFont :: FontSet -> (Name, DirectObject) -> T FontSet addFont output (name, Reference (IndirectObjCoordinates {objectId})) = flip (Map.insert name) output <$> cache loadFont objectId addFont output _ = return output getObject :: ObjectId -> T Object getObject objectId = do content <- ask return (objects content ! objectId) key :: String -> Dictionary -> Either String DirectObject key keyName dictionary = maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary) where errorMessage = printf "Key %s not found in dictionary %s" keyName (show dictionary) target :: DirectObject -> Either String ObjectId target (Reference (IndirectObjCoordinates {objectId})) = Right objectId target directObject = expected "reference" directObject many :: DirectObject -> [DirectObject] many (Array l) = l many directObject = [directObject] follow :: DirectObject -> T Object follow directObject = target directObject //= getObject dict :: Object -> Either String Dictionary dict (Direct (Dictionary dictionary)) = Right dictionary dict obj = expected "dictionary" obj dictObject :: String -> Dictionary -> T Dictionary dictObject keyName dictionary = key keyName dictionary //= follow \\= dict pagesList :: T [ObjectId] pagesList = do root <- dictObject "Root" . trailer . docStructure =<< ask pages <- dictObject "Pages" root case Map.lookup (Name "Kids") pages of Just (Array kids) -> return $ getReferences kids _ -> return [] getReferences :: [DirectObject] -> [ObjectId] getReferences objects = do object <- objects case object of Reference (IndirectObjCoordinates {objectId}) -> [objectId] _ -> [] extractText :: Object -> T [Text] extractText object = do pageDict <- lift $ dict object fonts <- loadFonts =<< getFontDictionary pageDict let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject]) concat <$> (objects //= (mapM $ loadContent fonts)) where loadContent :: FontSet -> DirectObject -> T [Text] loadContent fonts directObject = follow directObject \\= stream \\= pageContents fonts loadPage :: ObjectId -> T Page loadPage source = (\contents -> Page {contents, source}) <$> (extractText =<< getObject source) getAll :: Content -> Either String (Map Int Page) getAll content = fst <$> evalRWST getPages content Map.empty where numbered = Map.fromList . zip [1..] getPages = numbered <$> (mapM loadPage =<< pagesList) get :: Content -> Int -> Either String Page get content pageNumber | pageNumber < 1 = Left "Pages start at 1" | otherwise = fst <$> evalRWST getPage content Map.empty where firstPage [] = lift $ Left "Page is out of bounds" firstPage (p:_) = loadPage p getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage