{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} module PDF.Pages ( Page(..) , get , getAll ) where import Control.Applicative (Alternative(..), (<|>)) import Control.Monad (MonadPlus(..), foldM) import Control.Monad.Fail (MonadFail(..)) import Control.Monad.RWS (RWST(..), evalRWST, modify) import qualified Control.Monad.RWS as RWS (get) import Data.Map (Map) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import Data.Text (Text) import PDF.CMap (cMap) import qualified PDF.Content as Content (parse) import PDF.Content.Text (renderText) import PDF.Encoding (encoding) import PDF.Font (Font, FontSet) import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Name(..) ,) import PDF.Object.Navigation ( (//), dictionaryById, getDictionary, getField, follow, openStream, origin ) import PDF.Output (ObjectId(..)) import Prelude hiding (fail) import Text.Printf (printf) type CachedFonts = Map ObjectId Font newtype Error a = Error { runError :: Either String a } deriving (Alternative, Functor, Applicative, Monad, MonadPlus) type T = RWST Content () CachedFonts Error data Page = Page { contents :: [Text] , source :: ObjectId } instance MonadFail Error where fail = Error . Left getFontDictionary :: Dictionary -> T Dictionary getFontDictionary pageDict = ((pageDict // ["Resources", "Font"]) >>= getDictionary) <|> return Map.empty cache :: (ObjectId -> T Font) -> ObjectId -> T Font cache loader objectId = (maybe load return . Map.lookup objectId) =<< RWS.get where load = do value <- loader objectId modify $ Map.insert objectId value return value loadFont :: ObjectId -> T Font loadFont objectId = dictionaryById objectId >>= tryMappings where tryMappings dictionary = loadCMap dictionary <|> (getField "Encoding" dictionary >>= loadEncoding) <|> (fail $ unknownFormat (show objectId) (show dictionary)) unknownFormat = printf "Unknown font format for object #%s : %s" loadCMap :: Dictionary -> T Font loadCMap dictionary = getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap loadEncoding :: DirectObject -> T Font loadEncoding (NameObject (Name name)) = encoding name loadEncoding directObject = fail $ printf "Encoding must be a name, not that : %s" $ show directObject loadFonts :: Dictionary -> T FontSet loadFonts = foldM addFont Map.empty . Map.toList where addFont :: FontSet -> (Name, DirectObject) -> T FontSet addFont output (name, Reference (IndirectObjCoordinates {objectId})) = flip (Map.insert name) output <$> cache loadFont objectId addFont output _ = return output several :: DirectObject -> [DirectObject] several (Array l) = l several directObject = [directObject] pagesList :: T [ObjectId] pagesList = do pages <- (origin :: T Dictionary) >>= (// ["Root", "Pages"]) >>= getDictionary case Map.lookup (Name "Kids") pages of Just (Array kids) -> return $ getReferences kids _ -> return [] getReferences :: [DirectObject] -> [ObjectId] getReferences objects = do object <- objects case object of Reference (IndirectObjCoordinates {objectId}) -> [objectId] _ -> [] extractText :: Dictionary -> T [Text] extractText pageDict = do fonts <- loadFonts =<< getFontDictionary pageDict objects <- several <$> getField "Contents" pageDict concat <$> mapM (loadContent fonts) objects where loadContent :: FontSet -> DirectObject -> T [Text] loadContent fonts directObject = follow directObject >>= openStream >>= (either fail return . Content.parse) >>= renderText fonts loadPage :: ObjectId -> T Page loadPage source = do contents <- extractText =<< dictionaryById source return $ Page {contents, source} getAll :: Content -> Either String (Map Int Page) getAll content = runError $ fst <$> evalRWST getPages content Map.empty where numbered = Map.fromList . zip [1..] getPages = numbered <$> (mapM loadPage =<< pagesList) get :: Content -> Int -> Either String Page get content pageNumber | pageNumber < 1 = Left "Pages start at 1" | otherwise = runError $ fst <$> evalRWST getPage content Map.empty where firstPage [] = fail "Page is out of bounds" firstPage (p:_) = loadPage p getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage