{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE UndecidableInstances #-} module PDF.Pages ( Page(..) , PageNumber(..) , Pages , cacheFonts --, get --, getAll ) where import Control.Applicative (Alternative(..)) import Control.Applicative ((<|>)) import Control.Monad (foldM) import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Reader (runReaderT) import Control.Monad.State (MonadState, StateT(..), evalStateT) import Control.Monad.RWS (RWST(..), evalRWST, modify) import qualified Control.Monad.RWS as RWS (get) import Data.Map (Map) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import Data.Text (Text) import PDF.Box (Box(..)) import PDF.CMap (cMap) import qualified PDF.Content as Content (parse) import PDF.Content.Text (renderText) import PDF.Encoding (encoding) import PDF.Font (Font, FontSet) import PDF.Layer (Layer(..)) import PDF.Object ( Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Name(..), Object(..) ,) import PDF.Object.Navigation ( Error(..), PDFContent, StreamContent(..), (./), (//), (>./), (>//) , castObject, getDictionary, objectById, origin ) import PDF.Output (ObjectId(..)) import Prelude hiding (fail) import Text.Printf (printf) type CachedFonts = Map ObjectId Font type T = RWST Layer () CachedFonts Error type PageContext = StateT CachedFonts type FontCache m = (MonadState CachedFonts m, PDFContent m) --type PDFContent m = (MonadState Layer m, MonadFail m, MonadState CachedFonts m) data Page = Page { contents :: [Text] , source :: ObjectId } getFontDictionary :: PDFContent m => Object -> m Dictionary getFontDictionary pageObj = (pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font cache loader objectId = (maybe load return . Map.lookup objectId) =<< RWS.get where load = do value <- loader objectId modify $ Map.insert objectId value return value loadFont :: FontCache m => ObjectId -> m Font loadFont objectId = objectById objectId >>= tryMappings where tryMappings object = (object >./ "ToUnicode" >>= r StreamContent >>= cMap) <|> (object >./ "Encoding" >>= loadEncoding) <|> (fail $ unknownFormat (show objectId) (show object)) unknownFormat = printf "Unknown font format for object #%s : %s" loadEncoding :: MonadFail m => Object -> m Font loadEncoding (Direct (NameObject (Name name))) = encoding name loadEncoding object = fail $ printf "Encoding must be a name, not that : %s" $ show object loadFonts :: FontCache m => Dictionary -> m FontSet loadFonts = foldM addFont Map.empty . Map.toList where addFont output (name, Reference (IndirectObjCoordinates {objectId})) = flip (Map.insert name) output <$> cache loadFont objectId addFont output _ = return output several :: PDFContent m => Object -> m [Object] several (Direct (Array l)) = mapM castObject l several object = return [object] pagesList :: PDFContent m => m [ObjectId] pagesList = do pages <- origin // ["Root", "Pages"] >>= getDictionary case Map.lookup (Name "Kids") pages of Just (Array kids) -> return $ getReferences kids _ -> return [] getReferences :: [DirectObject] -> [ObjectId] getReferences objects = do object <- objects case object of Reference (IndirectObjCoordinates {objectId}) -> [objectId] _ -> [] extractText :: FontCache m => Object -> m [Text] extractText pageObj = do fonts <- loadFonts =<< getFontDictionary pageObj objects <- pageObj >./ "Contents" >>= several concat <$> mapM (loadContent fonts) objects where loadContent fonts object = r StreamContent object >>= (either fail return . Content.parse) >>= renderText fonts loadPage :: FontCache m => ObjectId -> m Page loadPage source = do contents <- extractText =<< objectById source return $ Page {contents, source} data Pages = Pages newtype PageNumber = P Int instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer where numbered :: [Page] -> Map Int Page numbered = Map.fromList . zip [1..] instance Box T PageNumber Layer Page cacheFonts :: Monad m => StateT CachedFonts m a -> m a cacheFonts = flip evalStateT Map.empty {- getAll :: Layer -> Either String (Map Int Page) getAll content = runError $ fst <$> evalRWST getPages content Map.empty where numbered = Map.fromList . zip [1..] getPages = numbered <$> (mapM loadPage =<< pagesList) get :: Layer -> Int -> Either String Page get content pageNumber | pageNumber < 1 = Left "Pages start at 1" | otherwise = runError $ fst <$> evalRWST getPage content Map.empty where firstPage [] = fail "Page is out of bounds" firstPage (p:_) = loadPage p getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage -}