Start reimplementing getAll as a Box instance and try to separate the various monad run steps
This commit is contained in:
parent
3b3eeef218
commit
d288ecf0ac
1 changed files with 36 additions and 15 deletions
|
@ -2,15 +2,24 @@
|
|||
{-# LANGUAGE TypeSynonymInstances #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||
{-# LANGUAGE ConstraintKinds #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
{-# LANGUAGE UndecidableInstances #-}
|
||||
module PDF.Pages (
|
||||
Page(..)
|
||||
, get
|
||||
, getAll
|
||||
, PageNumber(..)
|
||||
, Pages
|
||||
, cacheFonts
|
||||
--, get
|
||||
--, getAll
|
||||
) where
|
||||
|
||||
import Control.Applicative (Alternative(..))
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.Reader (runReaderT)
|
||||
import Control.Monad.State (MonadState, StateT(..), evalStateT)
|
||||
import Control.Monad.RWS (RWST(..), evalRWST, modify)
|
||||
import qualified Control.Monad.RWS as RWS (get)
|
||||
import Data.Map (Map)
|
||||
|
@ -28,7 +37,7 @@ import PDF.Object (
|
|||
, Name(..), Object(..)
|
||||
,)
|
||||
import PDF.Object.Navigation (
|
||||
Error(..), StreamContent(..), (//), (>./), (>//), getDictionary, objectById
|
||||
Error(..), PDFContent, StreamContent(..), (//), (>./), (>//), getDictionary, objectById
|
||||
, origin
|
||||
)
|
||||
import PDF.Output (ObjectId(..))
|
||||
|
@ -37,17 +46,20 @@ import Text.Printf (printf)
|
|||
|
||||
type CachedFonts = Map ObjectId Font
|
||||
type T = RWST Layer () CachedFonts Error
|
||||
type PageContext = StateT CachedFonts
|
||||
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
||||
--type PDFContent m = (MonadState Layer m, MonadFail m, MonadState CachedFonts m)
|
||||
data Page = Page {
|
||||
contents :: [Text]
|
||||
, source :: ObjectId
|
||||
}
|
||||
|
||||
getFontDictionary :: Object -> T Dictionary
|
||||
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
||||
getFontDictionary pageObj =
|
||||
(pageObj >// ["Resources", "Font"] >>= getDictionary)
|
||||
<|> return Map.empty
|
||||
|
||||
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
||||
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
|
||||
cache loader objectId =
|
||||
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||
where
|
||||
|
@ -56,7 +68,7 @@ cache loader objectId =
|
|||
modify $ Map.insert objectId value
|
||||
return value
|
||||
|
||||
loadFont :: ObjectId -> T Font
|
||||
loadFont :: FontCache m => ObjectId -> m Font
|
||||
loadFont objectId = objectById objectId >>= tryMappings
|
||||
where
|
||||
tryMappings object =
|
||||
|
@ -64,15 +76,14 @@ loadFont objectId = objectById objectId >>= tryMappings
|
|||
<|> (object >./ "Encoding" >>= loadEncoding)
|
||||
<|> (fail $ unknownFormat (show objectId) (show object))
|
||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||
loadEncoding :: Object -> T Font
|
||||
loadEncoding :: MonadFail m => Object -> m Font
|
||||
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
||||
loadEncoding object =
|
||||
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
||||
|
||||
loadFonts :: Dictionary -> T FontSet
|
||||
loadFonts :: FontCache m => Dictionary -> m FontSet
|
||||
loadFonts = foldM addFont Map.empty . Map.toList
|
||||
where
|
||||
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
|
||||
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||
flip (Map.insert name) output <$> cache loadFont objectId
|
||||
addFont output _ = return output
|
||||
|
@ -81,7 +92,7 @@ several :: Object -> [Object]
|
|||
several (Direct (Array l)) = Direct <$> l
|
||||
several object = [object]
|
||||
|
||||
pagesList :: T [ObjectId]
|
||||
pagesList :: PDFContent m => m [ObjectId]
|
||||
pagesList = do
|
||||
pages <- origin // ["Root", "Pages"] >>= getDictionary
|
||||
case Map.lookup (Name "Kids") pages of
|
||||
|
@ -95,28 +106,37 @@ getReferences objects = do
|
|||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||
_ -> []
|
||||
|
||||
extractText :: Object -> T [Text]
|
||||
extractText :: FontCache m => Object -> m [Text]
|
||||
extractText pageObj = do
|
||||
fonts <- loadFonts =<< getFontDictionary pageObj
|
||||
objects <- several <$> pageObj >./ "Contents"
|
||||
concat <$> mapM (loadContent fonts) objects
|
||||
where
|
||||
loadContent :: FontSet -> Object -> T [Text]
|
||||
loadContent fonts object =
|
||||
r StreamContent object
|
||||
>>= (either fail return . Content.parse)
|
||||
>>= renderText fonts
|
||||
|
||||
loadPage :: ObjectId -> T Page
|
||||
loadPage :: FontCache m => ObjectId -> m Page
|
||||
loadPage source = do
|
||||
contents <- extractText =<< objectById source
|
||||
return $ Page {contents, source}
|
||||
|
||||
data Pages = Pages
|
||||
newtype PageNumber = P Int
|
||||
|
||||
instance Box T Pages Layer (Map ObjectId Page)
|
||||
instance Box T ObjectId Layer Page
|
||||
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
|
||||
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
||||
where
|
||||
numbered :: [Page] -> Map Int Page
|
||||
numbered = Map.fromList . zip [1..]
|
||||
|
||||
instance Box T PageNumber Layer Page
|
||||
|
||||
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
||||
cacheFonts = flip evalStateT Map.empty
|
||||
|
||||
{-
|
||||
getAll :: Layer -> Either String (Map Int Page)
|
||||
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
|
||||
where
|
||||
|
@ -131,3 +151,4 @@ get content pageNumber
|
|||
firstPage [] = fail "Page is out of bounds"
|
||||
firstPage (p:_) = loadPage p
|
||||
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|
||||
-}
|
||||
|
|
Loading…
Reference in a new issue