Start reimplementing getAll as a Box instance and try to separate the various monad run steps

This commit is contained in:
Tissevert 2020-03-03 18:17:44 +01:00
parent 3b3eeef218
commit d288ecf0ac

View file

@ -2,15 +2,24 @@
{-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE UndecidableInstances #-}
module PDF.Pages ( module PDF.Pages (
Page(..) Page(..)
, get , PageNumber(..)
, getAll , Pages
, cacheFonts
--, get
--, getAll
) where ) where
import Control.Applicative (Alternative(..))
import Control.Applicative ((<|>)) import Control.Applicative ((<|>))
import Control.Monad (foldM) import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (runReaderT)
import Control.Monad.State (MonadState, StateT(..), evalStateT)
import Control.Monad.RWS (RWST(..), evalRWST, modify) import Control.Monad.RWS (RWST(..), evalRWST, modify)
import qualified Control.Monad.RWS as RWS (get) import qualified Control.Monad.RWS as RWS (get)
import Data.Map (Map) import Data.Map (Map)
@ -28,7 +37,7 @@ import PDF.Object (
, Name(..), Object(..) , Name(..), Object(..)
,) ,)
import PDF.Object.Navigation ( import PDF.Object.Navigation (
Error(..), StreamContent(..), (//), (>./), (>//), getDictionary, objectById Error(..), PDFContent, StreamContent(..), (//), (>./), (>//), getDictionary, objectById
, origin , origin
) )
import PDF.Output (ObjectId(..)) import PDF.Output (ObjectId(..))
@ -37,17 +46,20 @@ import Text.Printf (printf)
type CachedFonts = Map ObjectId Font type CachedFonts = Map ObjectId Font
type T = RWST Layer () CachedFonts Error type T = RWST Layer () CachedFonts Error
type PageContext = StateT CachedFonts
type FontCache m = (MonadState CachedFonts m, PDFContent m)
--type PDFContent m = (MonadState Layer m, MonadFail m, MonadState CachedFonts m)
data Page = Page { data Page = Page {
contents :: [Text] contents :: [Text]
, source :: ObjectId , source :: ObjectId
} }
getFontDictionary :: Object -> T Dictionary getFontDictionary :: PDFContent m => Object -> m Dictionary
getFontDictionary pageObj = getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary) (pageObj >// ["Resources", "Font"] >>= getDictionary)
<|> return Map.empty <|> return Map.empty
cache :: (ObjectId -> T Font) -> ObjectId -> T Font cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
cache loader objectId = cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get (maybe load return . Map.lookup objectId) =<< RWS.get
where where
@ -56,7 +68,7 @@ cache loader objectId =
modify $ Map.insert objectId value modify $ Map.insert objectId value
return value return value
loadFont :: ObjectId -> T Font loadFont :: FontCache m => ObjectId -> m Font
loadFont objectId = objectById objectId >>= tryMappings loadFont objectId = objectById objectId >>= tryMappings
where where
tryMappings object = tryMappings object =
@ -64,15 +76,14 @@ loadFont objectId = objectById objectId >>= tryMappings
<|> (object >./ "Encoding" >>= loadEncoding) <|> (object >./ "Encoding" >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show object)) <|> (fail $ unknownFormat (show objectId) (show object))
unknownFormat = printf "Unknown font format for object #%s : %s" unknownFormat = printf "Unknown font format for object #%s : %s"
loadEncoding :: Object -> T Font loadEncoding :: MonadFail m => Object -> m Font
loadEncoding (Direct (NameObject (Name name))) = encoding name loadEncoding (Direct (NameObject (Name name))) = encoding name
loadEncoding object = loadEncoding object =
fail $ printf "Encoding must be a name, not that : %s" $ show object fail $ printf "Encoding must be a name, not that : %s" $ show object
loadFonts :: Dictionary -> T FontSet loadFonts :: FontCache m => Dictionary -> m FontSet
loadFonts = foldM addFont Map.empty . Map.toList loadFonts = foldM addFont Map.empty . Map.toList
where where
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
addFont output (name, Reference (IndirectObjCoordinates {objectId})) = addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output addFont output _ = return output
@ -81,7 +92,7 @@ several :: Object -> [Object]
several (Direct (Array l)) = Direct <$> l several (Direct (Array l)) = Direct <$> l
several object = [object] several object = [object]
pagesList :: T [ObjectId] pagesList :: PDFContent m => m [ObjectId]
pagesList = do pagesList = do
pages <- origin // ["Root", "Pages"] >>= getDictionary pages <- origin // ["Root", "Pages"] >>= getDictionary
case Map.lookup (Name "Kids") pages of case Map.lookup (Name "Kids") pages of
@ -95,28 +106,37 @@ getReferences objects = do
Reference (IndirectObjCoordinates {objectId}) -> [objectId] Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> [] _ -> []
extractText :: Object -> T [Text] extractText :: FontCache m => Object -> m [Text]
extractText pageObj = do extractText pageObj = do
fonts <- loadFonts =<< getFontDictionary pageObj fonts <- loadFonts =<< getFontDictionary pageObj
objects <- several <$> pageObj >./ "Contents" objects <- several <$> pageObj >./ "Contents"
concat <$> mapM (loadContent fonts) objects concat <$> mapM (loadContent fonts) objects
where where
loadContent :: FontSet -> Object -> T [Text]
loadContent fonts object = loadContent fonts object =
r StreamContent object r StreamContent object
>>= (either fail return . Content.parse) >>= (either fail return . Content.parse)
>>= renderText fonts >>= renderText fonts
loadPage :: ObjectId -> T Page loadPage :: FontCache m => ObjectId -> m Page
loadPage source = do loadPage source = do
contents <- extractText =<< objectById source contents <- extractText =<< objectById source
return $ Page {contents, source} return $ Page {contents, source}
data Pages = Pages data Pages = Pages
newtype PageNumber = P Int
instance Box T Pages Layer (Map ObjectId Page) instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
instance Box T ObjectId Layer Page r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
where
numbered :: [Page] -> Map Int Page
numbered = Map.fromList . zip [1..]
instance Box T PageNumber Layer Page
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
cacheFonts = flip evalStateT Map.empty
{-
getAll :: Layer -> Either String (Map Int Page) getAll :: Layer -> Either String (Map Int Page)
getAll content = runError $ fst <$> evalRWST getPages content Map.empty getAll content = runError $ fst <$> evalRWST getPages content Map.empty
where where
@ -131,3 +151,4 @@ get content pageNumber
firstPage [] = fail "Page is out of bounds" firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p firstPage (p:_) = loadPage p
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
-}