Start reimplementing getAll as a Box instance and try to separate the various monad run steps

This commit is contained in:
Tissevert 2020-03-03 18:17:44 +01:00
parent 3b3eeef218
commit d288ecf0ac
1 changed files with 36 additions and 15 deletions

View File

@ -2,15 +2,24 @@
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE UndecidableInstances #-}
module PDF.Pages (
Page(..)
, get
, getAll
, PageNumber(..)
, Pages
, cacheFonts
--, get
--, getAll
) where
import Control.Applicative (Alternative(..))
import Control.Applicative ((<|>))
import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (runReaderT)
import Control.Monad.State (MonadState, StateT(..), evalStateT)
import Control.Monad.RWS (RWST(..), evalRWST, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.Map (Map)
@ -28,7 +37,7 @@ import PDF.Object (
, Name(..), Object(..)
,)
import PDF.Object.Navigation (
Error(..), StreamContent(..), (//), (>./), (>//), getDictionary, objectById
Error(..), PDFContent, StreamContent(..), (//), (>./), (>//), getDictionary, objectById
, origin
)
import PDF.Output (ObjectId(..))
@ -37,17 +46,20 @@ import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type T = RWST Layer () CachedFonts Error
type PageContext = StateT CachedFonts
type FontCache m = (MonadState CachedFonts m, PDFContent m)
--type PDFContent m = (MonadState Layer m, MonadFail m, MonadState CachedFonts m)
data Page = Page {
contents :: [Text]
, source :: ObjectId
}
getFontDictionary :: Object -> T Dictionary
getFontDictionary :: PDFContent m => Object -> m Dictionary
getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary)
<|> return Map.empty
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
where
@ -56,7 +68,7 @@ cache loader objectId =
modify $ Map.insert objectId value
return value
loadFont :: ObjectId -> T Font
loadFont :: FontCache m => ObjectId -> m Font
loadFont objectId = objectById objectId >>= tryMappings
where
tryMappings object =
@ -64,15 +76,14 @@ loadFont objectId = objectById objectId >>= tryMappings
<|> (object >./ "Encoding" >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show object))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadEncoding :: Object -> T Font
loadEncoding :: MonadFail m => Object -> m Font
loadEncoding (Direct (NameObject (Name name))) = encoding name
loadEncoding object =
fail $ printf "Encoding must be a name, not that : %s" $ show object
loadFonts :: Dictionary -> T FontSet
loadFonts :: FontCache m => Dictionary -> m FontSet
loadFonts = foldM addFont Map.empty . Map.toList
where
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output
@ -81,7 +92,7 @@ several :: Object -> [Object]
several (Direct (Array l)) = Direct <$> l
several object = [object]
pagesList :: T [ObjectId]
pagesList :: PDFContent m => m [ObjectId]
pagesList = do
pages <- origin // ["Root", "Pages"] >>= getDictionary
case Map.lookup (Name "Kids") pages of
@ -95,28 +106,37 @@ getReferences objects = do
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractText :: Object -> T [Text]
extractText :: FontCache m => Object -> m [Text]
extractText pageObj = do
fonts <- loadFonts =<< getFontDictionary pageObj
objects <- several <$> pageObj >./ "Contents"
concat <$> mapM (loadContent fonts) objects
where
loadContent :: FontSet -> Object -> T [Text]
loadContent fonts object =
r StreamContent object
>>= (either fail return . Content.parse)
>>= renderText fonts
loadPage :: ObjectId -> T Page
loadPage :: FontCache m => ObjectId -> m Page
loadPage source = do
contents <- extractText =<< objectById source
return $ Page {contents, source}
data Pages = Pages
newtype PageNumber = P Int
instance Box T Pages Layer (Map ObjectId Page)
instance Box T ObjectId Layer Page
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
where
numbered :: [Page] -> Map Int Page
numbered = Map.fromList . zip [1..]
instance Box T PageNumber Layer Page
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
cacheFonts = flip evalStateT Map.empty
{-
getAll :: Layer -> Either String (Map Int Page)
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
where
@ -131,3 +151,4 @@ get content pageNumber
firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
-}