Start reimplementing getAll as a Box instance and try to separate the various monad run steps
This commit is contained in:
parent
3b3eeef218
commit
d288ecf0ac
1 changed files with 36 additions and 15 deletions
|
@ -2,15 +2,24 @@
|
||||||
{-# LANGUAGE TypeSynonymInstances #-}
|
{-# LANGUAGE TypeSynonymInstances #-}
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
{-# LANGUAGE UndecidableInstances #-}
|
||||||
module PDF.Pages (
|
module PDF.Pages (
|
||||||
Page(..)
|
Page(..)
|
||||||
, get
|
, PageNumber(..)
|
||||||
, getAll
|
, Pages
|
||||||
|
, cacheFonts
|
||||||
|
--, get
|
||||||
|
--, getAll
|
||||||
) where
|
) where
|
||||||
|
|
||||||
|
import Control.Applicative (Alternative(..))
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>))
|
||||||
import Control.Monad (foldM)
|
import Control.Monad (foldM)
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import Control.Monad.Reader (runReaderT)
|
||||||
|
import Control.Monad.State (MonadState, StateT(..), evalStateT)
|
||||||
import Control.Monad.RWS (RWST(..), evalRWST, modify)
|
import Control.Monad.RWS (RWST(..), evalRWST, modify)
|
||||||
import qualified Control.Monad.RWS as RWS (get)
|
import qualified Control.Monad.RWS as RWS (get)
|
||||||
import Data.Map (Map)
|
import Data.Map (Map)
|
||||||
|
@ -28,7 +37,7 @@ import PDF.Object (
|
||||||
, Name(..), Object(..)
|
, Name(..), Object(..)
|
||||||
,)
|
,)
|
||||||
import PDF.Object.Navigation (
|
import PDF.Object.Navigation (
|
||||||
Error(..), StreamContent(..), (//), (>./), (>//), getDictionary, objectById
|
Error(..), PDFContent, StreamContent(..), (//), (>./), (>//), getDictionary, objectById
|
||||||
, origin
|
, origin
|
||||||
)
|
)
|
||||||
import PDF.Output (ObjectId(..))
|
import PDF.Output (ObjectId(..))
|
||||||
|
@ -37,17 +46,20 @@ import Text.Printf (printf)
|
||||||
|
|
||||||
type CachedFonts = Map ObjectId Font
|
type CachedFonts = Map ObjectId Font
|
||||||
type T = RWST Layer () CachedFonts Error
|
type T = RWST Layer () CachedFonts Error
|
||||||
|
type PageContext = StateT CachedFonts
|
||||||
|
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
||||||
|
--type PDFContent m = (MonadState Layer m, MonadFail m, MonadState CachedFonts m)
|
||||||
data Page = Page {
|
data Page = Page {
|
||||||
contents :: [Text]
|
contents :: [Text]
|
||||||
, source :: ObjectId
|
, source :: ObjectId
|
||||||
}
|
}
|
||||||
|
|
||||||
getFontDictionary :: Object -> T Dictionary
|
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
||||||
getFontDictionary pageObj =
|
getFontDictionary pageObj =
|
||||||
(pageObj >// ["Resources", "Font"] >>= getDictionary)
|
(pageObj >// ["Resources", "Font"] >>= getDictionary)
|
||||||
<|> return Map.empty
|
<|> return Map.empty
|
||||||
|
|
||||||
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
|
||||||
cache loader objectId =
|
cache loader objectId =
|
||||||
(maybe load return . Map.lookup objectId) =<< RWS.get
|
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||||
where
|
where
|
||||||
|
@ -56,7 +68,7 @@ cache loader objectId =
|
||||||
modify $ Map.insert objectId value
|
modify $ Map.insert objectId value
|
||||||
return value
|
return value
|
||||||
|
|
||||||
loadFont :: ObjectId -> T Font
|
loadFont :: FontCache m => ObjectId -> m Font
|
||||||
loadFont objectId = objectById objectId >>= tryMappings
|
loadFont objectId = objectById objectId >>= tryMappings
|
||||||
where
|
where
|
||||||
tryMappings object =
|
tryMappings object =
|
||||||
|
@ -64,15 +76,14 @@ loadFont objectId = objectById objectId >>= tryMappings
|
||||||
<|> (object >./ "Encoding" >>= loadEncoding)
|
<|> (object >./ "Encoding" >>= loadEncoding)
|
||||||
<|> (fail $ unknownFormat (show objectId) (show object))
|
<|> (fail $ unknownFormat (show objectId) (show object))
|
||||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||||
loadEncoding :: Object -> T Font
|
loadEncoding :: MonadFail m => Object -> m Font
|
||||||
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
||||||
loadEncoding object =
|
loadEncoding object =
|
||||||
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
||||||
|
|
||||||
loadFonts :: Dictionary -> T FontSet
|
loadFonts :: FontCache m => Dictionary -> m FontSet
|
||||||
loadFonts = foldM addFont Map.empty . Map.toList
|
loadFonts = foldM addFont Map.empty . Map.toList
|
||||||
where
|
where
|
||||||
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
|
|
||||||
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||||
flip (Map.insert name) output <$> cache loadFont objectId
|
flip (Map.insert name) output <$> cache loadFont objectId
|
||||||
addFont output _ = return output
|
addFont output _ = return output
|
||||||
|
@ -81,7 +92,7 @@ several :: Object -> [Object]
|
||||||
several (Direct (Array l)) = Direct <$> l
|
several (Direct (Array l)) = Direct <$> l
|
||||||
several object = [object]
|
several object = [object]
|
||||||
|
|
||||||
pagesList :: T [ObjectId]
|
pagesList :: PDFContent m => m [ObjectId]
|
||||||
pagesList = do
|
pagesList = do
|
||||||
pages <- origin // ["Root", "Pages"] >>= getDictionary
|
pages <- origin // ["Root", "Pages"] >>= getDictionary
|
||||||
case Map.lookup (Name "Kids") pages of
|
case Map.lookup (Name "Kids") pages of
|
||||||
|
@ -95,28 +106,37 @@ getReferences objects = do
|
||||||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||||
_ -> []
|
_ -> []
|
||||||
|
|
||||||
extractText :: Object -> T [Text]
|
extractText :: FontCache m => Object -> m [Text]
|
||||||
extractText pageObj = do
|
extractText pageObj = do
|
||||||
fonts <- loadFonts =<< getFontDictionary pageObj
|
fonts <- loadFonts =<< getFontDictionary pageObj
|
||||||
objects <- several <$> pageObj >./ "Contents"
|
objects <- several <$> pageObj >./ "Contents"
|
||||||
concat <$> mapM (loadContent fonts) objects
|
concat <$> mapM (loadContent fonts) objects
|
||||||
where
|
where
|
||||||
loadContent :: FontSet -> Object -> T [Text]
|
|
||||||
loadContent fonts object =
|
loadContent fonts object =
|
||||||
r StreamContent object
|
r StreamContent object
|
||||||
>>= (either fail return . Content.parse)
|
>>= (either fail return . Content.parse)
|
||||||
>>= renderText fonts
|
>>= renderText fonts
|
||||||
|
|
||||||
loadPage :: ObjectId -> T Page
|
loadPage :: FontCache m => ObjectId -> m Page
|
||||||
loadPage source = do
|
loadPage source = do
|
||||||
contents <- extractText =<< objectById source
|
contents <- extractText =<< objectById source
|
||||||
return $ Page {contents, source}
|
return $ Page {contents, source}
|
||||||
|
|
||||||
data Pages = Pages
|
data Pages = Pages
|
||||||
|
newtype PageNumber = P Int
|
||||||
|
|
||||||
instance Box T Pages Layer (Map ObjectId Page)
|
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
|
||||||
instance Box T ObjectId Layer Page
|
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
||||||
|
where
|
||||||
|
numbered :: [Page] -> Map Int Page
|
||||||
|
numbered = Map.fromList . zip [1..]
|
||||||
|
|
||||||
|
instance Box T PageNumber Layer Page
|
||||||
|
|
||||||
|
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
||||||
|
cacheFonts = flip evalStateT Map.empty
|
||||||
|
|
||||||
|
{-
|
||||||
getAll :: Layer -> Either String (Map Int Page)
|
getAll :: Layer -> Either String (Map Int Page)
|
||||||
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
|
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
|
||||||
where
|
where
|
||||||
|
@ -131,3 +151,4 @@ get content pageNumber
|
||||||
firstPage [] = fail "Page is out of bounds"
|
firstPage [] = fail "Page is out of bounds"
|
||||||
firstPage (p:_) = loadPage p
|
firstPage (p:_) = loadPage p
|
||||||
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|
||||||
|
-}
|
||||||
|
|
Loading…
Reference in a new issue