Use a defined monadic stack in Pages to lift the MonadReader ambiguity and allow finishing to reimplement getText demo

This commit is contained in:
Tissevert 2020-03-14 16:57:16 +01:00
parent 40475a3093
commit 0f857c457d
2 changed files with 30 additions and 60 deletions

View file

@ -1,47 +1,36 @@
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
import Control.Monad ((>=>)) import Control.Monad ((>=>))
import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.IO.Class (liftIO)
import Control.Monad.Reader (runReaderT) import Control.Monad.Reader (runReaderT)
import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Char8 as BS (readFile)
import Data.Map (Map, foldlWithKey, mapWithKey) import Data.Map (mapWithKey)
import qualified Data.Map as Map (empty, insert, toList, union) import qualified Data.Map as Map (toList)
import Data.OrderedMap (mapi) import Data.OrderedMap (mapi)
import qualified Data.Text as Text (unpack) import qualified Data.Text as Text (unpack)
import qualified Data.Text.IO as Text (putStrLn)
import PDF (UnifiedLayers(..), parseDocument) import PDF (UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..)) import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer) import PDF.Layer (Layer)
import PDF.Output (ObjectId(..)) import PDF.Output (ObjectId(..))
import PDF.Pages ( import PDF.Pages (
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), Text_(..), cacheFonts, withResources Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), cacheFonts, withResources
) )
import System.Environment (getArgs) import System.Environment (getArgs)
import System.Exit (die) import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout) import System.IO (BufferMode(..), hSetBuffering, stdout)
import Text.Printf (printf) import Text.Printf (printf)
displayPage :: (MonadIO m, FontCache m) => Page -> m () displayPage :: Page -> FontCache IO ()
displayPage = displayPage = withResources (
withResources (r Contents) r Contents
>=> sequence_ . mapi (\objectId content -> >=> sequence_ . mapi (\objectId ->
r Chunks content -- >=> sequence_ . mapWithKey (display objectId) r Chunks >=> sequence_ . mapWithKey (display objectId)
) )
where
display = undefined
{-
(
r Contents :: ReaderT FontSet m (OrderedMap ObjectId Content)
-- >=> sequenceA . mapi $ \(objectId, content) -> undefined
) )
--sequenceA $ mapWithKey (display objectId) (r Chunks content)
-- >=> mapM_ Text.putStrLn
where where
display a b v = display a b v =
liftIO . putStrLn $ liftIO . putStrLn $
printf "%d@%s: %s" (getObjectId a) (show b) (Text.unpack v) printf "%d@%s: %s" (getObjectId a) (show b) (Text.unpack v)
-}
getAll :: Layer -> IO () getAll :: Layer -> IO ()
getAll layer = getAll layer =

View file

@ -1,17 +1,14 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE UndecidableInstances #-}
module PDF.Pages ( module PDF.Pages (
Contents(..) Contents(..)
, FontCache , FontCache
, Page(..) , Page(..)
, PageNumber(..) , PageNumber(..)
, Pages(..) , Pages(..)
, Text_(..)
, cacheFonts , cacheFonts
, withResources , withResources
) where ) where
@ -20,17 +17,15 @@ import Control.Applicative (Alternative, (<|>))
import Control.Monad (foldM) import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (ReaderT, runReaderT) import Control.Monad.Reader (ReaderT, runReaderT)
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify) import Control.Monad.State (StateT(..), evalStateT, gets, modify)
import qualified Control.Monad.RWS as RWS (get) import Control.Monad.Trans (lift)
import Data.OrderedMap (OrderedMap, build) import Data.OrderedMap (OrderedMap, build)
import Data.Map (Map, foldrWithKey, mapWithKey) import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.CMap (cMap) import PDF.CMap (cMap)
import PDF.Content (Content(..), Id(..), Indexed) import PDF.Content (Content(..))
import qualified PDF.Content as Content (parse) import qualified PDF.Content as Content (parse)
import PDF.Content.Text (Chunks(..))
import PDF.Encoding (encoding) import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet) import PDF.Font (Font, FontSet)
import PDF.Layer (Layer(..)) import PDF.Layer (Layer(..))
@ -46,41 +41,41 @@ import PDF.Output (ObjectId(..))
import Prelude hiding (fail) import Prelude hiding (fail)
import Text.Printf (printf) import Text.Printf (printf)
type Except m = (Alternative m, MonadFail m)
type InLayer m = ReaderT Layer m
type CachedFonts = Map ObjectId Font type CachedFonts = Map ObjectId Font
type FontCache m = (MonadState CachedFonts m, PDFContent m) type FontCache m = StateT CachedFonts (InLayer m)
data Page = Page { data Page = Page {
--contents :: [(ObjectId, Content)]
contents :: OrderedMap ObjectId Content contents :: OrderedMap ObjectId Content
, resources :: Dictionary , resources :: Dictionary
, source :: ObjectId , source :: ObjectId
} }
--loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)] loadContents :: Except m => DirectObject -> InLayer m (OrderedMap ObjectId Content)
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
loadContents directObject = loadContents directObject =
sequenceA . build loadContent $ objectIds directObject sequenceA . build loadContent $ objectIds directObject
where where
loadContent :: PDFContent m => ObjectId -> m Content loadContent :: Except m => ObjectId -> InLayer m Content
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
objectIds (Array l) = l >>= getReference objectIds (Array l) = l >>= getReference
objectIds dirObj = getReference dirObj objectIds dirObj = getReference dirObj
--build f = fmap (\k -> f k >>= \v -> return (k, v))
getFontDictionary :: PDFContent m => Object -> m Dictionary getFontDictionary :: Except m => Object -> InLayer m Dictionary
getFontDictionary pageObj = getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty (pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font cache :: Except m => (ObjectId -> FontCache m Font) -> ObjectId -> FontCache m Font
cache loader objectId = cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get gets (Map.lookup objectId) >>= maybe load return
where where
load = do load = do
value <- loader objectId value <- loader objectId
modify $ Map.insert objectId value modify $ Map.insert objectId value
return value return value
loadFont :: FontCache m => ObjectId -> m Font loadFont :: Except m => ObjectId -> FontCache m Font
loadFont objectId = objectById objectId >>= tryMappings loadFont objectId = lift $ objectById objectId >>= tryMappings
where where
tryMappings object = tryMappings object =
(object >./ "ToUnicode" >>= r Clear >>= cMap) (object >./ "ToUnicode" >>= r Clear >>= cMap)
@ -92,7 +87,7 @@ loadFont objectId = objectById objectId >>= tryMappings
loadEncoding object = loadEncoding object =
fail $ printf "Encoding must be a name, not that : %s" $ show object fail $ printf "Encoding must be a name, not that : %s" $ show object
loadResources :: FontCache m => Dictionary -> m FontSet loadResources :: Except m => Dictionary -> FontCache m FontSet
loadResources = foldM addFont Map.empty . Map.toList loadResources = foldM addFont Map.empty . Map.toList
where where
addFont output (name, Reference (IndirectObjCoordinates {objectId})) = addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
@ -103,14 +98,14 @@ getReference :: DirectObject -> [ObjectId]
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId] getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
getReference _ = [] getReference _ = []
loadPage :: PDFContent m => ObjectId -> m Page loadPage :: Except m => ObjectId -> InLayer m Page
loadPage source = do loadPage source = do
page <- objectById source page <- objectById source
contents <- getKey "Contents" page >>= loadContents contents <- getKey "Contents" page >>= loadContents
resources <- getFontDictionary page resources <- getFontDictionary page
return $ Page {contents, resources, source} return $ Page {contents, resources, source}
pagesList :: PDFContent m => m [ObjectId] pagesList :: Except m => InLayer m [ObjectId]
pagesList = pagesList =
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences) (origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences)
<|> return [] <|> return []
@ -121,7 +116,6 @@ pagesList =
data Pages = Pages data Pages = Pages
newtype PageNumber = P Int newtype PageNumber = P Int
data Contents = Contents data Contents = Contents
data Text_ = Text_
instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
@ -137,25 +131,12 @@ instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where
firstPage [] = fail "Page is out of bounds" firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p firstPage (p:_) = loadPage p
--instance Monad m => Box m Contents Page [(ObjectId, Content)] where
instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
r Contents = return . contents r Contents = return . contents
{-
instance FontCache m => Box m Text_ Page (OrderedMap ObjectId (Indexed Text)) where
--instance FontCache m => Box m Text_ Page [((ObjectId, Id Text), Text)] where
r Text_ (Page {contents, resources}) =
loadResources resources
>>= runReaderT (mapM (r Chunks) contents)
-- >>= runReaderT (concat <$> mapM (r Chunks . snd) contents)
where
mixIn prefix = foldrWithKey (\k v m -> ((prefix, k), v):m) []
extractChunks (objectId, content) = mixIn objectId <$> r Chunks content
-}
cacheFonts :: Monad m => StateT CachedFonts m a -> m a cacheFonts :: Monad m => StateT CachedFonts m a -> m a
cacheFonts = flip evalStateT Map.empty cacheFonts = flip evalStateT Map.empty
withResources :: FontCache m => (Page -> ReaderT FontSet m b) -> Page -> m b withResources :: Except m => (Page -> ReaderT FontSet m b) -> Page -> FontCache m b
withResources f p = withResources f p =
loadResources (resources p) >>= runReaderT (f p) loadResources (resources p) >>= lift . lift . runReaderT (f p)