Use a defined monadic stack in Pages to lift the MonadReader ambiguity and allow finishing to reimplement getText demo
This commit is contained in:
parent
40475a3093
commit
0f857c457d
2 changed files with 30 additions and 60 deletions
|
@ -1,47 +1,36 @@
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
import Control.Monad ((>=>))
|
import Control.Monad ((>=>))
|
||||||
import Control.Monad.IO.Class (MonadIO, liftIO)
|
import Control.Monad.IO.Class (liftIO)
|
||||||
import Control.Monad.Reader (runReaderT)
|
import Control.Monad.Reader (runReaderT)
|
||||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
import Data.Map (Map, foldlWithKey, mapWithKey)
|
import Data.Map (mapWithKey)
|
||||||
import qualified Data.Map as Map (empty, insert, toList, union)
|
import qualified Data.Map as Map (toList)
|
||||||
import Data.OrderedMap (mapi)
|
import Data.OrderedMap (mapi)
|
||||||
import qualified Data.Text as Text (unpack)
|
import qualified Data.Text as Text (unpack)
|
||||||
import qualified Data.Text.IO as Text (putStrLn)
|
|
||||||
import PDF (UnifiedLayers(..), parseDocument)
|
import PDF (UnifiedLayers(..), parseDocument)
|
||||||
import PDF.Box (Box(..))
|
import PDF.Box (Box(..))
|
||||||
import PDF.Content.Text (Chunks(..))
|
import PDF.Content.Text (Chunks(..))
|
||||||
import PDF.Layer (Layer)
|
import PDF.Layer (Layer)
|
||||||
import PDF.Output (ObjectId(..))
|
import PDF.Output (ObjectId(..))
|
||||||
import PDF.Pages (
|
import PDF.Pages (
|
||||||
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), Text_(..), cacheFonts, withResources
|
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), cacheFonts, withResources
|
||||||
)
|
)
|
||||||
import System.Environment (getArgs)
|
import System.Environment (getArgs)
|
||||||
import System.Exit (die)
|
import System.Exit (die)
|
||||||
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
displayPage :: (MonadIO m, FontCache m) => Page -> m ()
|
displayPage :: Page -> FontCache IO ()
|
||||||
displayPage =
|
displayPage = withResources (
|
||||||
withResources (r Contents)
|
r Contents
|
||||||
>=> sequence_ . mapi (\objectId content ->
|
>=> sequence_ . mapi (\objectId ->
|
||||||
r Chunks content -- >=> sequence_ . mapWithKey (display objectId)
|
r Chunks >=> sequence_ . mapWithKey (display objectId)
|
||||||
)
|
)
|
||||||
where
|
|
||||||
display = undefined
|
|
||||||
|
|
||||||
{-
|
|
||||||
(
|
|
||||||
r Contents :: ReaderT FontSet m (OrderedMap ObjectId Content)
|
|
||||||
-- >=> sequenceA . mapi $ \(objectId, content) -> undefined
|
|
||||||
)
|
)
|
||||||
--sequenceA $ mapWithKey (display objectId) (r Chunks content)
|
|
||||||
-- >=> mapM_ Text.putStrLn
|
|
||||||
where
|
where
|
||||||
display a b v =
|
display a b v =
|
||||||
liftIO . putStrLn $
|
liftIO . putStrLn $
|
||||||
printf "%d@%s: %s" (getObjectId a) (show b) (Text.unpack v)
|
printf "%d@%s: %s" (getObjectId a) (show b) (Text.unpack v)
|
||||||
-}
|
|
||||||
|
|
||||||
getAll :: Layer -> IO ()
|
getAll :: Layer -> IO ()
|
||||||
getAll layer =
|
getAll layer =
|
||||||
|
|
|
@ -1,17 +1,14 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
{-# LANGUAGE TypeSynonymInstances #-}
|
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
{-# LANGUAGE MultiParamTypeClasses #-}
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||||
{-# LANGUAGE ConstraintKinds #-}
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
{-# LANGUAGE UndecidableInstances #-}
|
|
||||||
module PDF.Pages (
|
module PDF.Pages (
|
||||||
Contents(..)
|
Contents(..)
|
||||||
, FontCache
|
, FontCache
|
||||||
, Page(..)
|
, Page(..)
|
||||||
, PageNumber(..)
|
, PageNumber(..)
|
||||||
, Pages(..)
|
, Pages(..)
|
||||||
, Text_(..)
|
|
||||||
, cacheFonts
|
, cacheFonts
|
||||||
, withResources
|
, withResources
|
||||||
) where
|
) where
|
||||||
|
@ -20,17 +17,15 @@ import Control.Applicative (Alternative, (<|>))
|
||||||
import Control.Monad (foldM)
|
import Control.Monad (foldM)
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.Reader (ReaderT, runReaderT)
|
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||||
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
|
import Control.Monad.State (StateT(..), evalStateT, gets, modify)
|
||||||
import qualified Control.Monad.RWS as RWS (get)
|
import Control.Monad.Trans (lift)
|
||||||
import Data.OrderedMap (OrderedMap, build)
|
import Data.OrderedMap (OrderedMap, build)
|
||||||
import Data.Map (Map, foldrWithKey, mapWithKey)
|
import Data.Map (Map)
|
||||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||||
import Data.Text (Text)
|
|
||||||
import PDF.Box (Box(..))
|
import PDF.Box (Box(..))
|
||||||
import PDF.CMap (cMap)
|
import PDF.CMap (cMap)
|
||||||
import PDF.Content (Content(..), Id(..), Indexed)
|
import PDF.Content (Content(..))
|
||||||
import qualified PDF.Content as Content (parse)
|
import qualified PDF.Content as Content (parse)
|
||||||
import PDF.Content.Text (Chunks(..))
|
|
||||||
import PDF.Encoding (encoding)
|
import PDF.Encoding (encoding)
|
||||||
import PDF.Font (Font, FontSet)
|
import PDF.Font (Font, FontSet)
|
||||||
import PDF.Layer (Layer(..))
|
import PDF.Layer (Layer(..))
|
||||||
|
@ -46,41 +41,41 @@ import PDF.Output (ObjectId(..))
|
||||||
import Prelude hiding (fail)
|
import Prelude hiding (fail)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
type Except m = (Alternative m, MonadFail m)
|
||||||
|
type InLayer m = ReaderT Layer m
|
||||||
|
|
||||||
type CachedFonts = Map ObjectId Font
|
type CachedFonts = Map ObjectId Font
|
||||||
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
type FontCache m = StateT CachedFonts (InLayer m)
|
||||||
data Page = Page {
|
data Page = Page {
|
||||||
--contents :: [(ObjectId, Content)]
|
|
||||||
contents :: OrderedMap ObjectId Content
|
contents :: OrderedMap ObjectId Content
|
||||||
, resources :: Dictionary
|
, resources :: Dictionary
|
||||||
, source :: ObjectId
|
, source :: ObjectId
|
||||||
}
|
}
|
||||||
|
|
||||||
--loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
|
loadContents :: Except m => DirectObject -> InLayer m (OrderedMap ObjectId Content)
|
||||||
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
|
||||||
loadContents directObject =
|
loadContents directObject =
|
||||||
sequenceA . build loadContent $ objectIds directObject
|
sequenceA . build loadContent $ objectIds directObject
|
||||||
where
|
where
|
||||||
loadContent :: PDFContent m => ObjectId -> m Content
|
loadContent :: Except m => ObjectId -> InLayer m Content
|
||||||
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
||||||
objectIds (Array l) = l >>= getReference
|
objectIds (Array l) = l >>= getReference
|
||||||
objectIds dirObj = getReference dirObj
|
objectIds dirObj = getReference dirObj
|
||||||
--build f = fmap (\k -> f k >>= \v -> return (k, v))
|
|
||||||
|
|
||||||
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
getFontDictionary :: Except m => Object -> InLayer m Dictionary
|
||||||
getFontDictionary pageObj =
|
getFontDictionary pageObj =
|
||||||
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
|
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
|
||||||
|
|
||||||
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
|
cache :: Except m => (ObjectId -> FontCache m Font) -> ObjectId -> FontCache m Font
|
||||||
cache loader objectId =
|
cache loader objectId =
|
||||||
(maybe load return . Map.lookup objectId) =<< RWS.get
|
gets (Map.lookup objectId) >>= maybe load return
|
||||||
where
|
where
|
||||||
load = do
|
load = do
|
||||||
value <- loader objectId
|
value <- loader objectId
|
||||||
modify $ Map.insert objectId value
|
modify $ Map.insert objectId value
|
||||||
return value
|
return value
|
||||||
|
|
||||||
loadFont :: FontCache m => ObjectId -> m Font
|
loadFont :: Except m => ObjectId -> FontCache m Font
|
||||||
loadFont objectId = objectById objectId >>= tryMappings
|
loadFont objectId = lift $ objectById objectId >>= tryMappings
|
||||||
where
|
where
|
||||||
tryMappings object =
|
tryMappings object =
|
||||||
(object >./ "ToUnicode" >>= r Clear >>= cMap)
|
(object >./ "ToUnicode" >>= r Clear >>= cMap)
|
||||||
|
@ -92,7 +87,7 @@ loadFont objectId = objectById objectId >>= tryMappings
|
||||||
loadEncoding object =
|
loadEncoding object =
|
||||||
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
||||||
|
|
||||||
loadResources :: FontCache m => Dictionary -> m FontSet
|
loadResources :: Except m => Dictionary -> FontCache m FontSet
|
||||||
loadResources = foldM addFont Map.empty . Map.toList
|
loadResources = foldM addFont Map.empty . Map.toList
|
||||||
where
|
where
|
||||||
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||||
|
@ -103,14 +98,14 @@ getReference :: DirectObject -> [ObjectId]
|
||||||
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
||||||
getReference _ = []
|
getReference _ = []
|
||||||
|
|
||||||
loadPage :: PDFContent m => ObjectId -> m Page
|
loadPage :: Except m => ObjectId -> InLayer m Page
|
||||||
loadPage source = do
|
loadPage source = do
|
||||||
page <- objectById source
|
page <- objectById source
|
||||||
contents <- getKey "Contents" page >>= loadContents
|
contents <- getKey "Contents" page >>= loadContents
|
||||||
resources <- getFontDictionary page
|
resources <- getFontDictionary page
|
||||||
return $ Page {contents, resources, source}
|
return $ Page {contents, resources, source}
|
||||||
|
|
||||||
pagesList :: PDFContent m => m [ObjectId]
|
pagesList :: Except m => InLayer m [ObjectId]
|
||||||
pagesList =
|
pagesList =
|
||||||
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences)
|
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences)
|
||||||
<|> return []
|
<|> return []
|
||||||
|
@ -121,7 +116,6 @@ pagesList =
|
||||||
data Pages = Pages
|
data Pages = Pages
|
||||||
newtype PageNumber = P Int
|
newtype PageNumber = P Int
|
||||||
data Contents = Contents
|
data Contents = Contents
|
||||||
data Text_ = Text_
|
|
||||||
|
|
||||||
instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where
|
instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where
|
||||||
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
||||||
|
@ -137,25 +131,12 @@ instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where
|
||||||
firstPage [] = fail "Page is out of bounds"
|
firstPage [] = fail "Page is out of bounds"
|
||||||
firstPage (p:_) = loadPage p
|
firstPage (p:_) = loadPage p
|
||||||
|
|
||||||
--instance Monad m => Box m Contents Page [(ObjectId, Content)] where
|
|
||||||
instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
|
instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
|
||||||
r Contents = return . contents
|
r Contents = return . contents
|
||||||
|
|
||||||
{-
|
|
||||||
instance FontCache m => Box m Text_ Page (OrderedMap ObjectId (Indexed Text)) where
|
|
||||||
--instance FontCache m => Box m Text_ Page [((ObjectId, Id Text), Text)] where
|
|
||||||
r Text_ (Page {contents, resources}) =
|
|
||||||
loadResources resources
|
|
||||||
>>= runReaderT (mapM (r Chunks) contents)
|
|
||||||
-- >>= runReaderT (concat <$> mapM (r Chunks . snd) contents)
|
|
||||||
where
|
|
||||||
mixIn prefix = foldrWithKey (\k v m -> ((prefix, k), v):m) []
|
|
||||||
extractChunks (objectId, content) = mixIn objectId <$> r Chunks content
|
|
||||||
-}
|
|
||||||
|
|
||||||
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
||||||
cacheFonts = flip evalStateT Map.empty
|
cacheFonts = flip evalStateT Map.empty
|
||||||
|
|
||||||
withResources :: FontCache m => (Page -> ReaderT FontSet m b) -> Page -> m b
|
withResources :: Except m => (Page -> ReaderT FontSet m b) -> Page -> FontCache m b
|
||||||
withResources f p =
|
withResources f p =
|
||||||
loadResources (resources p) >>= runReaderT (f p)
|
loadResources (resources p) >>= lift . lift . runReaderT (f p)
|
||||||
|
|
Loading…
Reference in a new issue