WIP: Try about everything that's possible to try, OrderedMap or [(,)], try to decouple Box instance for Content and the one for Indexed Text, breaks getText… will probably require some advanced effect library, there seems to be a weird MonadReader conflict in the errors messages

This commit is contained in:
Tissevert 2020-03-11 18:55:18 +01:00
parent d3f1b97f3a
commit 5b8d951516
3 changed files with 65 additions and 29 deletions

View File

@ -16,7 +16,8 @@ extra-source-files: ChangeLog.md
cabal-version: >=1.10 cabal-version: >=1.10
library library
exposed-modules: PDF exposed-modules: Data.OrderedMap
, PDF
, PDF.Box , PDF.Box
, PDF.CMap , PDF.CMap
, PDF.Content , PDF.Content
@ -29,7 +30,6 @@ library
, PDF.Parser , PDF.Parser
, PDF.Pages , PDF.Pages
other-modules: Data.ByteString.Char8.Util other-modules: Data.ByteString.Char8.Util
, Data.OrderedMap
, PDF.Content.Operator , PDF.Content.Operator
, PDF.Content.Operator.Color , PDF.Content.Operator.Color
, PDF.Content.Operator.Common , PDF.Content.Operator.Common

View File

@ -1,38 +1,55 @@
{-# LANGUAGE FlexibleContexts #-}
import Control.Monad ((>=>)) import Control.Monad ((>=>))
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Reader (runReaderT)
import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Char8 as BS (readFile)
import Data.Map (Map, foldlWithKey, mapWithKey) import Data.Map (Map, foldlWithKey, mapWithKey)
import qualified Data.Map as Map (empty, insert, toList, union) import qualified Data.Map as Map (empty, insert, toList, union)
import Data.OrderedMap (mapi)
import qualified Data.Text as Text (unpack) import qualified Data.Text as Text (unpack)
import qualified Data.Text.IO as Text (putStrLn) import qualified Data.Text.IO as Text (putStrLn)
import PDF (UnifiedLayers(..), parseDocument) import PDF (UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..)) import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer) import PDF.Layer (Layer)
import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts) import PDF.Output (ObjectId(..))
import PDF.Pages (
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), Text_(..), cacheFonts, withResources
)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.Exit (die) import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout) import System.IO (BufferMode(..), hSetBuffering, stdout)
import Text.Printf (printf) import Text.Printf (printf)
displayPage :: Page -> IO () displayPage :: (MonadIO m, FontCache m) => Page -> m ()
displayPage = displayPage =
r Contents withResources (r Contents)
>=> mapM (\(objectId, content) -> r Chunks content >>= return . mixIn objectId) >=> sequence_ . mapi (\objectId content ->
>=> fusion r Chunks content -- >=> sequence_ . mapWithKey (display objectId)
>=> sequence_ . mapWithKey (\k v -> putStrLn $ printf "%s: %s" (show k) (Text.unpack v)) )
where
display = undefined
{-
(
r Contents :: ReaderT FontSet m (OrderedMap ObjectId Content)
-- >=> sequenceA . mapi $ \(objectId, content) -> undefined
)
--sequenceA $ mapWithKey (display objectId) (r Chunks content)
-- >=> mapM_ Text.putStrLn -- >=> mapM_ Text.putStrLn
where where
mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v display a b v =
mixIn prefix = liftIO . putStrLn $
foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty printf "%d@%s: %s" (getObjectId a) (show b) (Text.unpack v)
fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b) -}
fusion = return . foldl Map.union Map.empty
getAll :: Layer -> IO () getAll :: Layer -> IO ()
getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList getAll layer =
r Pages layer
>>= flip runReaderT layer . cacheFonts . mapM_ (displayPage . snd) . Map.toList
get :: Int -> Layer -> IO () get :: Int -> Layer -> IO ()
get n layer = (cacheFonts $ r (P n) layer) >>= displayPage get n layer = r (P n) layer >>= flip runReaderT layer . cacheFonts . displayPage
onDoc :: FilePath -> (Layer -> IO ()) -> IO () onDoc :: FilePath -> (Layer -> IO ()) -> IO ()
onDoc inputFile f = do onDoc inputFile f = do

View File

@ -7,24 +7,28 @@
{-# LANGUAGE UndecidableInstances #-} {-# LANGUAGE UndecidableInstances #-}
module PDF.Pages ( module PDF.Pages (
Contents(..) Contents(..)
, FontCache
, Page(..) , Page(..)
, PageNumber(..) , PageNumber(..)
, Pages(..) , Pages(..)
, Text_(..)
, cacheFonts , cacheFonts
, withResources
) where ) where
import Control.Applicative (Alternative, (<|>)) import Control.Applicative (Alternative, (<|>))
import Control.Monad (foldM) import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (runReaderT) import Control.Monad.Reader (ReaderT, runReaderT)
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify) import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
import qualified Control.Monad.RWS as RWS (get) import qualified Control.Monad.RWS as RWS (get)
--import Data.OrderedMap (OrderedMap, build) import Data.OrderedMap (OrderedMap, build)
import Data.Map (Map) import Data.Map (Map, foldrWithKey, mapWithKey)
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.CMap (cMap) import PDF.CMap (cMap)
import PDF.Content (Content(..)) import PDF.Content (Content(..), Id(..), Indexed)
import qualified PDF.Content as Content (parse) import qualified PDF.Content as Content (parse)
import PDF.Content.Text (Chunks(..)) import PDF.Content.Text (Chunks(..))
import PDF.Encoding (encoding) import PDF.Encoding (encoding)
@ -45,14 +49,14 @@ import Text.Printf (printf)
type CachedFonts = Map ObjectId Font type CachedFonts = Map ObjectId Font
type FontCache m = (MonadState CachedFonts m, PDFContent m) type FontCache m = (MonadState CachedFonts m, PDFContent m)
data Page = Page { data Page = Page {
contents :: [(ObjectId, Content)] --contents :: [(ObjectId, Content)]
-- contents :: OrderedMap ObjectId Content contents :: OrderedMap ObjectId Content
, resources :: Dictionary , resources :: Dictionary
, source :: ObjectId , source :: ObjectId
} }
loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)] --loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
--loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content) loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
loadContents directObject = loadContents directObject =
sequenceA . build loadContent $ objectIds directObject sequenceA . build loadContent $ objectIds directObject
where where
@ -60,7 +64,7 @@ loadContents directObject =
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
objectIds (Array l) = l >>= getReference objectIds (Array l) = l >>= getReference
objectIds dirObj = getReference dirObj objectIds dirObj = getReference dirObj
build f = fmap (\k -> f k >>= \v -> return (k, v)) --build f = fmap (\k -> f k >>= \v -> return (k, v))
getFontDictionary :: PDFContent m => Object -> m Dictionary getFontDictionary :: PDFContent m => Object -> m Dictionary
getFontDictionary pageObj = getFontDictionary pageObj =
@ -99,7 +103,7 @@ getReference :: DirectObject -> [ObjectId]
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId] getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
getReference _ = [] getReference _ = []
loadPage :: FontCache m => ObjectId -> m Page loadPage :: PDFContent m => ObjectId -> m Page
loadPage source = do loadPage source = do
page <- objectById source page <- objectById source
contents <- getKey "Contents" page >>= loadContents contents <- getKey "Contents" page >>= loadContents
@ -119,13 +123,13 @@ newtype PageNumber = P Int
data Contents = Contents data Contents = Contents
data Text_ = Text_ data Text_ = Text_
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
where where
numbered :: [Page] -> Map Int Page numbered :: [Page] -> Map Int Page
numbered = Map.fromList . zip [1..] numbered = Map.fromList . zip [1..]
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNumber Layer Page where instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where
r (P i) layer r (P i) layer
| i < 1 = fail "Pages start at 1" | i < 1 = fail "Pages start at 1"
| otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer | otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer
@ -133,10 +137,25 @@ instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNum
firstPage [] = fail "Page is out of bounds" firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p firstPage (p:_) = loadPage p
instance Monad m => Box m Contents Page [(ObjectId, Content)] where --instance Monad m => Box m Contents Page [(ObjectId, Content)] where
--instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
r Contents = return . contents r Contents = return . contents
{-
instance FontCache m => Box m Text_ Page (OrderedMap ObjectId (Indexed Text)) where
--instance FontCache m => Box m Text_ Page [((ObjectId, Id Text), Text)] where
r Text_ (Page {contents, resources}) =
loadResources resources
>>= runReaderT (mapM (r Chunks) contents)
-- >>= runReaderT (concat <$> mapM (r Chunks . snd) contents)
where
mixIn prefix = foldrWithKey (\k v m -> ((prefix, k), v):m) []
extractChunks (objectId, content) = mixIn objectId <$> r Chunks content
-}
cacheFonts :: Monad m => StateT CachedFonts m a -> m a cacheFonts :: Monad m => StateT CachedFonts m a -> m a
cacheFonts = flip evalStateT Map.empty cacheFonts = flip evalStateT Map.empty
withResources :: FontCache m => (Page -> ReaderT FontSet m b) -> Page -> m b
withResources f p =
loadResources (resources p) >>= runReaderT (f p)