WIP: Try about everything that's possible to try, OrderedMap or [(,)], try to decouple Box instance for Content and the one for Indexed Text, breaks getText… will probably require some advanced effect library, there seems to be a weird MonadReader conflict in the errors messages

This commit is contained in:
Tissevert 2020-03-11 18:55:18 +01:00
parent d3f1b97f3a
commit 5b8d951516
3 changed files with 65 additions and 29 deletions

View File

@ -16,7 +16,8 @@ extra-source-files: ChangeLog.md
cabal-version: >=1.10
library
exposed-modules: PDF
exposed-modules: Data.OrderedMap
, PDF
, PDF.Box
, PDF.CMap
, PDF.Content
@ -29,7 +30,6 @@ library
, PDF.Parser
, PDF.Pages
other-modules: Data.ByteString.Char8.Util
, Data.OrderedMap
, PDF.Content.Operator
, PDF.Content.Operator.Color
, PDF.Content.Operator.Common

View File

@ -1,38 +1,55 @@
{-# LANGUAGE FlexibleContexts #-}
import Control.Monad ((>=>))
import Control.Monad.IO.Class (MonadIO, liftIO)
import Control.Monad.Reader (runReaderT)
import qualified Data.ByteString.Char8 as BS (readFile)
import Data.Map (Map, foldlWithKey, mapWithKey)
import qualified Data.Map as Map (empty, insert, toList, union)
import Data.OrderedMap (mapi)
import qualified Data.Text as Text (unpack)
import qualified Data.Text.IO as Text (putStrLn)
import PDF (UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer)
import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts)
import PDF.Output (ObjectId(..))
import PDF.Pages (
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), Text_(..), cacheFonts, withResources
)
import System.Environment (getArgs)
import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout)
import Text.Printf (printf)
displayPage :: Page -> IO ()
displayPage :: (MonadIO m, FontCache m) => Page -> m ()
displayPage =
r Contents
>=> mapM (\(objectId, content) -> r Chunks content >>= return . mixIn objectId)
>=> fusion
>=> sequence_ . mapWithKey (\k v -> putStrLn $ printf "%s: %s" (show k) (Text.unpack v))
withResources (r Contents)
>=> sequence_ . mapi (\objectId content ->
r Chunks content -- >=> sequence_ . mapWithKey (display objectId)
)
where
display = undefined
{-
(
r Contents :: ReaderT FontSet m (OrderedMap ObjectId Content)
-- >=> sequenceA . mapi $ \(objectId, content) -> undefined
)
--sequenceA $ mapWithKey (display objectId) (r Chunks content)
-- >=> mapM_ Text.putStrLn
where
mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v
mixIn prefix =
foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty
fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b)
fusion = return . foldl Map.union Map.empty
display a b v =
liftIO . putStrLn $
printf "%d@%s: %s" (getObjectId a) (show b) (Text.unpack v)
-}
getAll :: Layer -> IO ()
getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList
getAll layer =
r Pages layer
>>= flip runReaderT layer . cacheFonts . mapM_ (displayPage . snd) . Map.toList
get :: Int -> Layer -> IO ()
get n layer = (cacheFonts $ r (P n) layer) >>= displayPage
get n layer = r (P n) layer >>= flip runReaderT layer . cacheFonts . displayPage
onDoc :: FilePath -> (Layer -> IO ()) -> IO ()
onDoc inputFile f = do

View File

@ -7,24 +7,28 @@
{-# LANGUAGE UndecidableInstances #-}
module PDF.Pages (
Contents(..)
, FontCache
, Page(..)
, PageNumber(..)
, Pages(..)
, Text_(..)
, cacheFonts
, withResources
) where
import Control.Applicative (Alternative, (<|>))
import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (runReaderT)
import Control.Monad.Reader (ReaderT, runReaderT)
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
import qualified Control.Monad.RWS as RWS (get)
--import Data.OrderedMap (OrderedMap, build)
import Data.Map (Map)
import Data.OrderedMap (OrderedMap, build)
import Data.Map (Map, foldrWithKey, mapWithKey)
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.Box (Box(..))
import PDF.CMap (cMap)
import PDF.Content (Content(..))
import PDF.Content (Content(..), Id(..), Indexed)
import qualified PDF.Content as Content (parse)
import PDF.Content.Text (Chunks(..))
import PDF.Encoding (encoding)
@ -45,14 +49,14 @@ import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type FontCache m = (MonadState CachedFonts m, PDFContent m)
data Page = Page {
contents :: [(ObjectId, Content)]
-- contents :: OrderedMap ObjectId Content
--contents :: [(ObjectId, Content)]
contents :: OrderedMap ObjectId Content
, resources :: Dictionary
, source :: ObjectId
}
loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
--loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
--loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
loadContents directObject =
sequenceA . build loadContent $ objectIds directObject
where
@ -60,7 +64,7 @@ loadContents directObject =
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
objectIds (Array l) = l >>= getReference
objectIds dirObj = getReference dirObj
build f = fmap (\k -> f k >>= \v -> return (k, v))
--build f = fmap (\k -> f k >>= \v -> return (k, v))
getFontDictionary :: PDFContent m => Object -> m Dictionary
getFontDictionary pageObj =
@ -99,7 +103,7 @@ getReference :: DirectObject -> [ObjectId]
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
getReference _ = []
loadPage :: FontCache m => ObjectId -> m Page
loadPage :: PDFContent m => ObjectId -> m Page
loadPage source = do
page <- objectById source
contents <- getKey "Contents" page >>= loadContents
@ -119,13 +123,13 @@ newtype PageNumber = P Int
data Contents = Contents
data Text_ = Text_
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
where
numbered :: [Page] -> Map Int Page
numbered = Map.fromList . zip [1..]
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNumber Layer Page where
instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where
r (P i) layer
| i < 1 = fail "Pages start at 1"
| otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer
@ -133,10 +137,25 @@ instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNum
firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p
instance Monad m => Box m Contents Page [(ObjectId, Content)] where
--instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
--instance Monad m => Box m Contents Page [(ObjectId, Content)] where
instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
r Contents = return . contents
{-
instance FontCache m => Box m Text_ Page (OrderedMap ObjectId (Indexed Text)) where
--instance FontCache m => Box m Text_ Page [((ObjectId, Id Text), Text)] where
r Text_ (Page {contents, resources}) =
loadResources resources
>>= runReaderT (mapM (r Chunks) contents)
-- >>= runReaderT (concat <$> mapM (r Chunks . snd) contents)
where
mixIn prefix = foldrWithKey (\k v m -> ((prefix, k), v):m) []
extractChunks (objectId, content) = mixIn objectId <$> r Chunks content
-}
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
cacheFonts = flip evalStateT Map.empty
withResources :: FontCache m => (Page -> ReaderT FontSet m b) -> Page -> m b
withResources f p =
loadResources (resources p) >>= runReaderT (f p)