WIP: Try about everything that's possible to try, OrderedMap or [(,)], try to decouple Box instance for Content and the one for Indexed Text, breaks getText… will probably require some advanced effect library, there seems to be a weird MonadReader conflict in the errors messages
This commit is contained in:
parent
d3f1b97f3a
commit
5b8d951516
|
@ -16,7 +16,8 @@ extra-source-files: ChangeLog.md
|
|||
cabal-version: >=1.10
|
||||
|
||||
library
|
||||
exposed-modules: PDF
|
||||
exposed-modules: Data.OrderedMap
|
||||
, PDF
|
||||
, PDF.Box
|
||||
, PDF.CMap
|
||||
, PDF.Content
|
||||
|
@ -29,7 +30,6 @@ library
|
|||
, PDF.Parser
|
||||
, PDF.Pages
|
||||
other-modules: Data.ByteString.Char8.Util
|
||||
, Data.OrderedMap
|
||||
, PDF.Content.Operator
|
||||
, PDF.Content.Operator.Color
|
||||
, PDF.Content.Operator.Common
|
||||
|
|
|
@ -1,38 +1,55 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
import Control.Monad ((>=>))
|
||||
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||
import Control.Monad.Reader (runReaderT)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import Data.Map (Map, foldlWithKey, mapWithKey)
|
||||
import qualified Data.Map as Map (empty, insert, toList, union)
|
||||
import Data.OrderedMap (mapi)
|
||||
import qualified Data.Text as Text (unpack)
|
||||
import qualified Data.Text.IO as Text (putStrLn)
|
||||
import PDF (UnifiedLayers(..), parseDocument)
|
||||
import PDF.Box (Box(..))
|
||||
import PDF.Content.Text (Chunks(..))
|
||||
import PDF.Layer (Layer)
|
||||
import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts)
|
||||
import PDF.Output (ObjectId(..))
|
||||
import PDF.Pages (
|
||||
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), Text_(..), cacheFonts, withResources
|
||||
)
|
||||
import System.Environment (getArgs)
|
||||
import System.Exit (die)
|
||||
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||
import Text.Printf (printf)
|
||||
|
||||
displayPage :: Page -> IO ()
|
||||
displayPage :: (MonadIO m, FontCache m) => Page -> m ()
|
||||
displayPage =
|
||||
r Contents
|
||||
>=> mapM (\(objectId, content) -> r Chunks content >>= return . mixIn objectId)
|
||||
>=> fusion
|
||||
>=> sequence_ . mapWithKey (\k v -> putStrLn $ printf "%s: %s" (show k) (Text.unpack v))
|
||||
withResources (r Contents)
|
||||
>=> sequence_ . mapi (\objectId content ->
|
||||
r Chunks content -- >=> sequence_ . mapWithKey (display objectId)
|
||||
)
|
||||
where
|
||||
display = undefined
|
||||
|
||||
{-
|
||||
(
|
||||
r Contents :: ReaderT FontSet m (OrderedMap ObjectId Content)
|
||||
-- >=> sequenceA . mapi $ \(objectId, content) -> undefined
|
||||
)
|
||||
--sequenceA $ mapWithKey (display objectId) (r Chunks content)
|
||||
-- >=> mapM_ Text.putStrLn
|
||||
where
|
||||
mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v
|
||||
mixIn prefix =
|
||||
foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty
|
||||
fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b)
|
||||
fusion = return . foldl Map.union Map.empty
|
||||
display a b v =
|
||||
liftIO . putStrLn $
|
||||
printf "%d@%s: %s" (getObjectId a) (show b) (Text.unpack v)
|
||||
-}
|
||||
|
||||
getAll :: Layer -> IO ()
|
||||
getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList
|
||||
getAll layer =
|
||||
r Pages layer
|
||||
>>= flip runReaderT layer . cacheFonts . mapM_ (displayPage . snd) . Map.toList
|
||||
|
||||
get :: Int -> Layer -> IO ()
|
||||
get n layer = (cacheFonts $ r (P n) layer) >>= displayPage
|
||||
get n layer = r (P n) layer >>= flip runReaderT layer . cacheFonts . displayPage
|
||||
|
||||
onDoc :: FilePath -> (Layer -> IO ()) -> IO ()
|
||||
onDoc inputFile f = do
|
||||
|
|
|
@ -7,24 +7,28 @@
|
|||
{-# LANGUAGE UndecidableInstances #-}
|
||||
module PDF.Pages (
|
||||
Contents(..)
|
||||
, FontCache
|
||||
, Page(..)
|
||||
, PageNumber(..)
|
||||
, Pages(..)
|
||||
, Text_(..)
|
||||
, cacheFonts
|
||||
, withResources
|
||||
) where
|
||||
|
||||
import Control.Applicative (Alternative, (<|>))
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.Reader (runReaderT)
|
||||
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
|
||||
import qualified Control.Monad.RWS as RWS (get)
|
||||
--import Data.OrderedMap (OrderedMap, build)
|
||||
import Data.Map (Map)
|
||||
import Data.OrderedMap (OrderedMap, build)
|
||||
import Data.Map (Map, foldrWithKey, mapWithKey)
|
||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||
import Data.Text (Text)
|
||||
import PDF.Box (Box(..))
|
||||
import PDF.CMap (cMap)
|
||||
import PDF.Content (Content(..))
|
||||
import PDF.Content (Content(..), Id(..), Indexed)
|
||||
import qualified PDF.Content as Content (parse)
|
||||
import PDF.Content.Text (Chunks(..))
|
||||
import PDF.Encoding (encoding)
|
||||
|
@ -45,14 +49,14 @@ import Text.Printf (printf)
|
|||
type CachedFonts = Map ObjectId Font
|
||||
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
||||
data Page = Page {
|
||||
contents :: [(ObjectId, Content)]
|
||||
-- contents :: OrderedMap ObjectId Content
|
||||
--contents :: [(ObjectId, Content)]
|
||||
contents :: OrderedMap ObjectId Content
|
||||
, resources :: Dictionary
|
||||
, source :: ObjectId
|
||||
}
|
||||
|
||||
loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
|
||||
--loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
||||
--loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
|
||||
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
||||
loadContents directObject =
|
||||
sequenceA . build loadContent $ objectIds directObject
|
||||
where
|
||||
|
@ -60,7 +64,7 @@ loadContents directObject =
|
|||
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
||||
objectIds (Array l) = l >>= getReference
|
||||
objectIds dirObj = getReference dirObj
|
||||
build f = fmap (\k -> f k >>= \v -> return (k, v))
|
||||
--build f = fmap (\k -> f k >>= \v -> return (k, v))
|
||||
|
||||
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
||||
getFontDictionary pageObj =
|
||||
|
@ -99,7 +103,7 @@ getReference :: DirectObject -> [ObjectId]
|
|||
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
||||
getReference _ = []
|
||||
|
||||
loadPage :: FontCache m => ObjectId -> m Page
|
||||
loadPage :: PDFContent m => ObjectId -> m Page
|
||||
loadPage source = do
|
||||
page <- objectById source
|
||||
contents <- getKey "Contents" page >>= loadContents
|
||||
|
@ -119,13 +123,13 @@ newtype PageNumber = P Int
|
|||
data Contents = Contents
|
||||
data Text_ = Text_
|
||||
|
||||
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
|
||||
instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where
|
||||
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
||||
where
|
||||
numbered :: [Page] -> Map Int Page
|
||||
numbered = Map.fromList . zip [1..]
|
||||
|
||||
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNumber Layer Page where
|
||||
instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where
|
||||
r (P i) layer
|
||||
| i < 1 = fail "Pages start at 1"
|
||||
| otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer
|
||||
|
@ -133,10 +137,25 @@ instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNum
|
|||
firstPage [] = fail "Page is out of bounds"
|
||||
firstPage (p:_) = loadPage p
|
||||
|
||||
instance Monad m => Box m Contents Page [(ObjectId, Content)] where
|
||||
--instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
|
||||
--instance Monad m => Box m Contents Page [(ObjectId, Content)] where
|
||||
instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
|
||||
r Contents = return . contents
|
||||
|
||||
{-
|
||||
instance FontCache m => Box m Text_ Page (OrderedMap ObjectId (Indexed Text)) where
|
||||
--instance FontCache m => Box m Text_ Page [((ObjectId, Id Text), Text)] where
|
||||
r Text_ (Page {contents, resources}) =
|
||||
loadResources resources
|
||||
>>= runReaderT (mapM (r Chunks) contents)
|
||||
-- >>= runReaderT (concat <$> mapM (r Chunks . snd) contents)
|
||||
where
|
||||
mixIn prefix = foldrWithKey (\k v m -> ((prefix, k), v):m) []
|
||||
extractChunks (objectId, content) = mixIn objectId <$> r Chunks content
|
||||
-}
|
||||
|
||||
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
||||
cacheFonts = flip evalStateT Map.empty
|
||||
|
||||
withResources :: FontCache m => (Page -> ReaderT FontSet m b) -> Page -> m b
|
||||
withResources f p =
|
||||
loadResources (resources p) >>= runReaderT (f p)
|
||||
|
|
Loading…
Reference in New Issue