WIP: Try about everything that's possible to try, OrderedMap or [(,)], try to decouple Box instance for Content and the one for Indexed Text, breaks getText… will probably require some advanced effect library, there seems to be a weird MonadReader conflict in the errors messages
This commit is contained in:
parent
d3f1b97f3a
commit
5b8d951516
|
@ -16,7 +16,8 @@ extra-source-files: ChangeLog.md
|
||||||
cabal-version: >=1.10
|
cabal-version: >=1.10
|
||||||
|
|
||||||
library
|
library
|
||||||
exposed-modules: PDF
|
exposed-modules: Data.OrderedMap
|
||||||
|
, PDF
|
||||||
, PDF.Box
|
, PDF.Box
|
||||||
, PDF.CMap
|
, PDF.CMap
|
||||||
, PDF.Content
|
, PDF.Content
|
||||||
|
@ -29,7 +30,6 @@ library
|
||||||
, PDF.Parser
|
, PDF.Parser
|
||||||
, PDF.Pages
|
, PDF.Pages
|
||||||
other-modules: Data.ByteString.Char8.Util
|
other-modules: Data.ByteString.Char8.Util
|
||||||
, Data.OrderedMap
|
|
||||||
, PDF.Content.Operator
|
, PDF.Content.Operator
|
||||||
, PDF.Content.Operator.Color
|
, PDF.Content.Operator.Color
|
||||||
, PDF.Content.Operator.Common
|
, PDF.Content.Operator.Common
|
||||||
|
|
|
@ -1,38 +1,55 @@
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
import Control.Monad ((>=>))
|
import Control.Monad ((>=>))
|
||||||
|
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||||
|
import Control.Monad.Reader (runReaderT)
|
||||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
import Data.Map (Map, foldlWithKey, mapWithKey)
|
import Data.Map (Map, foldlWithKey, mapWithKey)
|
||||||
import qualified Data.Map as Map (empty, insert, toList, union)
|
import qualified Data.Map as Map (empty, insert, toList, union)
|
||||||
|
import Data.OrderedMap (mapi)
|
||||||
import qualified Data.Text as Text (unpack)
|
import qualified Data.Text as Text (unpack)
|
||||||
import qualified Data.Text.IO as Text (putStrLn)
|
import qualified Data.Text.IO as Text (putStrLn)
|
||||||
import PDF (UnifiedLayers(..), parseDocument)
|
import PDF (UnifiedLayers(..), parseDocument)
|
||||||
import PDF.Box (Box(..))
|
import PDF.Box (Box(..))
|
||||||
import PDF.Content.Text (Chunks(..))
|
import PDF.Content.Text (Chunks(..))
|
||||||
import PDF.Layer (Layer)
|
import PDF.Layer (Layer)
|
||||||
import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts)
|
import PDF.Output (ObjectId(..))
|
||||||
|
import PDF.Pages (
|
||||||
|
Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), Text_(..), cacheFonts, withResources
|
||||||
|
)
|
||||||
import System.Environment (getArgs)
|
import System.Environment (getArgs)
|
||||||
import System.Exit (die)
|
import System.Exit (die)
|
||||||
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
displayPage :: Page -> IO ()
|
displayPage :: (MonadIO m, FontCache m) => Page -> m ()
|
||||||
displayPage =
|
displayPage =
|
||||||
r Contents
|
withResources (r Contents)
|
||||||
>=> mapM (\(objectId, content) -> r Chunks content >>= return . mixIn objectId)
|
>=> sequence_ . mapi (\objectId content ->
|
||||||
>=> fusion
|
r Chunks content -- >=> sequence_ . mapWithKey (display objectId)
|
||||||
>=> sequence_ . mapWithKey (\k v -> putStrLn $ printf "%s: %s" (show k) (Text.unpack v))
|
)
|
||||||
|
where
|
||||||
|
display = undefined
|
||||||
|
|
||||||
|
{-
|
||||||
|
(
|
||||||
|
r Contents :: ReaderT FontSet m (OrderedMap ObjectId Content)
|
||||||
|
-- >=> sequenceA . mapi $ \(objectId, content) -> undefined
|
||||||
|
)
|
||||||
|
--sequenceA $ mapWithKey (display objectId) (r Chunks content)
|
||||||
-- >=> mapM_ Text.putStrLn
|
-- >=> mapM_ Text.putStrLn
|
||||||
where
|
where
|
||||||
mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v
|
display a b v =
|
||||||
mixIn prefix =
|
liftIO . putStrLn $
|
||||||
foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty
|
printf "%d@%s: %s" (getObjectId a) (show b) (Text.unpack v)
|
||||||
fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b)
|
-}
|
||||||
fusion = return . foldl Map.union Map.empty
|
|
||||||
|
|
||||||
getAll :: Layer -> IO ()
|
getAll :: Layer -> IO ()
|
||||||
getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList
|
getAll layer =
|
||||||
|
r Pages layer
|
||||||
|
>>= flip runReaderT layer . cacheFonts . mapM_ (displayPage . snd) . Map.toList
|
||||||
|
|
||||||
get :: Int -> Layer -> IO ()
|
get :: Int -> Layer -> IO ()
|
||||||
get n layer = (cacheFonts $ r (P n) layer) >>= displayPage
|
get n layer = r (P n) layer >>= flip runReaderT layer . cacheFonts . displayPage
|
||||||
|
|
||||||
onDoc :: FilePath -> (Layer -> IO ()) -> IO ()
|
onDoc :: FilePath -> (Layer -> IO ()) -> IO ()
|
||||||
onDoc inputFile f = do
|
onDoc inputFile f = do
|
||||||
|
|
|
@ -7,24 +7,28 @@
|
||||||
{-# LANGUAGE UndecidableInstances #-}
|
{-# LANGUAGE UndecidableInstances #-}
|
||||||
module PDF.Pages (
|
module PDF.Pages (
|
||||||
Contents(..)
|
Contents(..)
|
||||||
|
, FontCache
|
||||||
, Page(..)
|
, Page(..)
|
||||||
, PageNumber(..)
|
, PageNumber(..)
|
||||||
, Pages(..)
|
, Pages(..)
|
||||||
|
, Text_(..)
|
||||||
, cacheFonts
|
, cacheFonts
|
||||||
|
, withResources
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative (Alternative, (<|>))
|
import Control.Applicative (Alternative, (<|>))
|
||||||
import Control.Monad (foldM)
|
import Control.Monad (foldM)
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.Reader (runReaderT)
|
import Control.Monad.Reader (ReaderT, runReaderT)
|
||||||
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
|
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
|
||||||
import qualified Control.Monad.RWS as RWS (get)
|
import qualified Control.Monad.RWS as RWS (get)
|
||||||
--import Data.OrderedMap (OrderedMap, build)
|
import Data.OrderedMap (OrderedMap, build)
|
||||||
import Data.Map (Map)
|
import Data.Map (Map, foldrWithKey, mapWithKey)
|
||||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||||
|
import Data.Text (Text)
|
||||||
import PDF.Box (Box(..))
|
import PDF.Box (Box(..))
|
||||||
import PDF.CMap (cMap)
|
import PDF.CMap (cMap)
|
||||||
import PDF.Content (Content(..))
|
import PDF.Content (Content(..), Id(..), Indexed)
|
||||||
import qualified PDF.Content as Content (parse)
|
import qualified PDF.Content as Content (parse)
|
||||||
import PDF.Content.Text (Chunks(..))
|
import PDF.Content.Text (Chunks(..))
|
||||||
import PDF.Encoding (encoding)
|
import PDF.Encoding (encoding)
|
||||||
|
@ -45,14 +49,14 @@ import Text.Printf (printf)
|
||||||
type CachedFonts = Map ObjectId Font
|
type CachedFonts = Map ObjectId Font
|
||||||
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
||||||
data Page = Page {
|
data Page = Page {
|
||||||
contents :: [(ObjectId, Content)]
|
--contents :: [(ObjectId, Content)]
|
||||||
-- contents :: OrderedMap ObjectId Content
|
contents :: OrderedMap ObjectId Content
|
||||||
, resources :: Dictionary
|
, resources :: Dictionary
|
||||||
, source :: ObjectId
|
, source :: ObjectId
|
||||||
}
|
}
|
||||||
|
|
||||||
loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
|
--loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
|
||||||
--loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
||||||
loadContents directObject =
|
loadContents directObject =
|
||||||
sequenceA . build loadContent $ objectIds directObject
|
sequenceA . build loadContent $ objectIds directObject
|
||||||
where
|
where
|
||||||
|
@ -60,7 +64,7 @@ loadContents directObject =
|
||||||
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
||||||
objectIds (Array l) = l >>= getReference
|
objectIds (Array l) = l >>= getReference
|
||||||
objectIds dirObj = getReference dirObj
|
objectIds dirObj = getReference dirObj
|
||||||
build f = fmap (\k -> f k >>= \v -> return (k, v))
|
--build f = fmap (\k -> f k >>= \v -> return (k, v))
|
||||||
|
|
||||||
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
||||||
getFontDictionary pageObj =
|
getFontDictionary pageObj =
|
||||||
|
@ -99,7 +103,7 @@ getReference :: DirectObject -> [ObjectId]
|
||||||
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
||||||
getReference _ = []
|
getReference _ = []
|
||||||
|
|
||||||
loadPage :: FontCache m => ObjectId -> m Page
|
loadPage :: PDFContent m => ObjectId -> m Page
|
||||||
loadPage source = do
|
loadPage source = do
|
||||||
page <- objectById source
|
page <- objectById source
|
||||||
contents <- getKey "Contents" page >>= loadContents
|
contents <- getKey "Contents" page >>= loadContents
|
||||||
|
@ -119,13 +123,13 @@ newtype PageNumber = P Int
|
||||||
data Contents = Contents
|
data Contents = Contents
|
||||||
data Text_ = Text_
|
data Text_ = Text_
|
||||||
|
|
||||||
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
|
instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where
|
||||||
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
||||||
where
|
where
|
||||||
numbered :: [Page] -> Map Int Page
|
numbered :: [Page] -> Map Int Page
|
||||||
numbered = Map.fromList . zip [1..]
|
numbered = Map.fromList . zip [1..]
|
||||||
|
|
||||||
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNumber Layer Page where
|
instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where
|
||||||
r (P i) layer
|
r (P i) layer
|
||||||
| i < 1 = fail "Pages start at 1"
|
| i < 1 = fail "Pages start at 1"
|
||||||
| otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer
|
| otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer
|
||||||
|
@ -133,10 +137,25 @@ instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNum
|
||||||
firstPage [] = fail "Page is out of bounds"
|
firstPage [] = fail "Page is out of bounds"
|
||||||
firstPage (p:_) = loadPage p
|
firstPage (p:_) = loadPage p
|
||||||
|
|
||||||
instance Monad m => Box m Contents Page [(ObjectId, Content)] where
|
--instance Monad m => Box m Contents Page [(ObjectId, Content)] where
|
||||||
--instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
|
instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
|
||||||
r Contents = return . contents
|
r Contents = return . contents
|
||||||
|
|
||||||
|
{-
|
||||||
|
instance FontCache m => Box m Text_ Page (OrderedMap ObjectId (Indexed Text)) where
|
||||||
|
--instance FontCache m => Box m Text_ Page [((ObjectId, Id Text), Text)] where
|
||||||
|
r Text_ (Page {contents, resources}) =
|
||||||
|
loadResources resources
|
||||||
|
>>= runReaderT (mapM (r Chunks) contents)
|
||||||
|
-- >>= runReaderT (concat <$> mapM (r Chunks . snd) contents)
|
||||||
|
where
|
||||||
|
mixIn prefix = foldrWithKey (\k v m -> ((prefix, k), v):m) []
|
||||||
|
extractChunks (objectId, content) = mixIn objectId <$> r Chunks content
|
||||||
|
-}
|
||||||
|
|
||||||
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
||||||
cacheFonts = flip evalStateT Map.empty
|
cacheFonts = flip evalStateT Map.empty
|
||||||
|
|
||||||
|
withResources :: FontCache m => (Page -> ReaderT FontSet m b) -> Page -> m b
|
||||||
|
withResources f p =
|
||||||
|
loadResources (resources p) >>= runReaderT (f p)
|
||||||
|
|
Loading…
Reference in New Issue