diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index 83d83ac..1f4a6fc 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -16,7 +16,8 @@ extra-source-files: ChangeLog.md cabal-version: >=1.10 library - exposed-modules: PDF + exposed-modules: Data.OrderedMap + , PDF , PDF.Box , PDF.CMap , PDF.Content @@ -29,7 +30,6 @@ library , PDF.Parser , PDF.Pages other-modules: Data.ByteString.Char8.Util - , Data.OrderedMap , PDF.Content.Operator , PDF.Content.Operator.Color , PDF.Content.Operator.Common diff --git a/examples/getText.hs b/examples/getText.hs index fe3ab83..8326002 100644 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -1,38 +1,55 @@ +{-# LANGUAGE FlexibleContexts #-} import Control.Monad ((>=>)) +import Control.Monad.IO.Class (MonadIO, liftIO) +import Control.Monad.Reader (runReaderT) import qualified Data.ByteString.Char8 as BS (readFile) import Data.Map (Map, foldlWithKey, mapWithKey) import qualified Data.Map as Map (empty, insert, toList, union) +import Data.OrderedMap (mapi) import qualified Data.Text as Text (unpack) import qualified Data.Text.IO as Text (putStrLn) import PDF (UnifiedLayers(..), parseDocument) import PDF.Box (Box(..)) import PDF.Content.Text (Chunks(..)) import PDF.Layer (Layer) -import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts) +import PDF.Output (ObjectId(..)) +import PDF.Pages ( + Contents(..), FontCache, Page(..), PageNumber(..), Pages(..), Text_(..), cacheFonts, withResources + ) import System.Environment (getArgs) import System.Exit (die) import System.IO (BufferMode(..), hSetBuffering, stdout) import Text.Printf (printf) -displayPage :: Page -> IO () +displayPage :: (MonadIO m, FontCache m) => Page -> m () displayPage = - r Contents - >=> mapM (\(objectId, content) -> r Chunks content >>= return . mixIn objectId) - >=> fusion - >=> sequence_ . mapWithKey (\k v -> putStrLn $ printf "%s: %s" (show k) (Text.unpack v)) + withResources (r Contents) + >=> sequence_ . mapi (\objectId content -> + r Chunks content -- >=> sequence_ . mapWithKey (display objectId) + ) + where + display = undefined + +{- +( + r Contents :: ReaderT FontSet m (OrderedMap ObjectId Content) + -- >=> sequenceA . mapi $ \(objectId, content) -> undefined + ) + --sequenceA $ mapWithKey (display objectId) (r Chunks content) -- >=> mapM_ Text.putStrLn where - mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v - mixIn prefix = - foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty - fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b) - fusion = return . foldl Map.union Map.empty + display a b v = + liftIO . putStrLn $ + printf "%d@%s: %s" (getObjectId a) (show b) (Text.unpack v) +-} getAll :: Layer -> IO () -getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList +getAll layer = + r Pages layer + >>= flip runReaderT layer . cacheFonts . mapM_ (displayPage . snd) . Map.toList get :: Int -> Layer -> IO () -get n layer = (cacheFonts $ r (P n) layer) >>= displayPage +get n layer = r (P n) layer >>= flip runReaderT layer . cacheFonts . displayPage onDoc :: FilePath -> (Layer -> IO ()) -> IO () onDoc inputFile f = do diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs index 931d27a..2ec20a7 100755 --- a/src/PDF/Pages.hs +++ b/src/PDF/Pages.hs @@ -7,24 +7,28 @@ {-# LANGUAGE UndecidableInstances #-} module PDF.Pages ( Contents(..) + , FontCache , Page(..) , PageNumber(..) , Pages(..) + , Text_(..) , cacheFonts + , withResources ) where import Control.Applicative (Alternative, (<|>)) import Control.Monad (foldM) import Control.Monad.Fail (MonadFail(..)) -import Control.Monad.Reader (runReaderT) +import Control.Monad.Reader (ReaderT, runReaderT) import Control.Monad.State (MonadState, StateT(..), evalStateT, modify) import qualified Control.Monad.RWS as RWS (get) ---import Data.OrderedMap (OrderedMap, build) -import Data.Map (Map) +import Data.OrderedMap (OrderedMap, build) +import Data.Map (Map, foldrWithKey, mapWithKey) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) +import Data.Text (Text) import PDF.Box (Box(..)) import PDF.CMap (cMap) -import PDF.Content (Content(..)) +import PDF.Content (Content(..), Id(..), Indexed) import qualified PDF.Content as Content (parse) import PDF.Content.Text (Chunks(..)) import PDF.Encoding (encoding) @@ -45,14 +49,14 @@ import Text.Printf (printf) type CachedFonts = Map ObjectId Font type FontCache m = (MonadState CachedFonts m, PDFContent m) data Page = Page { - contents :: [(ObjectId, Content)] - -- contents :: OrderedMap ObjectId Content + --contents :: [(ObjectId, Content)] + contents :: OrderedMap ObjectId Content , resources :: Dictionary , source :: ObjectId } -loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)] ---loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content) +--loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)] +loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content) loadContents directObject = sequenceA . build loadContent $ objectIds directObject where @@ -60,7 +64,7 @@ loadContents directObject = loadContent objectId = objectById objectId >>= r Clear >>= Content.parse objectIds (Array l) = l >>= getReference objectIds dirObj = getReference dirObj - build f = fmap (\k -> f k >>= \v -> return (k, v)) + --build f = fmap (\k -> f k >>= \v -> return (k, v)) getFontDictionary :: PDFContent m => Object -> m Dictionary getFontDictionary pageObj = @@ -99,7 +103,7 @@ getReference :: DirectObject -> [ObjectId] getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId] getReference _ = [] -loadPage :: FontCache m => ObjectId -> m Page +loadPage :: PDFContent m => ObjectId -> m Page loadPage source = do page <- objectById source contents <- getKey "Contents" page >>= loadContents @@ -119,13 +123,13 @@ newtype PageNumber = P Int data Contents = Contents data Text_ = Text_ -instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where +instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer where numbered :: [Page] -> Map Int Page numbered = Map.fromList . zip [1..] -instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNumber Layer Page where +instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where r (P i) layer | i < 1 = fail "Pages start at 1" | otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer @@ -133,10 +137,25 @@ instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNum firstPage [] = fail "Page is out of bounds" firstPage (p:_) = loadPage p -instance Monad m => Box m Contents Page [(ObjectId, Content)] where ---instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where +--instance Monad m => Box m Contents Page [(ObjectId, Content)] where +instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where r Contents = return . contents +{- +instance FontCache m => Box m Text_ Page (OrderedMap ObjectId (Indexed Text)) where +--instance FontCache m => Box m Text_ Page [((ObjectId, Id Text), Text)] where + r Text_ (Page {contents, resources}) = + loadResources resources + >>= runReaderT (mapM (r Chunks) contents) + -- >>= runReaderT (concat <$> mapM (r Chunks . snd) contents) + where + mixIn prefix = foldrWithKey (\k v m -> ((prefix, k), v):m) [] + extractChunks (objectId, content) = mixIn objectId <$> r Chunks content +-} cacheFonts :: Monad m => StateT CachedFonts m a -> m a cacheFonts = flip evalStateT Map.empty + +withResources :: FontCache m => (Page -> ReaderT FontSet m b) -> Page -> m b +withResources f p = + loadResources (resources p) >>= runReaderT (f p)