diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index 1df20eb..83d83ac 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -20,6 +20,7 @@ library , PDF.Box , PDF.CMap , PDF.Content + , PDF.Content.Text , PDF.EOL , PDF.Layer , PDF.Object @@ -35,7 +36,6 @@ library , PDF.Content.Operator.GraphicState , PDF.Content.Operator.Path , PDF.Content.Operator.Text - , PDF.Content.Text , PDF.Encoding , PDF.Encoding.MacRoman , PDF.Body diff --git a/examples/getText.hs b/examples/getText.hs index ba73131..fe3ab83 100644 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -1,17 +1,32 @@ import Control.Monad ((>=>)) import qualified Data.ByteString.Char8 as BS (readFile) -import qualified Data.Map as Map (toList) +import Data.Map (Map, foldlWithKey, mapWithKey) +import qualified Data.Map as Map (empty, insert, toList, union) +import qualified Data.Text as Text (unpack) import qualified Data.Text.IO as Text (putStrLn) import PDF (UnifiedLayers(..), parseDocument) import PDF.Box (Box(..)) +import PDF.Content.Text (Chunks(..)) import PDF.Layer (Layer) -import PDF.Pages (Page(..), PageNumber(..), Pages(..), cacheFonts) +import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts) import System.Environment (getArgs) import System.Exit (die) import System.IO (BufferMode(..), hSetBuffering, stdout) +import Text.Printf (printf) displayPage :: Page -> IO () -displayPage = mapM_ Text.putStrLn . contents +displayPage = + r Contents + >=> mapM (\(objectId, content) -> r Chunks content >>= return . mixIn objectId) + >=> fusion + >=> sequence_ . mapWithKey (\k v -> putStrLn $ printf "%s: %s" (show k) (Text.unpack v)) + -- >=> mapM_ Text.putStrLn + where + mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v + mixIn prefix = + foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty + fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b) + fusion = return . foldl Map.union Map.empty getAll :: Layer -> IO () getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs index db0bd31..931d27a 100755 --- a/src/PDF/Pages.hs +++ b/src/PDF/Pages.hs @@ -6,7 +6,8 @@ {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE UndecidableInstances #-} module PDF.Pages ( - Page(..) + Contents(..) + , Page(..) , PageNumber(..) , Pages(..) , cacheFonts @@ -18,13 +19,14 @@ import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Reader (runReaderT) import Control.Monad.State (MonadState, StateT(..), evalStateT, modify) import qualified Control.Monad.RWS as RWS (get) -import Data.OrderedMap (OrderedMap, build) +--import Data.OrderedMap (OrderedMap, build) import Data.Map (Map) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import PDF.Box (Box(..)) import PDF.CMap (cMap) import PDF.Content (Content(..)) import qualified PDF.Content as Content (parse) +import PDF.Content.Text (Chunks(..)) import PDF.Encoding (encoding) import PDF.Font (Font, FontSet) import PDF.Layer (Layer(..)) @@ -43,12 +45,14 @@ import Text.Printf (printf) type CachedFonts = Map ObjectId Font type FontCache m = (MonadState CachedFonts m, PDFContent m) data Page = Page { - contents :: OrderedMap ObjectId Content + contents :: [(ObjectId, Content)] + -- contents :: OrderedMap ObjectId Content , resources :: Dictionary , source :: ObjectId } -loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content) +loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)] +--loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content) loadContents directObject = sequenceA . build loadContent $ objectIds directObject where @@ -56,12 +60,12 @@ loadContents directObject = loadContent objectId = objectById objectId >>= r Clear >>= Content.parse objectIds (Array l) = l >>= getReference objectIds dirObj = getReference dirObj + build f = fmap (\k -> f k >>= \v -> return (k, v)) getFontDictionary :: PDFContent m => Object -> m Dictionary getFontDictionary pageObj = (pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty -{- cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font cache loader objectId = (maybe load return . Map.lookup objectId) =<< RWS.get @@ -90,7 +94,6 @@ loadResources = foldM addFont Map.empty . Map.toList addFont output (name, Reference (IndirectObjCoordinates {objectId})) = flip (Map.insert name) output <$> cache loadFont objectId addFont output _ = return output --} getReference :: DirectObject -> [ObjectId] getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId] @@ -114,6 +117,7 @@ pagesList = data Pages = Pages newtype PageNumber = P Int data Contents = Contents +data Text_ = Text_ instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer @@ -129,5 +133,10 @@ instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNum firstPage [] = fail "Page is out of bounds" firstPage (p:_) = loadPage p +instance Monad m => Box m Contents Page [(ObjectId, Content)] where +--instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where + r Contents = return . contents + + cacheFonts :: Monad m => StateT CachedFonts m a -> m a cacheFonts = flip evalStateT Map.empty