Try connecting all the Box instance in the getText demo, try to encode pages contents with a simple assoc list

This commit is contained in:
Tissevert 2020-03-10 22:57:11 +01:00
parent a04adff1d2
commit 3b1a5152e4
3 changed files with 34 additions and 10 deletions

View file

@ -20,6 +20,7 @@ library
, PDF.Box , PDF.Box
, PDF.CMap , PDF.CMap
, PDF.Content , PDF.Content
, PDF.Content.Text
, PDF.EOL , PDF.EOL
, PDF.Layer , PDF.Layer
, PDF.Object , PDF.Object
@ -35,7 +36,6 @@ library
, PDF.Content.Operator.GraphicState , PDF.Content.Operator.GraphicState
, PDF.Content.Operator.Path , PDF.Content.Operator.Path
, PDF.Content.Operator.Text , PDF.Content.Operator.Text
, PDF.Content.Text
, PDF.Encoding , PDF.Encoding
, PDF.Encoding.MacRoman , PDF.Encoding.MacRoman
, PDF.Body , PDF.Body

View file

@ -1,17 +1,32 @@
import Control.Monad ((>=>)) import Control.Monad ((>=>))
import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.Map as Map (toList) import Data.Map (Map, foldlWithKey, mapWithKey)
import qualified Data.Map as Map (empty, insert, toList, union)
import qualified Data.Text as Text (unpack)
import qualified Data.Text.IO as Text (putStrLn) import qualified Data.Text.IO as Text (putStrLn)
import PDF (UnifiedLayers(..), parseDocument) import PDF (UnifiedLayers(..), parseDocument)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.Content.Text (Chunks(..))
import PDF.Layer (Layer) import PDF.Layer (Layer)
import PDF.Pages (Page(..), PageNumber(..), Pages(..), cacheFonts) import PDF.Pages (Contents(..), Page(..), PageNumber(..), Pages(..), cacheFonts)
import System.Environment (getArgs) import System.Environment (getArgs)
import System.Exit (die) import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout) import System.IO (BufferMode(..), hSetBuffering, stdout)
import Text.Printf (printf)
displayPage :: Page -> IO () displayPage :: Page -> IO ()
displayPage = mapM_ Text.putStrLn . contents displayPage =
r Contents
>=> mapM (\(objectId, content) -> r Chunks content >>= return . mixIn objectId)
>=> fusion
>=> sequence_ . mapWithKey (\k v -> putStrLn $ printf "%s: %s" (show k) (Text.unpack v))
-- >=> mapM_ Text.putStrLn
where
mixIn :: (Ord k1, Ord k2) => k1 -> Map k2 v -> Map (k1, k2) v
mixIn prefix =
foldlWithKey (\m k v -> Map.insert (prefix, k) v m) Map.empty
fusion :: (Monad m, Ord a) => [Map a b] -> m (Map a b)
fusion = return . foldl Map.union Map.empty
getAll :: Layer -> IO () getAll :: Layer -> IO ()
getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList getAll layer = (cacheFonts $ r Pages layer) >>= mapM_ (displayPage . snd) . Map.toList

View file

@ -6,7 +6,8 @@
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE UndecidableInstances #-} {-# LANGUAGE UndecidableInstances #-}
module PDF.Pages ( module PDF.Pages (
Page(..) Contents(..)
, Page(..)
, PageNumber(..) , PageNumber(..)
, Pages(..) , Pages(..)
, cacheFonts , cacheFonts
@ -18,13 +19,14 @@ import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (runReaderT) import Control.Monad.Reader (runReaderT)
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify) import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
import qualified Control.Monad.RWS as RWS (get) import qualified Control.Monad.RWS as RWS (get)
import Data.OrderedMap (OrderedMap, build) --import Data.OrderedMap (OrderedMap, build)
import Data.Map (Map) import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.CMap (cMap) import PDF.CMap (cMap)
import PDF.Content (Content(..)) import PDF.Content (Content(..))
import qualified PDF.Content as Content (parse) import qualified PDF.Content as Content (parse)
import PDF.Content.Text (Chunks(..))
import PDF.Encoding (encoding) import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet) import PDF.Font (Font, FontSet)
import PDF.Layer (Layer(..)) import PDF.Layer (Layer(..))
@ -43,12 +45,14 @@ import Text.Printf (printf)
type CachedFonts = Map ObjectId Font type CachedFonts = Map ObjectId Font
type FontCache m = (MonadState CachedFonts m, PDFContent m) type FontCache m = (MonadState CachedFonts m, PDFContent m)
data Page = Page { data Page = Page {
contents :: OrderedMap ObjectId Content contents :: [(ObjectId, Content)]
-- contents :: OrderedMap ObjectId Content
, resources :: Dictionary , resources :: Dictionary
, source :: ObjectId , source :: ObjectId
} }
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content) loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
--loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
loadContents directObject = loadContents directObject =
sequenceA . build loadContent $ objectIds directObject sequenceA . build loadContent $ objectIds directObject
where where
@ -56,12 +60,12 @@ loadContents directObject =
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
objectIds (Array l) = l >>= getReference objectIds (Array l) = l >>= getReference
objectIds dirObj = getReference dirObj objectIds dirObj = getReference dirObj
build f = fmap (\k -> f k >>= \v -> return (k, v))
getFontDictionary :: PDFContent m => Object -> m Dictionary getFontDictionary :: PDFContent m => Object -> m Dictionary
getFontDictionary pageObj = getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty (pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
{-
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
cache loader objectId = cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get (maybe load return . Map.lookup objectId) =<< RWS.get
@ -90,7 +94,6 @@ loadResources = foldM addFont Map.empty . Map.toList
addFont output (name, Reference (IndirectObjCoordinates {objectId})) = addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output addFont output _ = return output
-}
getReference :: DirectObject -> [ObjectId] getReference :: DirectObject -> [ObjectId]
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId] getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
@ -114,6 +117,7 @@ pagesList =
data Pages = Pages data Pages = Pages
newtype PageNumber = P Int newtype PageNumber = P Int
data Contents = Contents data Contents = Contents
data Text_ = Text_
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
@ -129,5 +133,10 @@ instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNum
firstPage [] = fail "Page is out of bounds" firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p firstPage (p:_) = loadPage p
instance Monad m => Box m Contents Page [(ObjectId, Content)] where
--instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
r Contents = return . contents
cacheFonts :: Monad m => StateT CachedFonts m a -> m a cacheFonts :: Monad m => StateT CachedFonts m a -> m a
cacheFonts = flip evalStateT Map.empty cacheFonts = flip evalStateT Map.empty