From dce10ae63a2168c1056d732493c81265a3173df0 Mon Sep 17 00:00:00 2001 From: Tissevert Date: Sun, 8 Mar 2020 22:18:47 +0100 Subject: [PATCH] Keep Page as only a reference object keeping the ObjectId explicit so we can modify the actual objects one day, write an OrderedMap data structure to help --- Hufflepdf.cabal | 1 + src/Data/OrderedMap.hs | 67 ++++++++++++++++++++++++++++++++++ src/PDF/Object/Navigation.hs | 11 +++--- src/PDF/Pages.hs | 69 ++++++++++++++++++------------------ 4 files changed, 108 insertions(+), 40 deletions(-) create mode 100644 src/Data/OrderedMap.hs diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index c6e2477..1df20eb 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -28,6 +28,7 @@ library , PDF.Parser , PDF.Pages other-modules: Data.ByteString.Char8.Util + , Data.OrderedMap , PDF.Content.Operator , PDF.Content.Operator.Color , PDF.Content.Operator.Common diff --git a/src/Data/OrderedMap.hs b/src/Data/OrderedMap.hs new file mode 100644 index 0000000..09f37ae --- /dev/null +++ b/src/Data/OrderedMap.hs @@ -0,0 +1,67 @@ +{-# LANGUAGE NamedFieldPuns #-} +module Data.OrderedMap ( + OrderedMap + , build + , elems + , fromList + , get + , keys + , lookup + , toList + ) where + +import Data.Map (Map, (!)) +import qualified Data.Map as Map (fromList, lookup) +import Prelude hiding (lookup) + +data OrderedMap k a = OrderedMap { + assoc :: Map k a + , keys :: [k] + } + +instance (Ord k, Show k, Show a) => Show (OrderedMap k a) where + show = show . toList + +instance Functor (OrderedMap k) where + fmap f orderedMap = orderedMap {assoc = fmap f (assoc orderedMap)} + +instance Ord k => Foldable (OrderedMap k) where + foldMap f (OrderedMap {assoc, keys}) = foldMap f $ (assoc !) <$> keys + +instance Ord k => Traversable (OrderedMap k) where + sequenceA (OrderedMap {assoc, keys}) = + (flip OrderedMap keys) <$> sequenceA assoc + +elems :: Ord k => OrderedMap k a -> [a] +elems (OrderedMap {assoc, keys}) = (assoc !) <$> keys + +toList :: Ord k => OrderedMap k a -> [(k, a)] +toList (OrderedMap {assoc, keys}) = (\k -> (k, assoc ! k)) <$> keys + +fromList :: Ord k => [(k, a)] -> OrderedMap k a +fromList keyValueList = OrderedMap { + assoc = Map.fromList keyValueList + , keys = fst <$> keyValueList + } + +build :: Ord k => (k -> a) -> [k] -> OrderedMap k a +build f keys = OrderedMap { + assoc = Map.fromList $ (\k -> (k, f k)) <$> keys + , keys + } + +get :: Ord k => k -> OrderedMap k a -> a +get k = (! k) . assoc + +lookup :: Ord k => k -> OrderedMap k a -> Maybe a +lookup k = (Map.lookup k) . assoc + +{- +cons :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a +cons k a orderedMap = + +snoc :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a + +alter :: Ord k => (Maybe a -> Maybe a) -> k -> OrderedMap k a -> OrderedMap k a +alter +-} diff --git a/src/PDF/Object/Navigation.hs b/src/PDF/Object/Navigation.hs index 5a7e977..be967c5 100644 --- a/src/PDF/Object/Navigation.hs +++ b/src/PDF/Object/Navigation.hs @@ -15,6 +15,7 @@ module PDF.Object.Navigation ( , (>//) , castObject , getDictionary + , getKey , objectById , origin ) where @@ -56,12 +57,12 @@ getDictionary obj = expected "dictionary : " obj expected :: (MonadFail m, Show a) => String -> a -> m b expected name = fail . printf "Not a %s: %s" name . show -getField :: MonadFail m => String -> Dictionary -> m DirectObject -getField key aDictionary = - maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary) +getKey :: PDFContent m => String -> Object -> m DirectObject +getKey key object = getDictionary object >>= catchMaybe . Map.lookup (Name key) where errorMessage = - printf "Key %s not found in dictionary %s" key (show aDictionary) + printf "Key %s not found in dictionary %s" key (show object) + catchMaybe = maybe (fail errorMessage) return objectById :: PDFContent m => ObjectId -> m Object objectById objectId = do @@ -69,7 +70,7 @@ objectById objectId = do return (objects layer ! objectId) (./) :: PDFContent m => m Object -> Component -> m Object -(./) object key = (object >>= getDictionary >>= getField key >>= castObject) +(./) object key = (object >>= getKey key >>= castObject) castObject :: PDFContent m => DirectObject -> m Object castObject (Reference (IndirectObjCoordinates {objectId})) = diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs index 92a76d2..db0bd31 100755 --- a/src/PDF/Pages.hs +++ b/src/PDF/Pages.hs @@ -18,14 +18,13 @@ import Control.Monad.Fail (MonadFail(..)) import Control.Monad.Reader (runReaderT) import Control.Monad.State (MonadState, StateT(..), evalStateT, modify) import qualified Control.Monad.RWS as RWS (get) +import Data.OrderedMap (OrderedMap, build) import Data.Map (Map) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) -import Data.Text (Text) import PDF.Box (Box(..)) import PDF.CMap (cMap) import PDF.Content (Content(..)) import qualified PDF.Content as Content (parse) -import PDF.Content.Text (renderText) import PDF.Encoding (encoding) import PDF.Font (Font, FontSet) import PDF.Layer (Layer(..)) @@ -34,8 +33,8 @@ import PDF.Object ( , Name(..), Object(..) ,) import PDF.Object.Navigation ( - Clear(..), PDFContent, (//), (>./), (>//), castObject, getDictionary - , objectById, origin + Clear(..), PDFContent, (//), (>./), (>//), getDictionary + , getKey, objectById, origin ) import PDF.Output (ObjectId(..)) import Prelude hiding (fail) @@ -44,16 +43,25 @@ import Text.Printf (printf) type CachedFonts = Map ObjectId Font type FontCache m = (MonadState CachedFonts m, PDFContent m) data Page = Page { - contents :: [Content] + contents :: OrderedMap ObjectId Content + , resources :: Dictionary , source :: ObjectId } -{- +loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content) +loadContents directObject = + sequenceA . build loadContent $ objectIds directObject + where + loadContent :: PDFContent m => ObjectId -> m Content + loadContent objectId = objectById objectId >>= r Clear >>= Content.parse + objectIds (Array l) = l >>= getReference + objectIds dirObj = getReference dirObj + getFontDictionary :: PDFContent m => Object -> m Dictionary getFontDictionary pageObj = - (pageObj >// ["Resources", "Font"] >>= getDictionary) - <|> return Map.empty + (pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty +{- cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font cache loader objectId = (maybe load return . Map.lookup objectId) =<< RWS.get @@ -76,45 +84,36 @@ loadFont objectId = objectById objectId >>= tryMappings loadEncoding object = fail $ printf "Encoding must be a name, not that : %s" $ show object -loadFonts :: FontCache m => Dictionary -> m FontSet -loadFonts = foldM addFont Map.empty . Map.toList +loadResources :: FontCache m => Dictionary -> m FontSet +loadResources = foldM addFont Map.empty . Map.toList where addFont output (name, Reference (IndirectObjCoordinates {objectId})) = flip (Map.insert name) output <$> cache loadFont objectId addFont output _ = return output -} -pagesList :: PDFContent m => m [ObjectId] -pagesList = do - pages <- origin // ["Root", "Pages"] >>= getDictionary - case Map.lookup (Name "Kids") pages of - Just (Array kids) -> return $ getReferences kids - _ -> return [] - -getReferences :: [DirectObject] -> [ObjectId] -getReferences objects = do - object <- objects - case object of - Reference (IndirectObjCoordinates {objectId}) -> [objectId] - _ -> [] - -extractContent :: FontCache m => Object -> m [Content] -extractContent pageObj = do - --fonts <- loadFonts =<< getFontDictionary pageObj - pageObj >./ "Contents" >>= several >>= mapM loadContent - where - loadContent object = r Clear object >>= (either fail return . Content.parse) - several (Direct (Array l)) = mapM castObject l - several object = return [object] - -- >>= renderText fonts +getReference :: DirectObject -> [ObjectId] +getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId] +getReference _ = [] loadPage :: FontCache m => ObjectId -> m Page loadPage source = do - contents <- extractContent =<< objectById source - return $ Page {contents, source} + page <- objectById source + contents <- getKey "Contents" page >>= loadContents + resources <- getFontDictionary page + return $ Page {contents, resources, source} + +pagesList :: PDFContent m => m [ObjectId] +pagesList = + (origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences) + <|> return [] + where + getReferences (Array kids) = kids >>= getReference + getReferences _ = fail "Not a pages array" data Pages = Pages newtype PageNumber = P Int +data Contents = Contents instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer