Keep Page as only a reference object keeping the ObjectId explicit so we can modify the actual objects one day, write an OrderedMap data structure to help

This commit is contained in:
Tissevert 2020-03-08 22:18:47 +01:00
parent f2986da96d
commit dce10ae63a
4 changed files with 108 additions and 40 deletions

View file

@ -28,6 +28,7 @@ library
, PDF.Parser , PDF.Parser
, PDF.Pages , PDF.Pages
other-modules: Data.ByteString.Char8.Util other-modules: Data.ByteString.Char8.Util
, Data.OrderedMap
, PDF.Content.Operator , PDF.Content.Operator
, PDF.Content.Operator.Color , PDF.Content.Operator.Color
, PDF.Content.Operator.Common , PDF.Content.Operator.Common

67
src/Data/OrderedMap.hs Normal file
View file

@ -0,0 +1,67 @@
{-# LANGUAGE NamedFieldPuns #-}
module Data.OrderedMap (
OrderedMap
, build
, elems
, fromList
, get
, keys
, lookup
, toList
) where
import Data.Map (Map, (!))
import qualified Data.Map as Map (fromList, lookup)
import Prelude hiding (lookup)
data OrderedMap k a = OrderedMap {
assoc :: Map k a
, keys :: [k]
}
instance (Ord k, Show k, Show a) => Show (OrderedMap k a) where
show = show . toList
instance Functor (OrderedMap k) where
fmap f orderedMap = orderedMap {assoc = fmap f (assoc orderedMap)}
instance Ord k => Foldable (OrderedMap k) where
foldMap f (OrderedMap {assoc, keys}) = foldMap f $ (assoc !) <$> keys
instance Ord k => Traversable (OrderedMap k) where
sequenceA (OrderedMap {assoc, keys}) =
(flip OrderedMap keys) <$> sequenceA assoc
elems :: Ord k => OrderedMap k a -> [a]
elems (OrderedMap {assoc, keys}) = (assoc !) <$> keys
toList :: Ord k => OrderedMap k a -> [(k, a)]
toList (OrderedMap {assoc, keys}) = (\k -> (k, assoc ! k)) <$> keys
fromList :: Ord k => [(k, a)] -> OrderedMap k a
fromList keyValueList = OrderedMap {
assoc = Map.fromList keyValueList
, keys = fst <$> keyValueList
}
build :: Ord k => (k -> a) -> [k] -> OrderedMap k a
build f keys = OrderedMap {
assoc = Map.fromList $ (\k -> (k, f k)) <$> keys
, keys
}
get :: Ord k => k -> OrderedMap k a -> a
get k = (! k) . assoc
lookup :: Ord k => k -> OrderedMap k a -> Maybe a
lookup k = (Map.lookup k) . assoc
{-
cons :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
cons k a orderedMap =
snoc :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
alter :: Ord k => (Maybe a -> Maybe a) -> k -> OrderedMap k a -> OrderedMap k a
alter
-}

View file

@ -15,6 +15,7 @@ module PDF.Object.Navigation (
, (>//) , (>//)
, castObject , castObject
, getDictionary , getDictionary
, getKey
, objectById , objectById
, origin , origin
) where ) where
@ -56,12 +57,12 @@ getDictionary obj = expected "dictionary : " obj
expected :: (MonadFail m, Show a) => String -> a -> m b expected :: (MonadFail m, Show a) => String -> a -> m b
expected name = fail . printf "Not a %s: %s" name . show expected name = fail . printf "Not a %s: %s" name . show
getField :: MonadFail m => String -> Dictionary -> m DirectObject getKey :: PDFContent m => String -> Object -> m DirectObject
getField key aDictionary = getKey key object = getDictionary object >>= catchMaybe . Map.lookup (Name key)
maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary)
where where
errorMessage = errorMessage =
printf "Key %s not found in dictionary %s" key (show aDictionary) printf "Key %s not found in dictionary %s" key (show object)
catchMaybe = maybe (fail errorMessage) return
objectById :: PDFContent m => ObjectId -> m Object objectById :: PDFContent m => ObjectId -> m Object
objectById objectId = do objectById objectId = do
@ -69,7 +70,7 @@ objectById objectId = do
return (objects layer ! objectId) return (objects layer ! objectId)
(./) :: PDFContent m => m Object -> Component -> m Object (./) :: PDFContent m => m Object -> Component -> m Object
(./) object key = (object >>= getDictionary >>= getField key >>= castObject) (./) object key = (object >>= getKey key >>= castObject)
castObject :: PDFContent m => DirectObject -> m Object castObject :: PDFContent m => DirectObject -> m Object
castObject (Reference (IndirectObjCoordinates {objectId})) = castObject (Reference (IndirectObjCoordinates {objectId})) =

View file

@ -18,14 +18,13 @@ import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (runReaderT) import Control.Monad.Reader (runReaderT)
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify) import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
import qualified Control.Monad.RWS as RWS (get) import qualified Control.Monad.RWS as RWS (get)
import Data.OrderedMap (OrderedMap, build)
import Data.Map (Map) import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.Box (Box(..)) import PDF.Box (Box(..))
import PDF.CMap (cMap) import PDF.CMap (cMap)
import PDF.Content (Content(..)) import PDF.Content (Content(..))
import qualified PDF.Content as Content (parse) import qualified PDF.Content as Content (parse)
import PDF.Content.Text (renderText)
import PDF.Encoding (encoding) import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet) import PDF.Font (Font, FontSet)
import PDF.Layer (Layer(..)) import PDF.Layer (Layer(..))
@ -34,8 +33,8 @@ import PDF.Object (
, Name(..), Object(..) , Name(..), Object(..)
,) ,)
import PDF.Object.Navigation ( import PDF.Object.Navigation (
Clear(..), PDFContent, (//), (>./), (>//), castObject, getDictionary Clear(..), PDFContent, (//), (>./), (>//), getDictionary
, objectById, origin , getKey, objectById, origin
) )
import PDF.Output (ObjectId(..)) import PDF.Output (ObjectId(..))
import Prelude hiding (fail) import Prelude hiding (fail)
@ -44,16 +43,25 @@ import Text.Printf (printf)
type CachedFonts = Map ObjectId Font type CachedFonts = Map ObjectId Font
type FontCache m = (MonadState CachedFonts m, PDFContent m) type FontCache m = (MonadState CachedFonts m, PDFContent m)
data Page = Page { data Page = Page {
contents :: [Content] contents :: OrderedMap ObjectId Content
, resources :: Dictionary
, source :: ObjectId , source :: ObjectId
} }
{- loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
loadContents directObject =
sequenceA . build loadContent $ objectIds directObject
where
loadContent :: PDFContent m => ObjectId -> m Content
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
objectIds (Array l) = l >>= getReference
objectIds dirObj = getReference dirObj
getFontDictionary :: PDFContent m => Object -> m Dictionary getFontDictionary :: PDFContent m => Object -> m Dictionary
getFontDictionary pageObj = getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary) (pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
<|> return Map.empty
{-
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
cache loader objectId = cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get (maybe load return . Map.lookup objectId) =<< RWS.get
@ -76,45 +84,36 @@ loadFont objectId = objectById objectId >>= tryMappings
loadEncoding object = loadEncoding object =
fail $ printf "Encoding must be a name, not that : %s" $ show object fail $ printf "Encoding must be a name, not that : %s" $ show object
loadFonts :: FontCache m => Dictionary -> m FontSet loadResources :: FontCache m => Dictionary -> m FontSet
loadFonts = foldM addFont Map.empty . Map.toList loadResources = foldM addFont Map.empty . Map.toList
where where
addFont output (name, Reference (IndirectObjCoordinates {objectId})) = addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output addFont output _ = return output
-} -}
pagesList :: PDFContent m => m [ObjectId] getReference :: DirectObject -> [ObjectId]
pagesList = do getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
pages <- origin // ["Root", "Pages"] >>= getDictionary getReference _ = []
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids
_ -> return []
getReferences :: [DirectObject] -> [ObjectId]
getReferences objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractContent :: FontCache m => Object -> m [Content]
extractContent pageObj = do
--fonts <- loadFonts =<< getFontDictionary pageObj
pageObj >./ "Contents" >>= several >>= mapM loadContent
where
loadContent object = r Clear object >>= (either fail return . Content.parse)
several (Direct (Array l)) = mapM castObject l
several object = return [object]
-- >>= renderText fonts
loadPage :: FontCache m => ObjectId -> m Page loadPage :: FontCache m => ObjectId -> m Page
loadPage source = do loadPage source = do
contents <- extractContent =<< objectById source page <- objectById source
return $ Page {contents, source} contents <- getKey "Contents" page >>= loadContents
resources <- getFontDictionary page
return $ Page {contents, resources, source}
pagesList :: PDFContent m => m [ObjectId]
pagesList =
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences)
<|> return []
where
getReferences (Array kids) = kids >>= getReference
getReferences _ = fail "Not a pages array"
data Pages = Pages data Pages = Pages
newtype PageNumber = P Int newtype PageNumber = P Int
data Contents = Contents
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer