Keep Page as only a reference object keeping the ObjectId explicit so we can modify the actual objects one day, write an OrderedMap data structure to help

This commit is contained in:
Tissevert 2020-03-08 22:18:47 +01:00
parent f2986da96d
commit dce10ae63a
4 changed files with 108 additions and 40 deletions

View file

@ -28,6 +28,7 @@ library
, PDF.Parser
, PDF.Pages
other-modules: Data.ByteString.Char8.Util
, Data.OrderedMap
, PDF.Content.Operator
, PDF.Content.Operator.Color
, PDF.Content.Operator.Common

67
src/Data/OrderedMap.hs Normal file
View file

@ -0,0 +1,67 @@
{-# LANGUAGE NamedFieldPuns #-}
module Data.OrderedMap (
OrderedMap
, build
, elems
, fromList
, get
, keys
, lookup
, toList
) where
import Data.Map (Map, (!))
import qualified Data.Map as Map (fromList, lookup)
import Prelude hiding (lookup)
data OrderedMap k a = OrderedMap {
assoc :: Map k a
, keys :: [k]
}
instance (Ord k, Show k, Show a) => Show (OrderedMap k a) where
show = show . toList
instance Functor (OrderedMap k) where
fmap f orderedMap = orderedMap {assoc = fmap f (assoc orderedMap)}
instance Ord k => Foldable (OrderedMap k) where
foldMap f (OrderedMap {assoc, keys}) = foldMap f $ (assoc !) <$> keys
instance Ord k => Traversable (OrderedMap k) where
sequenceA (OrderedMap {assoc, keys}) =
(flip OrderedMap keys) <$> sequenceA assoc
elems :: Ord k => OrderedMap k a -> [a]
elems (OrderedMap {assoc, keys}) = (assoc !) <$> keys
toList :: Ord k => OrderedMap k a -> [(k, a)]
toList (OrderedMap {assoc, keys}) = (\k -> (k, assoc ! k)) <$> keys
fromList :: Ord k => [(k, a)] -> OrderedMap k a
fromList keyValueList = OrderedMap {
assoc = Map.fromList keyValueList
, keys = fst <$> keyValueList
}
build :: Ord k => (k -> a) -> [k] -> OrderedMap k a
build f keys = OrderedMap {
assoc = Map.fromList $ (\k -> (k, f k)) <$> keys
, keys
}
get :: Ord k => k -> OrderedMap k a -> a
get k = (! k) . assoc
lookup :: Ord k => k -> OrderedMap k a -> Maybe a
lookup k = (Map.lookup k) . assoc
{-
cons :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
cons k a orderedMap =
snoc :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
alter :: Ord k => (Maybe a -> Maybe a) -> k -> OrderedMap k a -> OrderedMap k a
alter
-}

View file

@ -15,6 +15,7 @@ module PDF.Object.Navigation (
, (>//)
, castObject
, getDictionary
, getKey
, objectById
, origin
) where
@ -56,12 +57,12 @@ getDictionary obj = expected "dictionary : " obj
expected :: (MonadFail m, Show a) => String -> a -> m b
expected name = fail . printf "Not a %s: %s" name . show
getField :: MonadFail m => String -> Dictionary -> m DirectObject
getField key aDictionary =
maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary)
getKey :: PDFContent m => String -> Object -> m DirectObject
getKey key object = getDictionary object >>= catchMaybe . Map.lookup (Name key)
where
errorMessage =
printf "Key %s not found in dictionary %s" key (show aDictionary)
printf "Key %s not found in dictionary %s" key (show object)
catchMaybe = maybe (fail errorMessage) return
objectById :: PDFContent m => ObjectId -> m Object
objectById objectId = do
@ -69,7 +70,7 @@ objectById objectId = do
return (objects layer ! objectId)
(./) :: PDFContent m => m Object -> Component -> m Object
(./) object key = (object >>= getDictionary >>= getField key >>= castObject)
(./) object key = (object >>= getKey key >>= castObject)
castObject :: PDFContent m => DirectObject -> m Object
castObject (Reference (IndirectObjCoordinates {objectId})) =

View file

@ -18,14 +18,13 @@ import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (runReaderT)
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.OrderedMap (OrderedMap, build)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.Box (Box(..))
import PDF.CMap (cMap)
import PDF.Content (Content(..))
import qualified PDF.Content as Content (parse)
import PDF.Content.Text (renderText)
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Layer (Layer(..))
@ -34,8 +33,8 @@ import PDF.Object (
, Name(..), Object(..)
,)
import PDF.Object.Navigation (
Clear(..), PDFContent, (//), (>./), (>//), castObject, getDictionary
, objectById, origin
Clear(..), PDFContent, (//), (>./), (>//), getDictionary
, getKey, objectById, origin
)
import PDF.Output (ObjectId(..))
import Prelude hiding (fail)
@ -44,16 +43,25 @@ import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type FontCache m = (MonadState CachedFonts m, PDFContent m)
data Page = Page {
contents :: [Content]
contents :: OrderedMap ObjectId Content
, resources :: Dictionary
, source :: ObjectId
}
{-
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
loadContents directObject =
sequenceA . build loadContent $ objectIds directObject
where
loadContent :: PDFContent m => ObjectId -> m Content
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
objectIds (Array l) = l >>= getReference
objectIds dirObj = getReference dirObj
getFontDictionary :: PDFContent m => Object -> m Dictionary
getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary)
<|> return Map.empty
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
{-
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
@ -76,45 +84,36 @@ loadFont objectId = objectById objectId >>= tryMappings
loadEncoding object =
fail $ printf "Encoding must be a name, not that : %s" $ show object
loadFonts :: FontCache m => Dictionary -> m FontSet
loadFonts = foldM addFont Map.empty . Map.toList
loadResources :: FontCache m => Dictionary -> m FontSet
loadResources = foldM addFont Map.empty . Map.toList
where
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output
-}
pagesList :: PDFContent m => m [ObjectId]
pagesList = do
pages <- origin // ["Root", "Pages"] >>= getDictionary
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids
_ -> return []
getReferences :: [DirectObject] -> [ObjectId]
getReferences objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractContent :: FontCache m => Object -> m [Content]
extractContent pageObj = do
--fonts <- loadFonts =<< getFontDictionary pageObj
pageObj >./ "Contents" >>= several >>= mapM loadContent
where
loadContent object = r Clear object >>= (either fail return . Content.parse)
several (Direct (Array l)) = mapM castObject l
several object = return [object]
-- >>= renderText fonts
getReference :: DirectObject -> [ObjectId]
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
getReference _ = []
loadPage :: FontCache m => ObjectId -> m Page
loadPage source = do
contents <- extractContent =<< objectById source
return $ Page {contents, source}
page <- objectById source
contents <- getKey "Contents" page >>= loadContents
resources <- getFontDictionary page
return $ Page {contents, resources, source}
pagesList :: PDFContent m => m [ObjectId]
pagesList =
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences)
<|> return []
where
getReferences (Array kids) = kids >>= getReference
getReferences _ = fail "Not a pages array"
data Pages = Pages
newtype PageNumber = P Int
data Contents = Contents
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer