Keep Page as only a reference object keeping the ObjectId explicit so we can modify the actual objects one day, write an OrderedMap data structure to help
This commit is contained in:
parent
f2986da96d
commit
dce10ae63a
4 changed files with 108 additions and 40 deletions
|
@ -28,6 +28,7 @@ library
|
||||||
, PDF.Parser
|
, PDF.Parser
|
||||||
, PDF.Pages
|
, PDF.Pages
|
||||||
other-modules: Data.ByteString.Char8.Util
|
other-modules: Data.ByteString.Char8.Util
|
||||||
|
, Data.OrderedMap
|
||||||
, PDF.Content.Operator
|
, PDF.Content.Operator
|
||||||
, PDF.Content.Operator.Color
|
, PDF.Content.Operator.Color
|
||||||
, PDF.Content.Operator.Common
|
, PDF.Content.Operator.Common
|
||||||
|
|
67
src/Data/OrderedMap.hs
Normal file
67
src/Data/OrderedMap.hs
Normal file
|
@ -0,0 +1,67 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
module Data.OrderedMap (
|
||||||
|
OrderedMap
|
||||||
|
, build
|
||||||
|
, elems
|
||||||
|
, fromList
|
||||||
|
, get
|
||||||
|
, keys
|
||||||
|
, lookup
|
||||||
|
, toList
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.Map (Map, (!))
|
||||||
|
import qualified Data.Map as Map (fromList, lookup)
|
||||||
|
import Prelude hiding (lookup)
|
||||||
|
|
||||||
|
data OrderedMap k a = OrderedMap {
|
||||||
|
assoc :: Map k a
|
||||||
|
, keys :: [k]
|
||||||
|
}
|
||||||
|
|
||||||
|
instance (Ord k, Show k, Show a) => Show (OrderedMap k a) where
|
||||||
|
show = show . toList
|
||||||
|
|
||||||
|
instance Functor (OrderedMap k) where
|
||||||
|
fmap f orderedMap = orderedMap {assoc = fmap f (assoc orderedMap)}
|
||||||
|
|
||||||
|
instance Ord k => Foldable (OrderedMap k) where
|
||||||
|
foldMap f (OrderedMap {assoc, keys}) = foldMap f $ (assoc !) <$> keys
|
||||||
|
|
||||||
|
instance Ord k => Traversable (OrderedMap k) where
|
||||||
|
sequenceA (OrderedMap {assoc, keys}) =
|
||||||
|
(flip OrderedMap keys) <$> sequenceA assoc
|
||||||
|
|
||||||
|
elems :: Ord k => OrderedMap k a -> [a]
|
||||||
|
elems (OrderedMap {assoc, keys}) = (assoc !) <$> keys
|
||||||
|
|
||||||
|
toList :: Ord k => OrderedMap k a -> [(k, a)]
|
||||||
|
toList (OrderedMap {assoc, keys}) = (\k -> (k, assoc ! k)) <$> keys
|
||||||
|
|
||||||
|
fromList :: Ord k => [(k, a)] -> OrderedMap k a
|
||||||
|
fromList keyValueList = OrderedMap {
|
||||||
|
assoc = Map.fromList keyValueList
|
||||||
|
, keys = fst <$> keyValueList
|
||||||
|
}
|
||||||
|
|
||||||
|
build :: Ord k => (k -> a) -> [k] -> OrderedMap k a
|
||||||
|
build f keys = OrderedMap {
|
||||||
|
assoc = Map.fromList $ (\k -> (k, f k)) <$> keys
|
||||||
|
, keys
|
||||||
|
}
|
||||||
|
|
||||||
|
get :: Ord k => k -> OrderedMap k a -> a
|
||||||
|
get k = (! k) . assoc
|
||||||
|
|
||||||
|
lookup :: Ord k => k -> OrderedMap k a -> Maybe a
|
||||||
|
lookup k = (Map.lookup k) . assoc
|
||||||
|
|
||||||
|
{-
|
||||||
|
cons :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
|
||||||
|
cons k a orderedMap =
|
||||||
|
|
||||||
|
snoc :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
|
||||||
|
|
||||||
|
alter :: Ord k => (Maybe a -> Maybe a) -> k -> OrderedMap k a -> OrderedMap k a
|
||||||
|
alter
|
||||||
|
-}
|
|
@ -15,6 +15,7 @@ module PDF.Object.Navigation (
|
||||||
, (>//)
|
, (>//)
|
||||||
, castObject
|
, castObject
|
||||||
, getDictionary
|
, getDictionary
|
||||||
|
, getKey
|
||||||
, objectById
|
, objectById
|
||||||
, origin
|
, origin
|
||||||
) where
|
) where
|
||||||
|
@ -56,12 +57,12 @@ getDictionary obj = expected "dictionary : " obj
|
||||||
expected :: (MonadFail m, Show a) => String -> a -> m b
|
expected :: (MonadFail m, Show a) => String -> a -> m b
|
||||||
expected name = fail . printf "Not a %s: %s" name . show
|
expected name = fail . printf "Not a %s: %s" name . show
|
||||||
|
|
||||||
getField :: MonadFail m => String -> Dictionary -> m DirectObject
|
getKey :: PDFContent m => String -> Object -> m DirectObject
|
||||||
getField key aDictionary =
|
getKey key object = getDictionary object >>= catchMaybe . Map.lookup (Name key)
|
||||||
maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary)
|
|
||||||
where
|
where
|
||||||
errorMessage =
|
errorMessage =
|
||||||
printf "Key %s not found in dictionary %s" key (show aDictionary)
|
printf "Key %s not found in dictionary %s" key (show object)
|
||||||
|
catchMaybe = maybe (fail errorMessage) return
|
||||||
|
|
||||||
objectById :: PDFContent m => ObjectId -> m Object
|
objectById :: PDFContent m => ObjectId -> m Object
|
||||||
objectById objectId = do
|
objectById objectId = do
|
||||||
|
@ -69,7 +70,7 @@ objectById objectId = do
|
||||||
return (objects layer ! objectId)
|
return (objects layer ! objectId)
|
||||||
|
|
||||||
(./) :: PDFContent m => m Object -> Component -> m Object
|
(./) :: PDFContent m => m Object -> Component -> m Object
|
||||||
(./) object key = (object >>= getDictionary >>= getField key >>= castObject)
|
(./) object key = (object >>= getKey key >>= castObject)
|
||||||
|
|
||||||
castObject :: PDFContent m => DirectObject -> m Object
|
castObject :: PDFContent m => DirectObject -> m Object
|
||||||
castObject (Reference (IndirectObjCoordinates {objectId})) =
|
castObject (Reference (IndirectObjCoordinates {objectId})) =
|
||||||
|
|
|
@ -18,14 +18,13 @@ import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.Reader (runReaderT)
|
import Control.Monad.Reader (runReaderT)
|
||||||
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
|
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
|
||||||
import qualified Control.Monad.RWS as RWS (get)
|
import qualified Control.Monad.RWS as RWS (get)
|
||||||
|
import Data.OrderedMap (OrderedMap, build)
|
||||||
import Data.Map (Map)
|
import Data.Map (Map)
|
||||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||||
import Data.Text (Text)
|
|
||||||
import PDF.Box (Box(..))
|
import PDF.Box (Box(..))
|
||||||
import PDF.CMap (cMap)
|
import PDF.CMap (cMap)
|
||||||
import PDF.Content (Content(..))
|
import PDF.Content (Content(..))
|
||||||
import qualified PDF.Content as Content (parse)
|
import qualified PDF.Content as Content (parse)
|
||||||
import PDF.Content.Text (renderText)
|
|
||||||
import PDF.Encoding (encoding)
|
import PDF.Encoding (encoding)
|
||||||
import PDF.Font (Font, FontSet)
|
import PDF.Font (Font, FontSet)
|
||||||
import PDF.Layer (Layer(..))
|
import PDF.Layer (Layer(..))
|
||||||
|
@ -34,8 +33,8 @@ import PDF.Object (
|
||||||
, Name(..), Object(..)
|
, Name(..), Object(..)
|
||||||
,)
|
,)
|
||||||
import PDF.Object.Navigation (
|
import PDF.Object.Navigation (
|
||||||
Clear(..), PDFContent, (//), (>./), (>//), castObject, getDictionary
|
Clear(..), PDFContent, (//), (>./), (>//), getDictionary
|
||||||
, objectById, origin
|
, getKey, objectById, origin
|
||||||
)
|
)
|
||||||
import PDF.Output (ObjectId(..))
|
import PDF.Output (ObjectId(..))
|
||||||
import Prelude hiding (fail)
|
import Prelude hiding (fail)
|
||||||
|
@ -44,16 +43,25 @@ import Text.Printf (printf)
|
||||||
type CachedFonts = Map ObjectId Font
|
type CachedFonts = Map ObjectId Font
|
||||||
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
||||||
data Page = Page {
|
data Page = Page {
|
||||||
contents :: [Content]
|
contents :: OrderedMap ObjectId Content
|
||||||
|
, resources :: Dictionary
|
||||||
, source :: ObjectId
|
, source :: ObjectId
|
||||||
}
|
}
|
||||||
|
|
||||||
{-
|
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
||||||
|
loadContents directObject =
|
||||||
|
sequenceA . build loadContent $ objectIds directObject
|
||||||
|
where
|
||||||
|
loadContent :: PDFContent m => ObjectId -> m Content
|
||||||
|
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
||||||
|
objectIds (Array l) = l >>= getReference
|
||||||
|
objectIds dirObj = getReference dirObj
|
||||||
|
|
||||||
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
||||||
getFontDictionary pageObj =
|
getFontDictionary pageObj =
|
||||||
(pageObj >// ["Resources", "Font"] >>= getDictionary)
|
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
|
||||||
<|> return Map.empty
|
|
||||||
|
|
||||||
|
{-
|
||||||
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
|
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
|
||||||
cache loader objectId =
|
cache loader objectId =
|
||||||
(maybe load return . Map.lookup objectId) =<< RWS.get
|
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||||
|
@ -76,45 +84,36 @@ loadFont objectId = objectById objectId >>= tryMappings
|
||||||
loadEncoding object =
|
loadEncoding object =
|
||||||
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
||||||
|
|
||||||
loadFonts :: FontCache m => Dictionary -> m FontSet
|
loadResources :: FontCache m => Dictionary -> m FontSet
|
||||||
loadFonts = foldM addFont Map.empty . Map.toList
|
loadResources = foldM addFont Map.empty . Map.toList
|
||||||
where
|
where
|
||||||
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||||
flip (Map.insert name) output <$> cache loadFont objectId
|
flip (Map.insert name) output <$> cache loadFont objectId
|
||||||
addFont output _ = return output
|
addFont output _ = return output
|
||||||
-}
|
-}
|
||||||
|
|
||||||
pagesList :: PDFContent m => m [ObjectId]
|
getReference :: DirectObject -> [ObjectId]
|
||||||
pagesList = do
|
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
||||||
pages <- origin // ["Root", "Pages"] >>= getDictionary
|
getReference _ = []
|
||||||
case Map.lookup (Name "Kids") pages of
|
|
||||||
Just (Array kids) -> return $ getReferences kids
|
|
||||||
_ -> return []
|
|
||||||
|
|
||||||
getReferences :: [DirectObject] -> [ObjectId]
|
|
||||||
getReferences objects = do
|
|
||||||
object <- objects
|
|
||||||
case object of
|
|
||||||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
|
||||||
_ -> []
|
|
||||||
|
|
||||||
extractContent :: FontCache m => Object -> m [Content]
|
|
||||||
extractContent pageObj = do
|
|
||||||
--fonts <- loadFonts =<< getFontDictionary pageObj
|
|
||||||
pageObj >./ "Contents" >>= several >>= mapM loadContent
|
|
||||||
where
|
|
||||||
loadContent object = r Clear object >>= (either fail return . Content.parse)
|
|
||||||
several (Direct (Array l)) = mapM castObject l
|
|
||||||
several object = return [object]
|
|
||||||
-- >>= renderText fonts
|
|
||||||
|
|
||||||
loadPage :: FontCache m => ObjectId -> m Page
|
loadPage :: FontCache m => ObjectId -> m Page
|
||||||
loadPage source = do
|
loadPage source = do
|
||||||
contents <- extractContent =<< objectById source
|
page <- objectById source
|
||||||
return $ Page {contents, source}
|
contents <- getKey "Contents" page >>= loadContents
|
||||||
|
resources <- getFontDictionary page
|
||||||
|
return $ Page {contents, resources, source}
|
||||||
|
|
||||||
|
pagesList :: PDFContent m => m [ObjectId]
|
||||||
|
pagesList =
|
||||||
|
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences)
|
||||||
|
<|> return []
|
||||||
|
where
|
||||||
|
getReferences (Array kids) = kids >>= getReference
|
||||||
|
getReferences _ = fail "Not a pages array"
|
||||||
|
|
||||||
data Pages = Pages
|
data Pages = Pages
|
||||||
newtype PageNumber = P Int
|
newtype PageNumber = P Int
|
||||||
|
data Contents = Contents
|
||||||
|
|
||||||
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
|
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
|
||||||
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
||||||
|
|
Loading…
Reference in a new issue