Keep Page as only a reference object keeping the ObjectId explicit so we can modify the actual objects one day, write an OrderedMap data structure to help
This commit is contained in:
parent
f2986da96d
commit
dce10ae63a
4 changed files with 108 additions and 40 deletions
|
@ -28,6 +28,7 @@ library
|
|||
, PDF.Parser
|
||||
, PDF.Pages
|
||||
other-modules: Data.ByteString.Char8.Util
|
||||
, Data.OrderedMap
|
||||
, PDF.Content.Operator
|
||||
, PDF.Content.Operator.Color
|
||||
, PDF.Content.Operator.Common
|
||||
|
|
67
src/Data/OrderedMap.hs
Normal file
67
src/Data/OrderedMap.hs
Normal file
|
@ -0,0 +1,67 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module Data.OrderedMap (
|
||||
OrderedMap
|
||||
, build
|
||||
, elems
|
||||
, fromList
|
||||
, get
|
||||
, keys
|
||||
, lookup
|
||||
, toList
|
||||
) where
|
||||
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (fromList, lookup)
|
||||
import Prelude hiding (lookup)
|
||||
|
||||
data OrderedMap k a = OrderedMap {
|
||||
assoc :: Map k a
|
||||
, keys :: [k]
|
||||
}
|
||||
|
||||
instance (Ord k, Show k, Show a) => Show (OrderedMap k a) where
|
||||
show = show . toList
|
||||
|
||||
instance Functor (OrderedMap k) where
|
||||
fmap f orderedMap = orderedMap {assoc = fmap f (assoc orderedMap)}
|
||||
|
||||
instance Ord k => Foldable (OrderedMap k) where
|
||||
foldMap f (OrderedMap {assoc, keys}) = foldMap f $ (assoc !) <$> keys
|
||||
|
||||
instance Ord k => Traversable (OrderedMap k) where
|
||||
sequenceA (OrderedMap {assoc, keys}) =
|
||||
(flip OrderedMap keys) <$> sequenceA assoc
|
||||
|
||||
elems :: Ord k => OrderedMap k a -> [a]
|
||||
elems (OrderedMap {assoc, keys}) = (assoc !) <$> keys
|
||||
|
||||
toList :: Ord k => OrderedMap k a -> [(k, a)]
|
||||
toList (OrderedMap {assoc, keys}) = (\k -> (k, assoc ! k)) <$> keys
|
||||
|
||||
fromList :: Ord k => [(k, a)] -> OrderedMap k a
|
||||
fromList keyValueList = OrderedMap {
|
||||
assoc = Map.fromList keyValueList
|
||||
, keys = fst <$> keyValueList
|
||||
}
|
||||
|
||||
build :: Ord k => (k -> a) -> [k] -> OrderedMap k a
|
||||
build f keys = OrderedMap {
|
||||
assoc = Map.fromList $ (\k -> (k, f k)) <$> keys
|
||||
, keys
|
||||
}
|
||||
|
||||
get :: Ord k => k -> OrderedMap k a -> a
|
||||
get k = (! k) . assoc
|
||||
|
||||
lookup :: Ord k => k -> OrderedMap k a -> Maybe a
|
||||
lookup k = (Map.lookup k) . assoc
|
||||
|
||||
{-
|
||||
cons :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
|
||||
cons k a orderedMap =
|
||||
|
||||
snoc :: Ord k => k -> a -> OrderedMap k a -> OrderedMap k a
|
||||
|
||||
alter :: Ord k => (Maybe a -> Maybe a) -> k -> OrderedMap k a -> OrderedMap k a
|
||||
alter
|
||||
-}
|
|
@ -15,6 +15,7 @@ module PDF.Object.Navigation (
|
|||
, (>//)
|
||||
, castObject
|
||||
, getDictionary
|
||||
, getKey
|
||||
, objectById
|
||||
, origin
|
||||
) where
|
||||
|
@ -56,12 +57,12 @@ getDictionary obj = expected "dictionary : " obj
|
|||
expected :: (MonadFail m, Show a) => String -> a -> m b
|
||||
expected name = fail . printf "Not a %s: %s" name . show
|
||||
|
||||
getField :: MonadFail m => String -> Dictionary -> m DirectObject
|
||||
getField key aDictionary =
|
||||
maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary)
|
||||
getKey :: PDFContent m => String -> Object -> m DirectObject
|
||||
getKey key object = getDictionary object >>= catchMaybe . Map.lookup (Name key)
|
||||
where
|
||||
errorMessage =
|
||||
printf "Key %s not found in dictionary %s" key (show aDictionary)
|
||||
printf "Key %s not found in dictionary %s" key (show object)
|
||||
catchMaybe = maybe (fail errorMessage) return
|
||||
|
||||
objectById :: PDFContent m => ObjectId -> m Object
|
||||
objectById objectId = do
|
||||
|
@ -69,7 +70,7 @@ objectById objectId = do
|
|||
return (objects layer ! objectId)
|
||||
|
||||
(./) :: PDFContent m => m Object -> Component -> m Object
|
||||
(./) object key = (object >>= getDictionary >>= getField key >>= castObject)
|
||||
(./) object key = (object >>= getKey key >>= castObject)
|
||||
|
||||
castObject :: PDFContent m => DirectObject -> m Object
|
||||
castObject (Reference (IndirectObjCoordinates {objectId})) =
|
||||
|
|
|
@ -18,14 +18,13 @@ import Control.Monad.Fail (MonadFail(..))
|
|||
import Control.Monad.Reader (runReaderT)
|
||||
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
|
||||
import qualified Control.Monad.RWS as RWS (get)
|
||||
import Data.OrderedMap (OrderedMap, build)
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||
import Data.Text (Text)
|
||||
import PDF.Box (Box(..))
|
||||
import PDF.CMap (cMap)
|
||||
import PDF.Content (Content(..))
|
||||
import qualified PDF.Content as Content (parse)
|
||||
import PDF.Content.Text (renderText)
|
||||
import PDF.Encoding (encoding)
|
||||
import PDF.Font (Font, FontSet)
|
||||
import PDF.Layer (Layer(..))
|
||||
|
@ -34,8 +33,8 @@ import PDF.Object (
|
|||
, Name(..), Object(..)
|
||||
,)
|
||||
import PDF.Object.Navigation (
|
||||
Clear(..), PDFContent, (//), (>./), (>//), castObject, getDictionary
|
||||
, objectById, origin
|
||||
Clear(..), PDFContent, (//), (>./), (>//), getDictionary
|
||||
, getKey, objectById, origin
|
||||
)
|
||||
import PDF.Output (ObjectId(..))
|
||||
import Prelude hiding (fail)
|
||||
|
@ -44,16 +43,25 @@ import Text.Printf (printf)
|
|||
type CachedFonts = Map ObjectId Font
|
||||
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
||||
data Page = Page {
|
||||
contents :: [Content]
|
||||
contents :: OrderedMap ObjectId Content
|
||||
, resources :: Dictionary
|
||||
, source :: ObjectId
|
||||
}
|
||||
|
||||
{-
|
||||
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
||||
loadContents directObject =
|
||||
sequenceA . build loadContent $ objectIds directObject
|
||||
where
|
||||
loadContent :: PDFContent m => ObjectId -> m Content
|
||||
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
||||
objectIds (Array l) = l >>= getReference
|
||||
objectIds dirObj = getReference dirObj
|
||||
|
||||
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
||||
getFontDictionary pageObj =
|
||||
(pageObj >// ["Resources", "Font"] >>= getDictionary)
|
||||
<|> return Map.empty
|
||||
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
|
||||
|
||||
{-
|
||||
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
|
||||
cache loader objectId =
|
||||
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||
|
@ -76,45 +84,36 @@ loadFont objectId = objectById objectId >>= tryMappings
|
|||
loadEncoding object =
|
||||
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
||||
|
||||
loadFonts :: FontCache m => Dictionary -> m FontSet
|
||||
loadFonts = foldM addFont Map.empty . Map.toList
|
||||
loadResources :: FontCache m => Dictionary -> m FontSet
|
||||
loadResources = foldM addFont Map.empty . Map.toList
|
||||
where
|
||||
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||
flip (Map.insert name) output <$> cache loadFont objectId
|
||||
addFont output _ = return output
|
||||
-}
|
||||
|
||||
pagesList :: PDFContent m => m [ObjectId]
|
||||
pagesList = do
|
||||
pages <- origin // ["Root", "Pages"] >>= getDictionary
|
||||
case Map.lookup (Name "Kids") pages of
|
||||
Just (Array kids) -> return $ getReferences kids
|
||||
_ -> return []
|
||||
|
||||
getReferences :: [DirectObject] -> [ObjectId]
|
||||
getReferences objects = do
|
||||
object <- objects
|
||||
case object of
|
||||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||
_ -> []
|
||||
|
||||
extractContent :: FontCache m => Object -> m [Content]
|
||||
extractContent pageObj = do
|
||||
--fonts <- loadFonts =<< getFontDictionary pageObj
|
||||
pageObj >./ "Contents" >>= several >>= mapM loadContent
|
||||
where
|
||||
loadContent object = r Clear object >>= (either fail return . Content.parse)
|
||||
several (Direct (Array l)) = mapM castObject l
|
||||
several object = return [object]
|
||||
-- >>= renderText fonts
|
||||
getReference :: DirectObject -> [ObjectId]
|
||||
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
||||
getReference _ = []
|
||||
|
||||
loadPage :: FontCache m => ObjectId -> m Page
|
||||
loadPage source = do
|
||||
contents <- extractContent =<< objectById source
|
||||
return $ Page {contents, source}
|
||||
page <- objectById source
|
||||
contents <- getKey "Contents" page >>= loadContents
|
||||
resources <- getFontDictionary page
|
||||
return $ Page {contents, resources, source}
|
||||
|
||||
pagesList :: PDFContent m => m [ObjectId]
|
||||
pagesList =
|
||||
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences)
|
||||
<|> return []
|
||||
where
|
||||
getReferences (Array kids) = kids >>= getReference
|
||||
getReferences _ = fail "Not a pages array"
|
||||
|
||||
data Pages = Pages
|
||||
newtype PageNumber = P Int
|
||||
data Contents = Contents
|
||||
|
||||
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
|
||||
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
||||
|
|
Loading…
Reference in a new issue