Simplify navigations by centering everything on Objects to avoid needing to many conversion tools between DirectObject / Object / Dictionary

This commit is contained in:
Tissevert 2020-02-15 13:51:24 +01:00
parent 23186100a8
commit 6096a1a237
3 changed files with 55 additions and 66 deletions

View File

@ -8,7 +8,7 @@ import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn)
import PDF (Document(..), parseDocument) import PDF (Document(..), parseDocument)
import PDF.Object (Content(..), Object(..)) import PDF.Object (Content(..), Object(..))
import PDF.Object.Navigation ( import PDF.Object.Navigation (
Error(..), (//), castObject, objectById, openStream, origin Error(..), (//), objectById, openStream, origin
) )
import PDF.Output (ObjectId(..), Output) import PDF.Output (ObjectId(..), Output)
import qualified PDF.Output as Output (render) import qualified PDF.Output as Output (render)
@ -34,7 +34,7 @@ parse [inputFile, key] =
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key) return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
where where
byId = objectById . ObjectId byId = objectById . ObjectId
byPath path = (origin >>= (// (explode path)) >>= castObject) byPath path = origin // (explode path)
explode "" = [] explode "" = []
explode path = explode path =
case break (== '.') path of case break (== '.') path of

View File

@ -4,12 +4,11 @@
{-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-}
module PDF.Object.Navigation ( module PDF.Object.Navigation (
Error(..) Error(..)
, (./)
, (//) , (//)
, castObject , (>./)
, dictionaryById , (>//)
, getDictionary , getDictionary
, getField
, follow
, objectById , objectById
, openStream , openStream
, origin , origin
@ -39,21 +38,12 @@ newtype Error a = Error {
instance MonadFail Error where instance MonadFail Error where
fail = Error . Left fail = Error . Left
castDictionary :: MonadFail m => Object -> m Dictionary getDictionary :: PDFContent m => Object -> m Dictionary
castDictionary (Direct (Dictionary aDict)) = return aDict getDictionary (Direct (Dictionary aDict)) = return aDict
castDictionary (Stream {header}) = return header getDictionary (Direct (Reference (IndirectObjCoordinates {objectId}))) =
castDictionary obj = expected "dictionary : " obj objectById objectId >>= getDictionary
getDictionary (Stream {header}) = return header
castObject :: PDFContent m => DirectObject -> m Object getDictionary obj = expected "dictionary : " obj
castObject directObject =
(castObjectId directObject >>= objectById) <|> return (Direct directObject)
castObjectId :: MonadFail m => DirectObject -> m ObjectId
castObjectId (Reference (IndirectObjCoordinates {objectId})) = return objectId
castObjectId directObject = expected "reference" directObject
dictionaryById :: PDFContent m => ObjectId -> m Dictionary
dictionaryById objectId = objectById objectId >>= castDictionary
expected :: (MonadFail m, Show a) => String -> a -> m b expected :: (MonadFail m, Show a) => String -> a -> m b
expected name = fail . printf "Not a %s: %s" name . show expected name = fail . printf "Not a %s: %s" name . show
@ -65,28 +55,30 @@ getField key aDictionary =
errorMessage = errorMessage =
printf "Key %s not found in dictionary %s" key (show aDictionary) printf "Key %s not found in dictionary %s" key (show aDictionary)
follow :: PDFContent m => DirectObject -> m Object
follow directObject = castObjectId directObject >>= objectById
objectById :: PDFContent m => ObjectId -> m Object objectById :: PDFContent m => ObjectId -> m Object
objectById objectId = do objectById objectId = do
content <- ask content <- ask
return (objects content ! objectId) return (objects content ! objectId)
getDictionary :: PDFContent m => DirectObject -> m Dictionary (./) :: PDFContent m => m Object -> String -> m Object
getDictionary (Dictionary aDictionary) = return aDictionary (./) object key = (object >>= getDictionary >>= getField key >>= castObject)
getDictionary (Reference (IndirectObjCoordinates {objectId})) = where
objectById objectId >>= castDictionary castObject (Reference (IndirectObjCoordinates {objectId})) =
getDictionary aDirectObject = objectById objectId
expected "resource (dictionary or reference)" aDirectObject castObject directObject = return $ Direct directObject
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject (//) :: PDFContent m => m Object -> [String] -> m Object
(//) aDict [] = return $ Dictionary aDict (//) object [] = object
(//) aDict [key] = getField key aDict (//) object (key:keys) = object ./ key // keys
(//) aDict (key:keys) = getField key aDict >>= getDictionary >>= (// keys)
origin :: PDFContent m => m Dictionary (>./) :: PDFContent m => Object -> String -> m Object
origin = trailer . docStructure <$> ask (>./) object = (return object ./)
(>//) :: PDFContent m => Object -> [String] -> m Object
(>//) object = (return object //)
origin :: PDFContent m => m Object
origin = Direct . Dictionary . trailer . docStructure <$> ask
openStream :: MonadFail m => Object -> m ByteString openStream :: MonadFail m => Object -> m ByteString
openStream (Stream {header, streamContent}) = return $ openStream (Stream {header, streamContent}) = return $

View File

@ -20,10 +20,11 @@ import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet) import PDF.Font (Font, FontSet)
import PDF.Object ( import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..) , Name(..), Object(..)
,) ,)
import PDF.Object.Navigation ( import PDF.Object.Navigation (
Error(..), (//), dictionaryById, getDictionary, getField, follow, openStream, origin Error(..), (//), (>./), (>//), getDictionary, objectById, openStream
, origin
) )
import PDF.Output (ObjectId(..)) import PDF.Output (ObjectId(..))
import Prelude hiding (fail) import Prelude hiding (fail)
@ -36,9 +37,9 @@ data Page = Page {
, source :: ObjectId , source :: ObjectId
} }
getFontDictionary :: Dictionary -> T Dictionary getFontDictionary :: Object -> T Dictionary
getFontDictionary pageDict = getFontDictionary pageObj =
((pageDict // ["Resources", "Font"]) >>= getDictionary) (pageObj >// ["Resources", "Font"] >>= getDictionary)
<|> return Map.empty <|> return Map.empty
cache :: (ObjectId -> T Font) -> ObjectId -> T Font cache :: (ObjectId -> T Font) -> ObjectId -> T Font
@ -51,20 +52,17 @@ cache loader objectId =
return value return value
loadFont :: ObjectId -> T Font loadFont :: ObjectId -> T Font
loadFont objectId = dictionaryById objectId >>= tryMappings loadFont objectId = objectById objectId >>= tryMappings
where where
tryMappings dictionary = tryMappings object =
loadCMap dictionary (object >./ "ToUnicode" >>= openStream >>= cMap)
<|> (getField "Encoding" dictionary >>= loadEncoding) <|> (object >./ "Encoding" >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show dictionary)) <|> (fail $ unknownFormat (show objectId) (show object))
unknownFormat = printf "Unknown font format for object #%s : %s" unknownFormat = printf "Unknown font format for object #%s : %s"
loadCMap :: Dictionary -> T Font loadEncoding :: Object -> T Font
loadCMap dictionary = loadEncoding (Direct (NameObject (Name name))) = encoding name
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap loadEncoding object =
loadEncoding :: DirectObject -> T Font fail $ printf "Encoding must be a name, not that : %s" $ show object
loadEncoding (NameObject (Name name)) = encoding name
loadEncoding directObject =
fail $ printf "Encoding must be a name, not that : %s" $ show directObject
loadFonts :: Dictionary -> T FontSet loadFonts :: Dictionary -> T FontSet
loadFonts = foldM addFont Map.empty . Map.toList loadFonts = foldM addFont Map.empty . Map.toList
@ -74,13 +72,13 @@ loadFonts = foldM addFont Map.empty . Map.toList
flip (Map.insert name) output <$> cache loadFont objectId flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output addFont output _ = return output
several :: DirectObject -> [DirectObject] several :: Object -> [Object]
several (Array l) = l several (Direct (Array l)) = Direct <$> l
several directObject = [directObject] several object = [object]
pagesList :: T [ObjectId] pagesList :: T [ObjectId]
pagesList = do pagesList = do
pages <- (origin :: T Dictionary) >>= (// ["Root", "Pages"]) >>= getDictionary pages <- origin // ["Root", "Pages"] >>= getDictionary
case Map.lookup (Name "Kids") pages of case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids Just (Array kids) -> return $ getReferences kids
_ -> return [] _ -> return []
@ -92,22 +90,21 @@ getReferences objects = do
Reference (IndirectObjCoordinates {objectId}) -> [objectId] Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> [] _ -> []
extractText :: Dictionary -> T [Text] extractText :: Object -> T [Text]
extractText pageDict = do extractText pageObj = do
fonts <- loadFonts =<< getFontDictionary pageDict fonts <- loadFonts =<< getFontDictionary pageObj
objects <- several <$> getField "Contents" pageDict objects <- several <$> pageObj >./ "Contents"
concat <$> mapM (loadContent fonts) objects concat <$> mapM (loadContent fonts) objects
where where
loadContent :: FontSet -> DirectObject -> T [Text] loadContent :: FontSet -> Object -> T [Text]
loadContent fonts directObject = loadContent fonts object =
follow directObject openStream object
>>= openStream
>>= (either fail return . Content.parse) >>= (either fail return . Content.parse)
>>= renderText fonts >>= renderText fonts
loadPage :: ObjectId -> T Page loadPage :: ObjectId -> T Page
loadPage source = do loadPage source = do
contents <- extractText =<< dictionaryById source contents <- extractText =<< objectById source
return $ Page {contents, source} return $ Page {contents, source}
getAll :: Content -> Either String (Map Int Page) getAll :: Content -> Either String (Map Int Page)