Simplify navigations by centering everything on Objects to avoid needing to many conversion tools between DirectObject / Object / Dictionary

This commit is contained in:
Tissevert 2020-02-15 13:51:24 +01:00
parent 23186100a8
commit 6096a1a237
3 changed files with 55 additions and 66 deletions

View File

@ -8,7 +8,7 @@ import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn)
import PDF (Document(..), parseDocument)
import PDF.Object (Content(..), Object(..))
import PDF.Object.Navigation (
Error(..), (//), castObject, objectById, openStream, origin
Error(..), (//), objectById, openStream, origin
)
import PDF.Output (ObjectId(..), Output)
import qualified PDF.Output as Output (render)
@ -34,7 +34,7 @@ parse [inputFile, key] =
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
where
byId = objectById . ObjectId
byPath path = (origin >>= (// (explode path)) >>= castObject)
byPath path = origin // (explode path)
explode "" = []
explode path =
case break (== '.') path of

View File

@ -4,12 +4,11 @@
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
module PDF.Object.Navigation (
Error(..)
, (./)
, (//)
, castObject
, dictionaryById
, (>./)
, (>//)
, getDictionary
, getField
, follow
, objectById
, openStream
, origin
@ -39,21 +38,12 @@ newtype Error a = Error {
instance MonadFail Error where
fail = Error . Left
castDictionary :: MonadFail m => Object -> m Dictionary
castDictionary (Direct (Dictionary aDict)) = return aDict
castDictionary (Stream {header}) = return header
castDictionary obj = expected "dictionary : " obj
castObject :: PDFContent m => DirectObject -> m Object
castObject directObject =
(castObjectId directObject >>= objectById) <|> return (Direct directObject)
castObjectId :: MonadFail m => DirectObject -> m ObjectId
castObjectId (Reference (IndirectObjCoordinates {objectId})) = return objectId
castObjectId directObject = expected "reference" directObject
dictionaryById :: PDFContent m => ObjectId -> m Dictionary
dictionaryById objectId = objectById objectId >>= castDictionary
getDictionary :: PDFContent m => Object -> m Dictionary
getDictionary (Direct (Dictionary aDict)) = return aDict
getDictionary (Direct (Reference (IndirectObjCoordinates {objectId}))) =
objectById objectId >>= getDictionary
getDictionary (Stream {header}) = return header
getDictionary obj = expected "dictionary : " obj
expected :: (MonadFail m, Show a) => String -> a -> m b
expected name = fail . printf "Not a %s: %s" name . show
@ -65,28 +55,30 @@ getField key aDictionary =
errorMessage =
printf "Key %s not found in dictionary %s" key (show aDictionary)
follow :: PDFContent m => DirectObject -> m Object
follow directObject = castObjectId directObject >>= objectById
objectById :: PDFContent m => ObjectId -> m Object
objectById objectId = do
content <- ask
return (objects content ! objectId)
getDictionary :: PDFContent m => DirectObject -> m Dictionary
getDictionary (Dictionary aDictionary) = return aDictionary
getDictionary (Reference (IndirectObjCoordinates {objectId})) =
objectById objectId >>= castDictionary
getDictionary aDirectObject =
expected "resource (dictionary or reference)" aDirectObject
(./) :: PDFContent m => m Object -> String -> m Object
(./) object key = (object >>= getDictionary >>= getField key >>= castObject)
where
castObject (Reference (IndirectObjCoordinates {objectId})) =
objectById objectId
castObject directObject = return $ Direct directObject
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
(//) aDict [] = return $ Dictionary aDict
(//) aDict [key] = getField key aDict
(//) aDict (key:keys) = getField key aDict >>= getDictionary >>= (// keys)
(//) :: PDFContent m => m Object -> [String] -> m Object
(//) object [] = object
(//) object (key:keys) = object ./ key // keys
origin :: PDFContent m => m Dictionary
origin = trailer . docStructure <$> ask
(>./) :: PDFContent m => Object -> String -> m Object
(>./) object = (return object ./)
(>//) :: PDFContent m => Object -> [String] -> m Object
(>//) object = (return object //)
origin :: PDFContent m => m Object
origin = Direct . Dictionary . trailer . docStructure <$> ask
openStream :: MonadFail m => Object -> m ByteString
openStream (Stream {header, streamContent}) = return $

View File

@ -20,10 +20,11 @@ import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..)
, Name(..), Object(..)
,)
import PDF.Object.Navigation (
Error(..), (//), dictionaryById, getDictionary, getField, follow, openStream, origin
Error(..), (//), (>./), (>//), getDictionary, objectById, openStream
, origin
)
import PDF.Output (ObjectId(..))
import Prelude hiding (fail)
@ -36,9 +37,9 @@ data Page = Page {
, source :: ObjectId
}
getFontDictionary :: Dictionary -> T Dictionary
getFontDictionary pageDict =
((pageDict // ["Resources", "Font"]) >>= getDictionary)
getFontDictionary :: Object -> T Dictionary
getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary)
<|> return Map.empty
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
@ -51,20 +52,17 @@ cache loader objectId =
return value
loadFont :: ObjectId -> T Font
loadFont objectId = dictionaryById objectId >>= tryMappings
loadFont objectId = objectById objectId >>= tryMappings
where
tryMappings dictionary =
loadCMap dictionary
<|> (getField "Encoding" dictionary >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show dictionary))
tryMappings object =
(object >./ "ToUnicode" >>= openStream >>= cMap)
<|> (object >./ "Encoding" >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show object))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadCMap :: Dictionary -> T Font
loadCMap dictionary =
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
loadEncoding :: DirectObject -> T Font
loadEncoding (NameObject (Name name)) = encoding name
loadEncoding directObject =
fail $ printf "Encoding must be a name, not that : %s" $ show directObject
loadEncoding :: Object -> T Font
loadEncoding (Direct (NameObject (Name name))) = encoding name
loadEncoding object =
fail $ printf "Encoding must be a name, not that : %s" $ show object
loadFonts :: Dictionary -> T FontSet
loadFonts = foldM addFont Map.empty . Map.toList
@ -74,13 +72,13 @@ loadFonts = foldM addFont Map.empty . Map.toList
flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output
several :: DirectObject -> [DirectObject]
several (Array l) = l
several directObject = [directObject]
several :: Object -> [Object]
several (Direct (Array l)) = Direct <$> l
several object = [object]
pagesList :: T [ObjectId]
pagesList = do
pages <- (origin :: T Dictionary) >>= (// ["Root", "Pages"]) >>= getDictionary
pages <- origin // ["Root", "Pages"] >>= getDictionary
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids
_ -> return []
@ -92,22 +90,21 @@ getReferences objects = do
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractText :: Dictionary -> T [Text]
extractText pageDict = do
fonts <- loadFonts =<< getFontDictionary pageDict
objects <- several <$> getField "Contents" pageDict
extractText :: Object -> T [Text]
extractText pageObj = do
fonts <- loadFonts =<< getFontDictionary pageObj
objects <- several <$> pageObj >./ "Contents"
concat <$> mapM (loadContent fonts) objects
where
loadContent :: FontSet -> DirectObject -> T [Text]
loadContent fonts directObject =
follow directObject
>>= openStream
loadContent :: FontSet -> Object -> T [Text]
loadContent fonts object =
openStream object
>>= (either fail return . Content.parse)
>>= renderText fonts
loadPage :: ObjectId -> T Page
loadPage source = do
contents <- extractText =<< dictionaryById source
contents <- extractText =<< objectById source
return $ Page {contents, source}
getAll :: Content -> Either String (Map Int Page)