From af994cb50c46d0ee73a04aa84c7f935d949ecd76 Mon Sep 17 00:00:00 2001 From: Tissevert Date: Tue, 11 Feb 2020 17:59:15 +0100 Subject: [PATCH] WIP: in the process of migrating to Object.Navigation in Pages, still unsure how to manage simple Content parsing and efficient font loading (+ giving a way to edit Contents) --- src/PDF/Pages.hs | 58 ++++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 37 deletions(-) diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs index b8f09cb..28f6733 100755 --- a/src/PDF/Pages.hs +++ b/src/PDF/Pages.hs @@ -24,7 +24,9 @@ import PDF.Object ( Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Object(..), Name(..), Structure(..) ,) -import PDF.Object.Navigation ((//), getDictionary, origin, stream) +import PDF.Object.Navigation ( + (//), dictionaryById, getDictionary, getField, follow, openStream, origin + ) import PDF.Output (ObjectId(..)) import Prelude hiding (fail) import Text.Printf (printf) @@ -36,26 +38,10 @@ data Page = Page { , source :: ObjectId } -infixl 1 \\= -(\\=) :: T a -> (a -> Either String b) -> T b -x \\= f = x >>= lift . f - -infixl 1 //= -(//=) :: Either String a -> (a -> T b) -> T b -x //= f = lift x >>= f - -getResource :: DirectObject -> T Dictionary -getResource (Dictionary dictionary) = return dictionary -getResource (Reference (IndirectObjCoordinates {objectId})) = - getObject objectId \\= dict -getResource directObject = - lift $ expected "resource (dictionary or reference)" directObject - getFontDictionary :: Dictionary -> T Dictionary getFontDictionary pageDict = - key "Resources" pageDict - //= getResource - >>= either (const $ return Map.empty) getResource . key "Font" + ((pageDict // ["Resources", "Font"]) >>= getDictionary) + <|> return Map.empty cache :: (ObjectId -> T Font) -> ObjectId -> T Font cache loader objectId = @@ -67,18 +53,18 @@ cache loader objectId = return value loadFont :: ObjectId -> T Font -loadFont objectId = getObject objectId \\= dict >>= tryMappings +loadFont objectId = dictionaryById objectId >>= tryMappings where tryMappings dictionary = loadCMap dictionary - <|> lift (key "Encoding" dictionary >>= loadEncoding) + <|> (getField "Encoding" dictionary >>= loadEncoding) <|> (fail $ unknownFormat (show objectId) (show dictionary)) unknownFormat = printf "Unknown font format for object #%s : %s" loadCMap dictionary = - getField "ToUnicode" dictionary >>= follow \\= stream >>= cMap - loadEncoding (NameObject (Name name)) = encoding name + getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap + loadEncoding (NameObject (Name name)) = return $ encoding name loadEncoding directObject = - Left . printf "Encoding must be a name, not that : %s" $ show directObject + fail $ "Encoding must be a name, not that : %s" $ show directObject loadFonts :: Dictionary -> T FontSet loadFonts = foldM addFont Map.empty . Map.toList @@ -100,23 +86,18 @@ key keyName dictionary = errorMessage = printf "Key %s not found in dictionary %s" keyName (show dictionary) -target :: DirectObject -> Either String ObjectId -target (Reference (IndirectObjCoordinates {objectId})) = Right objectId -target directObject = expected "reference" directObject - many :: DirectObject -> [DirectObject] many (Array l) = l many directObject = [directObject] -follow :: DirectObject -> T Object -follow directObject = target directObject //= getObject - +{- dict :: Object -> Either String Dictionary dict (Direct (Dictionary dictionary)) = Right dictionary dict obj = expected "dictionary" obj dictObject :: String -> Dictionary -> T Dictionary dictObject keyName dictionary = key keyName dictionary //= follow \\= dict +-} pagesList :: T [ObjectId] pagesList = do @@ -132,20 +113,23 @@ getReferences objects = do Reference (IndirectObjCoordinates {objectId}) -> [objectId] _ -> [] -extractText :: Object -> T [Text] -extractText object = do - pageDict <- lift $ dict object +extractText :: Dictionary -> T [Text] +extractText pageDict = do fonts <- loadFonts =<< getFontDictionary pageDict let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject]) concat <$> (objects //= (mapM $ loadContent fonts)) where loadContent :: FontSet -> DirectObject -> T [Text] loadContent fonts directObject = - follow directObject \\= stream \\= Content.parse \\= renderText fonts + follow directObject + >>= openStream + >>= either fail return . Content.parse + >>= renderText fonts loadPage :: ObjectId -> T Page -loadPage source = - (\contents -> Page {contents, source}) <$> (extractText =<< getObject source) +loadPage source = do + contents <- extractText =<< dictionaryById source + return $ Page {contents, source} getAll :: Content -> Either String (Map Int Page) getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty