WIP: in the process of migrating to Object.Navigation in Pages, still unsure how to manage simple Content parsing and efficient font loading (+ giving a way to edit Contents)

This commit is contained in:
Tissevert 2020-02-11 17:59:15 +01:00
parent 704d7a7fcf
commit af994cb50c

View file

@ -24,7 +24,9 @@ import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..) , Object(..), Name(..), Structure(..)
,) ,)
import PDF.Object.Navigation ((//), getDictionary, origin, stream) import PDF.Object.Navigation (
(//), dictionaryById, getDictionary, getField, follow, openStream, origin
)
import PDF.Output (ObjectId(..)) import PDF.Output (ObjectId(..))
import Prelude hiding (fail) import Prelude hiding (fail)
import Text.Printf (printf) import Text.Printf (printf)
@ -36,26 +38,10 @@ data Page = Page {
, source :: ObjectId , source :: ObjectId
} }
infixl 1 \\=
(\\=) :: T a -> (a -> Either String b) -> T b
x \\= f = x >>= lift . f
infixl 1 //=
(//=) :: Either String a -> (a -> T b) -> T b
x //= f = lift x >>= f
getResource :: DirectObject -> T Dictionary
getResource (Dictionary dictionary) = return dictionary
getResource (Reference (IndirectObjCoordinates {objectId})) =
getObject objectId \\= dict
getResource directObject =
lift $ expected "resource (dictionary or reference)" directObject
getFontDictionary :: Dictionary -> T Dictionary getFontDictionary :: Dictionary -> T Dictionary
getFontDictionary pageDict = getFontDictionary pageDict =
key "Resources" pageDict ((pageDict // ["Resources", "Font"]) >>= getDictionary)
//= getResource <|> return Map.empty
>>= either (const $ return Map.empty) getResource . key "Font"
cache :: (ObjectId -> T Font) -> ObjectId -> T Font cache :: (ObjectId -> T Font) -> ObjectId -> T Font
cache loader objectId = cache loader objectId =
@ -67,18 +53,18 @@ cache loader objectId =
return value return value
loadFont :: ObjectId -> T Font loadFont :: ObjectId -> T Font
loadFont objectId = getObject objectId \\= dict >>= tryMappings loadFont objectId = dictionaryById objectId >>= tryMappings
where where
tryMappings dictionary = tryMappings dictionary =
loadCMap dictionary loadCMap dictionary
<|> lift (key "Encoding" dictionary >>= loadEncoding) <|> (getField "Encoding" dictionary >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show dictionary)) <|> (fail $ unknownFormat (show objectId) (show dictionary))
unknownFormat = printf "Unknown font format for object #%s : %s" unknownFormat = printf "Unknown font format for object #%s : %s"
loadCMap dictionary = loadCMap dictionary =
getField "ToUnicode" dictionary >>= follow \\= stream >>= cMap getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
loadEncoding (NameObject (Name name)) = encoding name loadEncoding (NameObject (Name name)) = return $ encoding name
loadEncoding directObject = loadEncoding directObject =
Left . printf "Encoding must be a name, not that : %s" $ show directObject fail $ "Encoding must be a name, not that : %s" $ show directObject
loadFonts :: Dictionary -> T FontSet loadFonts :: Dictionary -> T FontSet
loadFonts = foldM addFont Map.empty . Map.toList loadFonts = foldM addFont Map.empty . Map.toList
@ -100,23 +86,18 @@ key keyName dictionary =
errorMessage = errorMessage =
printf "Key %s not found in dictionary %s" keyName (show dictionary) printf "Key %s not found in dictionary %s" keyName (show dictionary)
target :: DirectObject -> Either String ObjectId
target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
target directObject = expected "reference" directObject
many :: DirectObject -> [DirectObject] many :: DirectObject -> [DirectObject]
many (Array l) = l many (Array l) = l
many directObject = [directObject] many directObject = [directObject]
follow :: DirectObject -> T Object {-
follow directObject = target directObject //= getObject
dict :: Object -> Either String Dictionary dict :: Object -> Either String Dictionary
dict (Direct (Dictionary dictionary)) = Right dictionary dict (Direct (Dictionary dictionary)) = Right dictionary
dict obj = expected "dictionary" obj dict obj = expected "dictionary" obj
dictObject :: String -> Dictionary -> T Dictionary dictObject :: String -> Dictionary -> T Dictionary
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
-}
pagesList :: T [ObjectId] pagesList :: T [ObjectId]
pagesList = do pagesList = do
@ -132,20 +113,23 @@ getReferences objects = do
Reference (IndirectObjCoordinates {objectId}) -> [objectId] Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> [] _ -> []
extractText :: Object -> T [Text] extractText :: Dictionary -> T [Text]
extractText object = do extractText pageDict = do
pageDict <- lift $ dict object
fonts <- loadFonts =<< getFontDictionary pageDict fonts <- loadFonts =<< getFontDictionary pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject]) let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
concat <$> (objects //= (mapM $ loadContent fonts)) concat <$> (objects //= (mapM $ loadContent fonts))
where where
loadContent :: FontSet -> DirectObject -> T [Text] loadContent :: FontSet -> DirectObject -> T [Text]
loadContent fonts directObject = loadContent fonts directObject =
follow directObject \\= stream \\= Content.parse \\= renderText fonts follow directObject
>>= openStream
>>= either fail return . Content.parse
>>= renderText fonts
loadPage :: ObjectId -> T Page loadPage :: ObjectId -> T Page
loadPage source = loadPage source = do
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source) contents <- extractText =<< dictionaryById source
return $ Page {contents, source}
getAll :: Content -> Either String (Map Int Page) getAll :: Content -> Either String (Map Int Page)
getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty