WIP: in the process of migrating to Object.Navigation in Pages, still unsure how to manage simple Content parsing and efficient font loading (+ giving a way to edit Contents)

This commit is contained in:
Tissevert 2020-02-11 17:59:15 +01:00
parent 704d7a7fcf
commit af994cb50c
1 changed files with 21 additions and 37 deletions

View File

@ -24,7 +24,9 @@ import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Object.Navigation ((//), getDictionary, origin, stream)
import PDF.Object.Navigation (
(//), dictionaryById, getDictionary, getField, follow, openStream, origin
)
import PDF.Output (ObjectId(..))
import Prelude hiding (fail)
import Text.Printf (printf)
@ -36,26 +38,10 @@ data Page = Page {
, source :: ObjectId
}
infixl 1 \\=
(\\=) :: T a -> (a -> Either String b) -> T b
x \\= f = x >>= lift . f
infixl 1 //=
(//=) :: Either String a -> (a -> T b) -> T b
x //= f = lift x >>= f
getResource :: DirectObject -> T Dictionary
getResource (Dictionary dictionary) = return dictionary
getResource (Reference (IndirectObjCoordinates {objectId})) =
getObject objectId \\= dict
getResource directObject =
lift $ expected "resource (dictionary or reference)" directObject
getFontDictionary :: Dictionary -> T Dictionary
getFontDictionary pageDict =
key "Resources" pageDict
//= getResource
>>= either (const $ return Map.empty) getResource . key "Font"
((pageDict // ["Resources", "Font"]) >>= getDictionary)
<|> return Map.empty
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
cache loader objectId =
@ -67,18 +53,18 @@ cache loader objectId =
return value
loadFont :: ObjectId -> T Font
loadFont objectId = getObject objectId \\= dict >>= tryMappings
loadFont objectId = dictionaryById objectId >>= tryMappings
where
tryMappings dictionary =
loadCMap dictionary
<|> lift (key "Encoding" dictionary >>= loadEncoding)
<|> (getField "Encoding" dictionary >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show dictionary))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadCMap dictionary =
getField "ToUnicode" dictionary >>= follow \\= stream >>= cMap
loadEncoding (NameObject (Name name)) = encoding name
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
loadEncoding (NameObject (Name name)) = return $ encoding name
loadEncoding directObject =
Left . printf "Encoding must be a name, not that : %s" $ show directObject
fail $ "Encoding must be a name, not that : %s" $ show directObject
loadFonts :: Dictionary -> T FontSet
loadFonts = foldM addFont Map.empty . Map.toList
@ -100,23 +86,18 @@ key keyName dictionary =
errorMessage =
printf "Key %s not found in dictionary %s" keyName (show dictionary)
target :: DirectObject -> Either String ObjectId
target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
target directObject = expected "reference" directObject
many :: DirectObject -> [DirectObject]
many (Array l) = l
many directObject = [directObject]
follow :: DirectObject -> T Object
follow directObject = target directObject //= getObject
{-
dict :: Object -> Either String Dictionary
dict (Direct (Dictionary dictionary)) = Right dictionary
dict obj = expected "dictionary" obj
dictObject :: String -> Dictionary -> T Dictionary
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
-}
pagesList :: T [ObjectId]
pagesList = do
@ -132,20 +113,23 @@ getReferences objects = do
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractText :: Object -> T [Text]
extractText object = do
pageDict <- lift $ dict object
extractText :: Dictionary -> T [Text]
extractText pageDict = do
fonts <- loadFonts =<< getFontDictionary pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
concat <$> (objects //= (mapM $ loadContent fonts))
where
loadContent :: FontSet -> DirectObject -> T [Text]
loadContent fonts directObject =
follow directObject \\= stream \\= Content.parse \\= renderText fonts
follow directObject
>>= openStream
>>= either fail return . Content.parse
>>= renderText fonts
loadPage :: ObjectId -> T Page
loadPage source =
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
loadPage source = do
contents <- extractText =<< dictionaryById source
return $ Page {contents, source}
getAll :: Content -> Either String (Map Int Page)
getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty