WIP: in the process of migrating to Object.Navigation in Pages, still unsure how to manage simple Content parsing and efficient font loading (+ giving a way to edit Contents)
This commit is contained in:
parent
704d7a7fcf
commit
af994cb50c
1 changed files with 21 additions and 37 deletions
|
@ -24,7 +24,9 @@ import PDF.Object (
|
|||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Object(..), Name(..), Structure(..)
|
||||
,)
|
||||
import PDF.Object.Navigation ((//), getDictionary, origin, stream)
|
||||
import PDF.Object.Navigation (
|
||||
(//), dictionaryById, getDictionary, getField, follow, openStream, origin
|
||||
)
|
||||
import PDF.Output (ObjectId(..))
|
||||
import Prelude hiding (fail)
|
||||
import Text.Printf (printf)
|
||||
|
@ -36,26 +38,10 @@ data Page = Page {
|
|||
, source :: ObjectId
|
||||
}
|
||||
|
||||
infixl 1 \\=
|
||||
(\\=) :: T a -> (a -> Either String b) -> T b
|
||||
x \\= f = x >>= lift . f
|
||||
|
||||
infixl 1 //=
|
||||
(//=) :: Either String a -> (a -> T b) -> T b
|
||||
x //= f = lift x >>= f
|
||||
|
||||
getResource :: DirectObject -> T Dictionary
|
||||
getResource (Dictionary dictionary) = return dictionary
|
||||
getResource (Reference (IndirectObjCoordinates {objectId})) =
|
||||
getObject objectId \\= dict
|
||||
getResource directObject =
|
||||
lift $ expected "resource (dictionary or reference)" directObject
|
||||
|
||||
getFontDictionary :: Dictionary -> T Dictionary
|
||||
getFontDictionary pageDict =
|
||||
key "Resources" pageDict
|
||||
//= getResource
|
||||
>>= either (const $ return Map.empty) getResource . key "Font"
|
||||
((pageDict // ["Resources", "Font"]) >>= getDictionary)
|
||||
<|> return Map.empty
|
||||
|
||||
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
||||
cache loader objectId =
|
||||
|
@ -67,18 +53,18 @@ cache loader objectId =
|
|||
return value
|
||||
|
||||
loadFont :: ObjectId -> T Font
|
||||
loadFont objectId = getObject objectId \\= dict >>= tryMappings
|
||||
loadFont objectId = dictionaryById objectId >>= tryMappings
|
||||
where
|
||||
tryMappings dictionary =
|
||||
loadCMap dictionary
|
||||
<|> lift (key "Encoding" dictionary >>= loadEncoding)
|
||||
<|> (getField "Encoding" dictionary >>= loadEncoding)
|
||||
<|> (fail $ unknownFormat (show objectId) (show dictionary))
|
||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||
loadCMap dictionary =
|
||||
getField "ToUnicode" dictionary >>= follow \\= stream >>= cMap
|
||||
loadEncoding (NameObject (Name name)) = encoding name
|
||||
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
|
||||
loadEncoding (NameObject (Name name)) = return $ encoding name
|
||||
loadEncoding directObject =
|
||||
Left . printf "Encoding must be a name, not that : %s" $ show directObject
|
||||
fail $ "Encoding must be a name, not that : %s" $ show directObject
|
||||
|
||||
loadFonts :: Dictionary -> T FontSet
|
||||
loadFonts = foldM addFont Map.empty . Map.toList
|
||||
|
@ -100,23 +86,18 @@ key keyName dictionary =
|
|||
errorMessage =
|
||||
printf "Key %s not found in dictionary %s" keyName (show dictionary)
|
||||
|
||||
target :: DirectObject -> Either String ObjectId
|
||||
target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
|
||||
target directObject = expected "reference" directObject
|
||||
|
||||
many :: DirectObject -> [DirectObject]
|
||||
many (Array l) = l
|
||||
many directObject = [directObject]
|
||||
|
||||
follow :: DirectObject -> T Object
|
||||
follow directObject = target directObject //= getObject
|
||||
|
||||
{-
|
||||
dict :: Object -> Either String Dictionary
|
||||
dict (Direct (Dictionary dictionary)) = Right dictionary
|
||||
dict obj = expected "dictionary" obj
|
||||
|
||||
dictObject :: String -> Dictionary -> T Dictionary
|
||||
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
|
||||
-}
|
||||
|
||||
pagesList :: T [ObjectId]
|
||||
pagesList = do
|
||||
|
@ -132,20 +113,23 @@ getReferences objects = do
|
|||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||
_ -> []
|
||||
|
||||
extractText :: Object -> T [Text]
|
||||
extractText object = do
|
||||
pageDict <- lift $ dict object
|
||||
extractText :: Dictionary -> T [Text]
|
||||
extractText pageDict = do
|
||||
fonts <- loadFonts =<< getFontDictionary pageDict
|
||||
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
||||
concat <$> (objects //= (mapM $ loadContent fonts))
|
||||
where
|
||||
loadContent :: FontSet -> DirectObject -> T [Text]
|
||||
loadContent fonts directObject =
|
||||
follow directObject \\= stream \\= Content.parse \\= renderText fonts
|
||||
follow directObject
|
||||
>>= openStream
|
||||
>>= either fail return . Content.parse
|
||||
>>= renderText fonts
|
||||
|
||||
loadPage :: ObjectId -> T Page
|
||||
loadPage source =
|
||||
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
|
||||
loadPage source = do
|
||||
contents <- extractText =<< dictionaryById source
|
||||
return $ Page {contents, source}
|
||||
|
||||
getAll :: Content -> Either String (Map Int Page)
|
||||
getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty
|
||||
|
|
Loading…
Reference in a new issue