WIP: in the process of migrating to Object.Navigation in Pages, still unsure how to manage simple Content parsing and efficient font loading (+ giving a way to edit Contents)
This commit is contained in:
parent
704d7a7fcf
commit
af994cb50c
1 changed files with 21 additions and 37 deletions
|
@ -24,7 +24,9 @@ import PDF.Object (
|
||||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
, Object(..), Name(..), Structure(..)
|
, Object(..), Name(..), Structure(..)
|
||||||
,)
|
,)
|
||||||
import PDF.Object.Navigation ((//), getDictionary, origin, stream)
|
import PDF.Object.Navigation (
|
||||||
|
(//), dictionaryById, getDictionary, getField, follow, openStream, origin
|
||||||
|
)
|
||||||
import PDF.Output (ObjectId(..))
|
import PDF.Output (ObjectId(..))
|
||||||
import Prelude hiding (fail)
|
import Prelude hiding (fail)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
@ -36,26 +38,10 @@ data Page = Page {
|
||||||
, source :: ObjectId
|
, source :: ObjectId
|
||||||
}
|
}
|
||||||
|
|
||||||
infixl 1 \\=
|
|
||||||
(\\=) :: T a -> (a -> Either String b) -> T b
|
|
||||||
x \\= f = x >>= lift . f
|
|
||||||
|
|
||||||
infixl 1 //=
|
|
||||||
(//=) :: Either String a -> (a -> T b) -> T b
|
|
||||||
x //= f = lift x >>= f
|
|
||||||
|
|
||||||
getResource :: DirectObject -> T Dictionary
|
|
||||||
getResource (Dictionary dictionary) = return dictionary
|
|
||||||
getResource (Reference (IndirectObjCoordinates {objectId})) =
|
|
||||||
getObject objectId \\= dict
|
|
||||||
getResource directObject =
|
|
||||||
lift $ expected "resource (dictionary or reference)" directObject
|
|
||||||
|
|
||||||
getFontDictionary :: Dictionary -> T Dictionary
|
getFontDictionary :: Dictionary -> T Dictionary
|
||||||
getFontDictionary pageDict =
|
getFontDictionary pageDict =
|
||||||
key "Resources" pageDict
|
((pageDict // ["Resources", "Font"]) >>= getDictionary)
|
||||||
//= getResource
|
<|> return Map.empty
|
||||||
>>= either (const $ return Map.empty) getResource . key "Font"
|
|
||||||
|
|
||||||
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
||||||
cache loader objectId =
|
cache loader objectId =
|
||||||
|
@ -67,18 +53,18 @@ cache loader objectId =
|
||||||
return value
|
return value
|
||||||
|
|
||||||
loadFont :: ObjectId -> T Font
|
loadFont :: ObjectId -> T Font
|
||||||
loadFont objectId = getObject objectId \\= dict >>= tryMappings
|
loadFont objectId = dictionaryById objectId >>= tryMappings
|
||||||
where
|
where
|
||||||
tryMappings dictionary =
|
tryMappings dictionary =
|
||||||
loadCMap dictionary
|
loadCMap dictionary
|
||||||
<|> lift (key "Encoding" dictionary >>= loadEncoding)
|
<|> (getField "Encoding" dictionary >>= loadEncoding)
|
||||||
<|> (fail $ unknownFormat (show objectId) (show dictionary))
|
<|> (fail $ unknownFormat (show objectId) (show dictionary))
|
||||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||||
loadCMap dictionary =
|
loadCMap dictionary =
|
||||||
getField "ToUnicode" dictionary >>= follow \\= stream >>= cMap
|
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
|
||||||
loadEncoding (NameObject (Name name)) = encoding name
|
loadEncoding (NameObject (Name name)) = return $ encoding name
|
||||||
loadEncoding directObject =
|
loadEncoding directObject =
|
||||||
Left . printf "Encoding must be a name, not that : %s" $ show directObject
|
fail $ "Encoding must be a name, not that : %s" $ show directObject
|
||||||
|
|
||||||
loadFonts :: Dictionary -> T FontSet
|
loadFonts :: Dictionary -> T FontSet
|
||||||
loadFonts = foldM addFont Map.empty . Map.toList
|
loadFonts = foldM addFont Map.empty . Map.toList
|
||||||
|
@ -100,23 +86,18 @@ key keyName dictionary =
|
||||||
errorMessage =
|
errorMessage =
|
||||||
printf "Key %s not found in dictionary %s" keyName (show dictionary)
|
printf "Key %s not found in dictionary %s" keyName (show dictionary)
|
||||||
|
|
||||||
target :: DirectObject -> Either String ObjectId
|
|
||||||
target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
|
|
||||||
target directObject = expected "reference" directObject
|
|
||||||
|
|
||||||
many :: DirectObject -> [DirectObject]
|
many :: DirectObject -> [DirectObject]
|
||||||
many (Array l) = l
|
many (Array l) = l
|
||||||
many directObject = [directObject]
|
many directObject = [directObject]
|
||||||
|
|
||||||
follow :: DirectObject -> T Object
|
{-
|
||||||
follow directObject = target directObject //= getObject
|
|
||||||
|
|
||||||
dict :: Object -> Either String Dictionary
|
dict :: Object -> Either String Dictionary
|
||||||
dict (Direct (Dictionary dictionary)) = Right dictionary
|
dict (Direct (Dictionary dictionary)) = Right dictionary
|
||||||
dict obj = expected "dictionary" obj
|
dict obj = expected "dictionary" obj
|
||||||
|
|
||||||
dictObject :: String -> Dictionary -> T Dictionary
|
dictObject :: String -> Dictionary -> T Dictionary
|
||||||
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
|
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
|
||||||
|
-}
|
||||||
|
|
||||||
pagesList :: T [ObjectId]
|
pagesList :: T [ObjectId]
|
||||||
pagesList = do
|
pagesList = do
|
||||||
|
@ -132,20 +113,23 @@ getReferences objects = do
|
||||||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||||
_ -> []
|
_ -> []
|
||||||
|
|
||||||
extractText :: Object -> T [Text]
|
extractText :: Dictionary -> T [Text]
|
||||||
extractText object = do
|
extractText pageDict = do
|
||||||
pageDict <- lift $ dict object
|
|
||||||
fonts <- loadFonts =<< getFontDictionary pageDict
|
fonts <- loadFonts =<< getFontDictionary pageDict
|
||||||
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
||||||
concat <$> (objects //= (mapM $ loadContent fonts))
|
concat <$> (objects //= (mapM $ loadContent fonts))
|
||||||
where
|
where
|
||||||
loadContent :: FontSet -> DirectObject -> T [Text]
|
loadContent :: FontSet -> DirectObject -> T [Text]
|
||||||
loadContent fonts directObject =
|
loadContent fonts directObject =
|
||||||
follow directObject \\= stream \\= Content.parse \\= renderText fonts
|
follow directObject
|
||||||
|
>>= openStream
|
||||||
|
>>= either fail return . Content.parse
|
||||||
|
>>= renderText fonts
|
||||||
|
|
||||||
loadPage :: ObjectId -> T Page
|
loadPage :: ObjectId -> T Page
|
||||||
loadPage source =
|
loadPage source = do
|
||||||
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
|
contents <- extractText =<< dictionaryById source
|
||||||
|
return $ Page {contents, source}
|
||||||
|
|
||||||
getAll :: Content -> Either String (Map Int Page)
|
getAll :: Content -> Either String (Map Int Page)
|
||||||
getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty
|
getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty
|
||||||
|
|
Loading…
Reference in a new issue