Simplify navigations by centering everything on Objects to avoid needing to many conversion tools between DirectObject / Object / Dictionary
This commit is contained in:
parent
23186100a8
commit
6096a1a237
3 changed files with 55 additions and 66 deletions
|
@ -8,7 +8,7 @@ import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn)
|
|||
import PDF (Document(..), parseDocument)
|
||||
import PDF.Object (Content(..), Object(..))
|
||||
import PDF.Object.Navigation (
|
||||
Error(..), (//), castObject, objectById, openStream, origin
|
||||
Error(..), (//), objectById, openStream, origin
|
||||
)
|
||||
import PDF.Output (ObjectId(..), Output)
|
||||
import qualified PDF.Output as Output (render)
|
||||
|
@ -34,7 +34,7 @@ parse [inputFile, key] =
|
|||
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
|
||||
where
|
||||
byId = objectById . ObjectId
|
||||
byPath path = (origin >>= (// (explode path)) >>= castObject)
|
||||
byPath path = origin // (explode path)
|
||||
explode "" = []
|
||||
explode path =
|
||||
case break (== '.') path of
|
||||
|
|
|
@ -4,12 +4,11 @@
|
|||
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||
module PDF.Object.Navigation (
|
||||
Error(..)
|
||||
, (./)
|
||||
, (//)
|
||||
, castObject
|
||||
, dictionaryById
|
||||
, (>./)
|
||||
, (>//)
|
||||
, getDictionary
|
||||
, getField
|
||||
, follow
|
||||
, objectById
|
||||
, openStream
|
||||
, origin
|
||||
|
@ -39,21 +38,12 @@ newtype Error a = Error {
|
|||
instance MonadFail Error where
|
||||
fail = Error . Left
|
||||
|
||||
castDictionary :: MonadFail m => Object -> m Dictionary
|
||||
castDictionary (Direct (Dictionary aDict)) = return aDict
|
||||
castDictionary (Stream {header}) = return header
|
||||
castDictionary obj = expected "dictionary : " obj
|
||||
|
||||
castObject :: PDFContent m => DirectObject -> m Object
|
||||
castObject directObject =
|
||||
(castObjectId directObject >>= objectById) <|> return (Direct directObject)
|
||||
|
||||
castObjectId :: MonadFail m => DirectObject -> m ObjectId
|
||||
castObjectId (Reference (IndirectObjCoordinates {objectId})) = return objectId
|
||||
castObjectId directObject = expected "reference" directObject
|
||||
|
||||
dictionaryById :: PDFContent m => ObjectId -> m Dictionary
|
||||
dictionaryById objectId = objectById objectId >>= castDictionary
|
||||
getDictionary :: PDFContent m => Object -> m Dictionary
|
||||
getDictionary (Direct (Dictionary aDict)) = return aDict
|
||||
getDictionary (Direct (Reference (IndirectObjCoordinates {objectId}))) =
|
||||
objectById objectId >>= getDictionary
|
||||
getDictionary (Stream {header}) = return header
|
||||
getDictionary obj = expected "dictionary : " obj
|
||||
|
||||
expected :: (MonadFail m, Show a) => String -> a -> m b
|
||||
expected name = fail . printf "Not a %s: %s" name . show
|
||||
|
@ -65,28 +55,30 @@ getField key aDictionary =
|
|||
errorMessage =
|
||||
printf "Key %s not found in dictionary %s" key (show aDictionary)
|
||||
|
||||
follow :: PDFContent m => DirectObject -> m Object
|
||||
follow directObject = castObjectId directObject >>= objectById
|
||||
|
||||
objectById :: PDFContent m => ObjectId -> m Object
|
||||
objectById objectId = do
|
||||
content <- ask
|
||||
return (objects content ! objectId)
|
||||
|
||||
getDictionary :: PDFContent m => DirectObject -> m Dictionary
|
||||
getDictionary (Dictionary aDictionary) = return aDictionary
|
||||
getDictionary (Reference (IndirectObjCoordinates {objectId})) =
|
||||
objectById objectId >>= castDictionary
|
||||
getDictionary aDirectObject =
|
||||
expected "resource (dictionary or reference)" aDirectObject
|
||||
(./) :: PDFContent m => m Object -> String -> m Object
|
||||
(./) object key = (object >>= getDictionary >>= getField key >>= castObject)
|
||||
where
|
||||
castObject (Reference (IndirectObjCoordinates {objectId})) =
|
||||
objectById objectId
|
||||
castObject directObject = return $ Direct directObject
|
||||
|
||||
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
|
||||
(//) aDict [] = return $ Dictionary aDict
|
||||
(//) aDict [key] = getField key aDict
|
||||
(//) aDict (key:keys) = getField key aDict >>= getDictionary >>= (// keys)
|
||||
(//) :: PDFContent m => m Object -> [String] -> m Object
|
||||
(//) object [] = object
|
||||
(//) object (key:keys) = object ./ key // keys
|
||||
|
||||
origin :: PDFContent m => m Dictionary
|
||||
origin = trailer . docStructure <$> ask
|
||||
(>./) :: PDFContent m => Object -> String -> m Object
|
||||
(>./) object = (return object ./)
|
||||
|
||||
(>//) :: PDFContent m => Object -> [String] -> m Object
|
||||
(>//) object = (return object //)
|
||||
|
||||
origin :: PDFContent m => m Object
|
||||
origin = Direct . Dictionary . trailer . docStructure <$> ask
|
||||
|
||||
openStream :: MonadFail m => Object -> m ByteString
|
||||
openStream (Stream {header, streamContent}) = return $
|
||||
|
|
|
@ -20,10 +20,11 @@ import PDF.Encoding (encoding)
|
|||
import PDF.Font (Font, FontSet)
|
||||
import PDF.Object (
|
||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Name(..)
|
||||
, Name(..), Object(..)
|
||||
,)
|
||||
import PDF.Object.Navigation (
|
||||
Error(..), (//), dictionaryById, getDictionary, getField, follow, openStream, origin
|
||||
Error(..), (//), (>./), (>//), getDictionary, objectById, openStream
|
||||
, origin
|
||||
)
|
||||
import PDF.Output (ObjectId(..))
|
||||
import Prelude hiding (fail)
|
||||
|
@ -36,9 +37,9 @@ data Page = Page {
|
|||
, source :: ObjectId
|
||||
}
|
||||
|
||||
getFontDictionary :: Dictionary -> T Dictionary
|
||||
getFontDictionary pageDict =
|
||||
((pageDict // ["Resources", "Font"]) >>= getDictionary)
|
||||
getFontDictionary :: Object -> T Dictionary
|
||||
getFontDictionary pageObj =
|
||||
(pageObj >// ["Resources", "Font"] >>= getDictionary)
|
||||
<|> return Map.empty
|
||||
|
||||
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
||||
|
@ -51,20 +52,17 @@ cache loader objectId =
|
|||
return value
|
||||
|
||||
loadFont :: ObjectId -> T Font
|
||||
loadFont objectId = dictionaryById objectId >>= tryMappings
|
||||
loadFont objectId = objectById objectId >>= tryMappings
|
||||
where
|
||||
tryMappings dictionary =
|
||||
loadCMap dictionary
|
||||
<|> (getField "Encoding" dictionary >>= loadEncoding)
|
||||
<|> (fail $ unknownFormat (show objectId) (show dictionary))
|
||||
tryMappings object =
|
||||
(object >./ "ToUnicode" >>= openStream >>= cMap)
|
||||
<|> (object >./ "Encoding" >>= loadEncoding)
|
||||
<|> (fail $ unknownFormat (show objectId) (show object))
|
||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||
loadCMap :: Dictionary -> T Font
|
||||
loadCMap dictionary =
|
||||
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
|
||||
loadEncoding :: DirectObject -> T Font
|
||||
loadEncoding (NameObject (Name name)) = encoding name
|
||||
loadEncoding directObject =
|
||||
fail $ printf "Encoding must be a name, not that : %s" $ show directObject
|
||||
loadEncoding :: Object -> T Font
|
||||
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
||||
loadEncoding object =
|
||||
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
||||
|
||||
loadFonts :: Dictionary -> T FontSet
|
||||
loadFonts = foldM addFont Map.empty . Map.toList
|
||||
|
@ -74,13 +72,13 @@ loadFonts = foldM addFont Map.empty . Map.toList
|
|||
flip (Map.insert name) output <$> cache loadFont objectId
|
||||
addFont output _ = return output
|
||||
|
||||
several :: DirectObject -> [DirectObject]
|
||||
several (Array l) = l
|
||||
several directObject = [directObject]
|
||||
several :: Object -> [Object]
|
||||
several (Direct (Array l)) = Direct <$> l
|
||||
several object = [object]
|
||||
|
||||
pagesList :: T [ObjectId]
|
||||
pagesList = do
|
||||
pages <- (origin :: T Dictionary) >>= (// ["Root", "Pages"]) >>= getDictionary
|
||||
pages <- origin // ["Root", "Pages"] >>= getDictionary
|
||||
case Map.lookup (Name "Kids") pages of
|
||||
Just (Array kids) -> return $ getReferences kids
|
||||
_ -> return []
|
||||
|
@ -92,22 +90,21 @@ getReferences objects = do
|
|||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||
_ -> []
|
||||
|
||||
extractText :: Dictionary -> T [Text]
|
||||
extractText pageDict = do
|
||||
fonts <- loadFonts =<< getFontDictionary pageDict
|
||||
objects <- several <$> getField "Contents" pageDict
|
||||
extractText :: Object -> T [Text]
|
||||
extractText pageObj = do
|
||||
fonts <- loadFonts =<< getFontDictionary pageObj
|
||||
objects <- several <$> pageObj >./ "Contents"
|
||||
concat <$> mapM (loadContent fonts) objects
|
||||
where
|
||||
loadContent :: FontSet -> DirectObject -> T [Text]
|
||||
loadContent fonts directObject =
|
||||
follow directObject
|
||||
>>= openStream
|
||||
loadContent :: FontSet -> Object -> T [Text]
|
||||
loadContent fonts object =
|
||||
openStream object
|
||||
>>= (either fail return . Content.parse)
|
||||
>>= renderText fonts
|
||||
|
||||
loadPage :: ObjectId -> T Page
|
||||
loadPage source = do
|
||||
contents <- extractText =<< dictionaryById source
|
||||
contents <- extractText =<< objectById source
|
||||
return $ Page {contents, source}
|
||||
|
||||
getAll :: Content -> Either String (Map Int Page)
|
||||
|
|
Loading…
Reference in a new issue