Simplify navigations by centering everything on Objects to avoid needing to many conversion tools between DirectObject / Object / Dictionary
This commit is contained in:
parent
23186100a8
commit
6096a1a237
|
@ -8,7 +8,7 @@ import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn)
|
||||||
import PDF (Document(..), parseDocument)
|
import PDF (Document(..), parseDocument)
|
||||||
import PDF.Object (Content(..), Object(..))
|
import PDF.Object (Content(..), Object(..))
|
||||||
import PDF.Object.Navigation (
|
import PDF.Object.Navigation (
|
||||||
Error(..), (//), castObject, objectById, openStream, origin
|
Error(..), (//), objectById, openStream, origin
|
||||||
)
|
)
|
||||||
import PDF.Output (ObjectId(..), Output)
|
import PDF.Output (ObjectId(..), Output)
|
||||||
import qualified PDF.Output as Output (render)
|
import qualified PDF.Output as Output (render)
|
||||||
|
@ -34,7 +34,7 @@ parse [inputFile, key] =
|
||||||
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
|
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
|
||||||
where
|
where
|
||||||
byId = objectById . ObjectId
|
byId = objectById . ObjectId
|
||||||
byPath path = (origin >>= (// (explode path)) >>= castObject)
|
byPath path = origin // (explode path)
|
||||||
explode "" = []
|
explode "" = []
|
||||||
explode path =
|
explode path =
|
||||||
case break (== '.') path of
|
case break (== '.') path of
|
||||||
|
|
|
@ -4,12 +4,11 @@
|
||||||
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||||
module PDF.Object.Navigation (
|
module PDF.Object.Navigation (
|
||||||
Error(..)
|
Error(..)
|
||||||
|
, (./)
|
||||||
, (//)
|
, (//)
|
||||||
, castObject
|
, (>./)
|
||||||
, dictionaryById
|
, (>//)
|
||||||
, getDictionary
|
, getDictionary
|
||||||
, getField
|
|
||||||
, follow
|
|
||||||
, objectById
|
, objectById
|
||||||
, openStream
|
, openStream
|
||||||
, origin
|
, origin
|
||||||
|
@ -39,21 +38,12 @@ newtype Error a = Error {
|
||||||
instance MonadFail Error where
|
instance MonadFail Error where
|
||||||
fail = Error . Left
|
fail = Error . Left
|
||||||
|
|
||||||
castDictionary :: MonadFail m => Object -> m Dictionary
|
getDictionary :: PDFContent m => Object -> m Dictionary
|
||||||
castDictionary (Direct (Dictionary aDict)) = return aDict
|
getDictionary (Direct (Dictionary aDict)) = return aDict
|
||||||
castDictionary (Stream {header}) = return header
|
getDictionary (Direct (Reference (IndirectObjCoordinates {objectId}))) =
|
||||||
castDictionary obj = expected "dictionary : " obj
|
objectById objectId >>= getDictionary
|
||||||
|
getDictionary (Stream {header}) = return header
|
||||||
castObject :: PDFContent m => DirectObject -> m Object
|
getDictionary obj = expected "dictionary : " obj
|
||||||
castObject directObject =
|
|
||||||
(castObjectId directObject >>= objectById) <|> return (Direct directObject)
|
|
||||||
|
|
||||||
castObjectId :: MonadFail m => DirectObject -> m ObjectId
|
|
||||||
castObjectId (Reference (IndirectObjCoordinates {objectId})) = return objectId
|
|
||||||
castObjectId directObject = expected "reference" directObject
|
|
||||||
|
|
||||||
dictionaryById :: PDFContent m => ObjectId -> m Dictionary
|
|
||||||
dictionaryById objectId = objectById objectId >>= castDictionary
|
|
||||||
|
|
||||||
expected :: (MonadFail m, Show a) => String -> a -> m b
|
expected :: (MonadFail m, Show a) => String -> a -> m b
|
||||||
expected name = fail . printf "Not a %s: %s" name . show
|
expected name = fail . printf "Not a %s: %s" name . show
|
||||||
|
@ -65,28 +55,30 @@ getField key aDictionary =
|
||||||
errorMessage =
|
errorMessage =
|
||||||
printf "Key %s not found in dictionary %s" key (show aDictionary)
|
printf "Key %s not found in dictionary %s" key (show aDictionary)
|
||||||
|
|
||||||
follow :: PDFContent m => DirectObject -> m Object
|
|
||||||
follow directObject = castObjectId directObject >>= objectById
|
|
||||||
|
|
||||||
objectById :: PDFContent m => ObjectId -> m Object
|
objectById :: PDFContent m => ObjectId -> m Object
|
||||||
objectById objectId = do
|
objectById objectId = do
|
||||||
content <- ask
|
content <- ask
|
||||||
return (objects content ! objectId)
|
return (objects content ! objectId)
|
||||||
|
|
||||||
getDictionary :: PDFContent m => DirectObject -> m Dictionary
|
(./) :: PDFContent m => m Object -> String -> m Object
|
||||||
getDictionary (Dictionary aDictionary) = return aDictionary
|
(./) object key = (object >>= getDictionary >>= getField key >>= castObject)
|
||||||
getDictionary (Reference (IndirectObjCoordinates {objectId})) =
|
where
|
||||||
objectById objectId >>= castDictionary
|
castObject (Reference (IndirectObjCoordinates {objectId})) =
|
||||||
getDictionary aDirectObject =
|
objectById objectId
|
||||||
expected "resource (dictionary or reference)" aDirectObject
|
castObject directObject = return $ Direct directObject
|
||||||
|
|
||||||
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
|
(//) :: PDFContent m => m Object -> [String] -> m Object
|
||||||
(//) aDict [] = return $ Dictionary aDict
|
(//) object [] = object
|
||||||
(//) aDict [key] = getField key aDict
|
(//) object (key:keys) = object ./ key // keys
|
||||||
(//) aDict (key:keys) = getField key aDict >>= getDictionary >>= (// keys)
|
|
||||||
|
|
||||||
origin :: PDFContent m => m Dictionary
|
(>./) :: PDFContent m => Object -> String -> m Object
|
||||||
origin = trailer . docStructure <$> ask
|
(>./) object = (return object ./)
|
||||||
|
|
||||||
|
(>//) :: PDFContent m => Object -> [String] -> m Object
|
||||||
|
(>//) object = (return object //)
|
||||||
|
|
||||||
|
origin :: PDFContent m => m Object
|
||||||
|
origin = Direct . Dictionary . trailer . docStructure <$> ask
|
||||||
|
|
||||||
openStream :: MonadFail m => Object -> m ByteString
|
openStream :: MonadFail m => Object -> m ByteString
|
||||||
openStream (Stream {header, streamContent}) = return $
|
openStream (Stream {header, streamContent}) = return $
|
||||||
|
|
|
@ -20,10 +20,11 @@ import PDF.Encoding (encoding)
|
||||||
import PDF.Font (Font, FontSet)
|
import PDF.Font (Font, FontSet)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
, Name(..)
|
, Name(..), Object(..)
|
||||||
,)
|
,)
|
||||||
import PDF.Object.Navigation (
|
import PDF.Object.Navigation (
|
||||||
Error(..), (//), dictionaryById, getDictionary, getField, follow, openStream, origin
|
Error(..), (//), (>./), (>//), getDictionary, objectById, openStream
|
||||||
|
, origin
|
||||||
)
|
)
|
||||||
import PDF.Output (ObjectId(..))
|
import PDF.Output (ObjectId(..))
|
||||||
import Prelude hiding (fail)
|
import Prelude hiding (fail)
|
||||||
|
@ -36,9 +37,9 @@ data Page = Page {
|
||||||
, source :: ObjectId
|
, source :: ObjectId
|
||||||
}
|
}
|
||||||
|
|
||||||
getFontDictionary :: Dictionary -> T Dictionary
|
getFontDictionary :: Object -> T Dictionary
|
||||||
getFontDictionary pageDict =
|
getFontDictionary pageObj =
|
||||||
((pageDict // ["Resources", "Font"]) >>= getDictionary)
|
(pageObj >// ["Resources", "Font"] >>= getDictionary)
|
||||||
<|> return Map.empty
|
<|> return Map.empty
|
||||||
|
|
||||||
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
||||||
|
@ -51,20 +52,17 @@ cache loader objectId =
|
||||||
return value
|
return value
|
||||||
|
|
||||||
loadFont :: ObjectId -> T Font
|
loadFont :: ObjectId -> T Font
|
||||||
loadFont objectId = dictionaryById objectId >>= tryMappings
|
loadFont objectId = objectById objectId >>= tryMappings
|
||||||
where
|
where
|
||||||
tryMappings dictionary =
|
tryMappings object =
|
||||||
loadCMap dictionary
|
(object >./ "ToUnicode" >>= openStream >>= cMap)
|
||||||
<|> (getField "Encoding" dictionary >>= loadEncoding)
|
<|> (object >./ "Encoding" >>= loadEncoding)
|
||||||
<|> (fail $ unknownFormat (show objectId) (show dictionary))
|
<|> (fail $ unknownFormat (show objectId) (show object))
|
||||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||||
loadCMap :: Dictionary -> T Font
|
loadEncoding :: Object -> T Font
|
||||||
loadCMap dictionary =
|
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
||||||
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
|
loadEncoding object =
|
||||||
loadEncoding :: DirectObject -> T Font
|
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
||||||
loadEncoding (NameObject (Name name)) = encoding name
|
|
||||||
loadEncoding directObject =
|
|
||||||
fail $ printf "Encoding must be a name, not that : %s" $ show directObject
|
|
||||||
|
|
||||||
loadFonts :: Dictionary -> T FontSet
|
loadFonts :: Dictionary -> T FontSet
|
||||||
loadFonts = foldM addFont Map.empty . Map.toList
|
loadFonts = foldM addFont Map.empty . Map.toList
|
||||||
|
@ -74,13 +72,13 @@ loadFonts = foldM addFont Map.empty . Map.toList
|
||||||
flip (Map.insert name) output <$> cache loadFont objectId
|
flip (Map.insert name) output <$> cache loadFont objectId
|
||||||
addFont output _ = return output
|
addFont output _ = return output
|
||||||
|
|
||||||
several :: DirectObject -> [DirectObject]
|
several :: Object -> [Object]
|
||||||
several (Array l) = l
|
several (Direct (Array l)) = Direct <$> l
|
||||||
several directObject = [directObject]
|
several object = [object]
|
||||||
|
|
||||||
pagesList :: T [ObjectId]
|
pagesList :: T [ObjectId]
|
||||||
pagesList = do
|
pagesList = do
|
||||||
pages <- (origin :: T Dictionary) >>= (// ["Root", "Pages"]) >>= getDictionary
|
pages <- origin // ["Root", "Pages"] >>= getDictionary
|
||||||
case Map.lookup (Name "Kids") pages of
|
case Map.lookup (Name "Kids") pages of
|
||||||
Just (Array kids) -> return $ getReferences kids
|
Just (Array kids) -> return $ getReferences kids
|
||||||
_ -> return []
|
_ -> return []
|
||||||
|
@ -92,22 +90,21 @@ getReferences objects = do
|
||||||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||||
_ -> []
|
_ -> []
|
||||||
|
|
||||||
extractText :: Dictionary -> T [Text]
|
extractText :: Object -> T [Text]
|
||||||
extractText pageDict = do
|
extractText pageObj = do
|
||||||
fonts <- loadFonts =<< getFontDictionary pageDict
|
fonts <- loadFonts =<< getFontDictionary pageObj
|
||||||
objects <- several <$> getField "Contents" pageDict
|
objects <- several <$> pageObj >./ "Contents"
|
||||||
concat <$> mapM (loadContent fonts) objects
|
concat <$> mapM (loadContent fonts) objects
|
||||||
where
|
where
|
||||||
loadContent :: FontSet -> DirectObject -> T [Text]
|
loadContent :: FontSet -> Object -> T [Text]
|
||||||
loadContent fonts directObject =
|
loadContent fonts object =
|
||||||
follow directObject
|
openStream object
|
||||||
>>= openStream
|
|
||||||
>>= (either fail return . Content.parse)
|
>>= (either fail return . Content.parse)
|
||||||
>>= renderText fonts
|
>>= renderText fonts
|
||||||
|
|
||||||
loadPage :: ObjectId -> T Page
|
loadPage :: ObjectId -> T Page
|
||||||
loadPage source = do
|
loadPage source = do
|
||||||
contents <- extractText =<< dictionaryById source
|
contents <- extractText =<< objectById source
|
||||||
return $ Page {contents, source}
|
return $ Page {contents, source}
|
||||||
|
|
||||||
getAll :: Content -> Either String (Map Int Page)
|
getAll :: Content -> Either String (Map Int Page)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user