Reimplement getObj with the newest tools in PDF.Object.Navigation, in particular implement browsing by paths or random objectId access

This commit is contained in:
Tissevert 2020-02-15 10:25:09 +01:00
parent b916ab5206
commit 23186100a8
3 changed files with 38 additions and 35 deletions

View file

@ -65,7 +65,7 @@ executable getObj
, bytestring , bytestring
, containers , containers
, Hufflepdf , Hufflepdf
, zlib , mtl
ghc-options: -Wall ghc-options: -Wall
default-language: Haskell2010 default-language: Haskell2010

View file

@ -1,53 +1,51 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleContexts #-}
import Codec.Compression.Zlib (decompress) import Control.Monad.Reader (ReaderT, runReaderT)
import Data.ByteString.Lazy.Char8 (ByteString) import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict) import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn)
import Data.Map ((!?))
import qualified Data.Map as Map (keys, lookup)
import PDF (Document(..), parseDocument) import PDF (Document(..), parseDocument)
import qualified PDF.EOL as EOL (Style) import PDF.Object (Content(..), Object(..))
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..)) import PDF.Object.Navigation (
import PDF.Output (ObjectId(..)) Error(..), (//), castObject, objectById, openStream, origin
)
import PDF.Output (ObjectId(..), Output)
import qualified PDF.Output as Output (render) import qualified PDF.Output as Output (render)
import PDF.Update (unify) import PDF.Update (unify)
import System.Environment (getArgs, getProgName) import System.Environment (getArgs, getProgName)
import System.Exit (die) import System.Exit (die)
import Text.Printf (printf) import Text.Printf (printf)
import Text.Read (readMaybe)
display :: EOL.Style -> Object -> ByteString decodedStream :: Object -> Object
display eolStyle d@(Direct _) = Output.render eolStyle d decodedStream object =
display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $ maybe object replaceContent $ openStream object
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) -> Stream {
header
, streamContent = Lazy.toStrict . decompress $ Lazy.fromStrict streamContent
}
_ -> s
extractObject :: ObjectId -> Document -> Either String ByteString
extractObject objectId (Document {eolStyle, updates}) =
case objects content !? objectId of
Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
Just o -> Right $ display eolStyle o
where where
content = unify updates replaceContent streamContent = object {streamContent}
listObjectIds :: Document -> Either String [String] display :: Output a => ReaderT Content Error a -> Document -> Either String ByteString
listObjectIds = display getter (Document {eolStyle, updates}) =
Right . prependTitle . toString . Map.keys . objects . unify . updates Output.render eolStyle <$> runError (runReaderT getter (unify updates))
parse :: [String] -> IO (FilePath, Document -> Either String ByteString)
parse [inputFile] = return (inputFile, display origin)
parse [inputFile, key] =
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
where where
toString = fmap (show . getObjectId) byId = objectById . ObjectId
prependTitle = ("ObjectIds defined in this PDF:":) byPath path = (origin >>= (// (explode path)) >>= castObject)
explode "" = []
explode path =
case break (== '.') path of
(name, "") -> [name]
(name, rest) -> name : explode (drop 1 rest)
clear = display . fmap decodedStream
parse _ =
die . printf "Syntax: %s inputFile [OBJECT_ID | PATH_TO_OBJ]\n" =<< getProgName
main :: IO () main :: IO ()
main = do main = do
(inputFile, getData) <- parse =<< getArgs (inputFile, getData) <- parse =<< getArgs
input <- BS.readFile inputFile input <- BS.readFile inputFile
either die id $ (parseDocument input >>= getData) either die Lazy.putStrLn $ (parseDocument input >>= getData)
where
parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
parse [inputFile, objectId] = return
(inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName

View file

@ -5,6 +5,7 @@
module PDF.Object.Navigation ( module PDF.Object.Navigation (
Error(..) Error(..)
, (//) , (//)
, castObject
, dictionaryById , dictionaryById
, getDictionary , getDictionary
, getField , getField
@ -43,6 +44,10 @@ castDictionary (Direct (Dictionary aDict)) = return aDict
castDictionary (Stream {header}) = return header castDictionary (Stream {header}) = return header
castDictionary obj = expected "dictionary : " obj castDictionary obj = expected "dictionary : " obj
castObject :: PDFContent m => DirectObject -> m Object
castObject directObject =
(castObjectId directObject >>= objectById) <|> return (Direct directObject)
castObjectId :: MonadFail m => DirectObject -> m ObjectId castObjectId :: MonadFail m => DirectObject -> m ObjectId
castObjectId (Reference (IndirectObjCoordinates {objectId})) = return objectId castObjectId (Reference (IndirectObjCoordinates {objectId})) = return objectId
castObjectId directObject = expected "reference" directObject castObjectId directObject = expected "reference" directObject