Reimplement getObj with the newest tools in PDF.Object.Navigation, in particular implement browsing by paths or random objectId access

This commit is contained in:
Tissevert 2020-02-15 10:25:09 +01:00
parent b916ab5206
commit 23186100a8
3 changed files with 38 additions and 35 deletions

View file

@ -65,7 +65,7 @@ executable getObj
, bytestring
, containers
, Hufflepdf
, zlib
, mtl
ghc-options: -Wall
default-language: Haskell2010

View file

@ -1,53 +1,51 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleContexts #-}
import Codec.Compression.Zlib (decompress)
import Control.Monad.Reader (ReaderT, runReaderT)
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
import Data.Map ((!?))
import qualified Data.Map as Map (keys, lookup)
import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn)
import PDF (Document(..), parseDocument)
import qualified PDF.EOL as EOL (Style)
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
import PDF.Output (ObjectId(..))
import PDF.Object (Content(..), Object(..))
import PDF.Object.Navigation (
Error(..), (//), castObject, objectById, openStream, origin
)
import PDF.Output (ObjectId(..), Output)
import qualified PDF.Output as Output (render)
import PDF.Update (unify)
import System.Environment (getArgs, getProgName)
import System.Exit (die)
import Text.Printf (printf)
import Text.Read (readMaybe)
display :: EOL.Style -> Object -> ByteString
display eolStyle d@(Direct _) = Output.render eolStyle d
display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) -> Stream {
header
, streamContent = Lazy.toStrict . decompress $ Lazy.fromStrict streamContent
}
_ -> s
extractObject :: ObjectId -> Document -> Either String ByteString
extractObject objectId (Document {eolStyle, updates}) =
case objects content !? objectId of
Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
Just o -> Right $ display eolStyle o
decodedStream :: Object -> Object
decodedStream object =
maybe object replaceContent $ openStream object
where
content = unify updates
replaceContent streamContent = object {streamContent}
listObjectIds :: Document -> Either String [String]
listObjectIds =
Right . prependTitle . toString . Map.keys . objects . unify . updates
display :: Output a => ReaderT Content Error a -> Document -> Either String ByteString
display getter (Document {eolStyle, updates}) =
Output.render eolStyle <$> runError (runReaderT getter (unify updates))
parse :: [String] -> IO (FilePath, Document -> Either String ByteString)
parse [inputFile] = return (inputFile, display origin)
parse [inputFile, key] =
return (inputFile, clear . maybe (byPath key) byId $ readMaybe key)
where
toString = fmap (show . getObjectId)
prependTitle = ("ObjectIds defined in this PDF:":)
byId = objectById . ObjectId
byPath path = (origin >>= (// (explode path)) >>= castObject)
explode "" = []
explode path =
case break (== '.') path of
(name, "") -> [name]
(name, rest) -> name : explode (drop 1 rest)
clear = display . fmap decodedStream
parse _ =
die . printf "Syntax: %s inputFile [OBJECT_ID | PATH_TO_OBJ]\n" =<< getProgName
main :: IO ()
main = do
(inputFile, getData) <- parse =<< getArgs
input <- BS.readFile inputFile
either die id $ (parseDocument input >>= getData)
where
parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
parse [inputFile, objectId] = return
(inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName
either die Lazy.putStrLn $ (parseDocument input >>= getData)

View file

@ -5,6 +5,7 @@
module PDF.Object.Navigation (
Error(..)
, (//)
, castObject
, dictionaryById
, getDictionary
, getField
@ -43,6 +44,10 @@ castDictionary (Direct (Dictionary aDict)) = return aDict
castDictionary (Stream {header}) = return header
castDictionary obj = expected "dictionary : " obj
castObject :: PDFContent m => DirectObject -> m Object
castObject directObject =
(castObjectId directObject >>= objectById) <|> return (Direct directObject)
castObjectId :: MonadFail m => DirectObject -> m ObjectId
castObjectId (Reference (IndirectObjCoordinates {objectId})) = return objectId
castObjectId directObject = expected "reference" directObject