Implement PDF's multilayer updates and use it in getObj to display only the current version of the object taken into account instead of the concatenation of all its versions

This commit is contained in:
Tissevert 2019-09-22 01:40:39 +02:00
parent 3a39c75e6a
commit 68f90d20e2
3 changed files with 54 additions and 3 deletions

View File

@ -20,6 +20,7 @@ library
, PDF.EOL
, PDF.Object
, PDF.Output
, PDF.Update
other-modules: Data.ByteString.Char8.Util
, PDF.Body
, PDF.Parser

View File

@ -3,7 +3,7 @@
import Codec.Compression.Zlib (decompress)
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, putStr, toStrict)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
import Data.Map ((!))
import qualified Data.Map as Map (lookup)
import PDF (Document(..), parseDocument)
@ -11,6 +11,7 @@ import qualified PDF.EOL as EOL (Style)
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
import PDF.Output (ObjectId(..))
import qualified PDF.Output as Output (render)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
@ -26,12 +27,14 @@ display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
extractObject :: ObjectId -> Document -> ByteString
extractObject objectId (Document {eolStyle, updates}) =
Lazy.concat $ display eolStyle . (!objectId) . objects <$> updates
display eolStyle . (!objectId) $ objects content
where
content = unify updates
main :: IO ()
main = do
[inputFile, objectId] <- getArgs
result <- parseDocument <$> BS.readFile inputFile
case result of
Left parseError -> hPutStrLn stderr $ show parseError
Left parseError -> hPutStrLn stderr parseError
Right doc -> Lazy.putStr $ extractObject (ObjectId (read objectId)) doc

47
src/PDF/Update.hs Normal file
View File

@ -0,0 +1,47 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.Update (
unify
) where
import Data.Map (member)
import qualified Data.Map as Map (empty, union)
import PDF.Object (
Content(..), IndexedObjects, IndirectObjCoordinates(..), Occurrence(..)
, Structure(..)
)
emptyContent :: Content
emptyContent = Content {
docStructure = Structure {xRef = Map.empty, trailer = Map.empty}
, objects = Map.empty
, occurrences = []
}
unify :: [Content] -> Content
unify = foldl complete emptyContent
where
complete tmpContent older =
let mergedObjects = Map.union (objects tmpContent) (objects older) in
Content {
docStructure =
unifyDocStructure (docStructure tmpContent) (docStructure older)
, objects = mergedObjects
, occurrences =
unifyOccurrences mergedObjects (occurrences tmpContent) (occurrences older)
}
unifyDocStructure :: Structure -> Structure -> Structure
unifyDocStructure update original = Structure {
xRef = Map.union (xRef update) (xRef original)
, trailer = Map.union (trailer update) (trailer original)
}
unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence]
unifyOccurrences objects update = foldr addOlder update
where
addOlder occurrence@(Comment _) existing = occurrence : existing
addOlder occurrence@(Indirect indirect) existing =
if objectId indirect `member` objects
then occurrence : existing
else existing