From bcf2e05bfbd8d8ad96e2ed70c80a63b5f6c62feb Mon Sep 17 00:00:00 2001 From: Tissevert Date: Mon, 17 Feb 2020 15:29:59 +0100 Subject: [PATCH] Move Content out of Object module into a separate one incorporating PDF.Update (which is actually an operation that is defined only on that structure), and rename it Layer to avoid confusion with Content streams as defined in the specs (which have their own PDF.Content module already) --- Hufflepdf.cabal | 2 +- examples/getContent.hs | 11 +++++ examples/getObj.hs | 10 ++--- examples/getText.hs | 9 ++-- src/PDF.hs | 13 +++--- src/PDF/Body.hs | 9 ++-- src/PDF/Layer.hs | 80 ++++++++++++++++++++++++++++++++++++ src/PDF/Object.hs | 48 ++++------------------ src/PDF/Object/Navigation.hs | 5 ++- src/PDF/Pages.hs | 9 ++-- src/PDF/Update.hs | 47 --------------------- 11 files changed, 129 insertions(+), 114 deletions(-) create mode 100644 examples/getContent.hs create mode 100644 src/PDF/Layer.hs delete mode 100644 src/PDF/Update.hs diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index 8279b94..710156b 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -20,12 +20,12 @@ library , PDF.CMap , PDF.Content , PDF.EOL + , PDF.Layer , PDF.Object , PDF.Object.Navigation , PDF.Output , PDF.Parser , PDF.Pages - , PDF.Update other-modules: Data.ByteString.Char8.Util , PDF.Content.Operator , PDF.Content.Operator.Color diff --git a/examples/getContent.hs b/examples/getContent.hs new file mode 100644 index 0000000..a43d48d --- /dev/null +++ b/examples/getContent.hs @@ -0,0 +1,11 @@ +main :: IO () +main = do + hSetBuffering stdout LineBuffering + args <- getArgs + case args of + --[inputFile] -> wholeDoc inputFile + [inputFile, pageNumber] -> do + content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile + get content (read pageNumber) + singlePage inputFile (read pageNumber) + _ -> die "Syntax: getContent INPUT_FILE PAGE_NUMBER" diff --git a/examples/getObj.hs b/examples/getObj.hs index 1b05412..c0a43f0 100644 --- a/examples/getObj.hs +++ b/examples/getObj.hs @@ -6,13 +6,13 @@ import Data.ByteString.Lazy.Char8 (ByteString) import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn) import PDF (Document(..), parseDocument) -import PDF.Object (Content(..), Object(..)) +import PDF.Layer (Layer(..), unify) +import PDF.Object (Object(..)) import PDF.Object.Navigation ( Error(..), (//), objectById, openStream, origin ) import PDF.Output (ObjectId(..), Output) import qualified PDF.Output as Output (render) -import PDF.Update (unify) import System.Environment (getArgs, getProgName) import System.Exit (die) import Text.Printf (printf) @@ -24,9 +24,9 @@ decodedStream object = where replaceContent streamContent = object {streamContent} -display :: Output a => ReaderT Content Error a -> Document -> Either String ByteString -display getter (Document {eolStyle, updates}) = - Output.render eolStyle <$> runError (runReaderT getter (unify updates)) +display :: Output a => ReaderT Layer Error a -> Document -> Either String ByteString +display getter (Document {eolStyle, layers}) = + Output.render eolStyle <$> runError (runReaderT getter (unify layers)) parse :: [String] -> IO (FilePath, Document -> Either String ByteString) parse [inputFile] = return (inputFile, display origin) diff --git a/examples/getText.hs b/examples/getText.hs index 51a2a9f..506b751 100644 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -2,17 +2,16 @@ import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.Map as Map (toList) import qualified Data.Text.IO as Text (putStrLn) import PDF (Document(..), parseDocument) -import PDF.Object (Content) +import PDF.Layer (Layer, unify) import PDF.Pages (Page(..), get, getAll) -import PDF.Update (unify) import System.Environment (getArgs) import System.Exit (die) import System.IO (BufferMode(..), hSetBuffering, stdout) -onDoc :: FilePath -> (Content -> Either String a) -> IO a +onDoc :: FilePath -> (Layer -> Either String a) -> IO a onDoc inputFile f = do - content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile - case content >>= f of + layer <- fmap (unify . layers) . parseDocument <$> BS.readFile inputFile + case layer >>= f of Left someError -> die someError Right value -> return value diff --git a/src/PDF.hs b/src/PDF.hs index d140e1e..f60991f 100644 --- a/src/PDF.hs +++ b/src/PDF.hs @@ -14,8 +14,9 @@ import qualified Data.ByteString.Lazy as Lazy (ByteString) import qualified Data.Map as Map (lookup) import PDF.Body (populate) import qualified PDF.EOL as EOL (Style(..), charset, parser) +import PDF.Layer (Layer) import PDF.Object ( - Content(..), DirectObject(..), InputStructure(..), Name(..), Number(..) + DirectObject(..), InputStructure(..), Name(..), Number(..) , Structure(..) , eofMarker, magicNumber, structure ) @@ -27,13 +28,13 @@ import Text.Printf (printf) data Document = Document { pdfVersion :: String , eolStyle :: EOL.Style - , updates :: [Content] + , layers :: [Layer] } deriving Show instance Output Document where - output (Document {pdfVersion, updates}) = + output (Document {pdfVersion, layers}) = Output.line (printf "%%PDF-%s" pdfVersion) - `mappend` output updates + `mappend` output layers render :: Document -> Lazy.ByteString render document@(Document {eolStyle}) = Output.render eolStyle document @@ -99,5 +100,5 @@ parseDocument input = do (pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input startXref <- readStartXref eolStyle input structuresRead <- readStructures startXref input - let updates = populate input <$> structuresRead - return $ Document {pdfVersion, eolStyle, updates} + let layers = populate input <$> structuresRead + return $ Document {pdfVersion, eolStyle, layers} diff --git a/src/PDF/Body.hs b/src/PDF/Body.hs index a2d996c..f5d81ed 100644 --- a/src/PDF/Body.hs +++ b/src/PDF/Body.hs @@ -12,8 +12,9 @@ import qualified Data.ByteString.Char8 as BS (cons, drop, unpack) import Data.Map ((!)) import qualified Data.Map as Map (empty, insert, lookup) import qualified PDF.EOL as EOL (charset, parser) +import PDF.Layer (Layer(..)) import PDF.Object ( - Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..) + DirectObject(..), Flow(..), IndirectObjCoordinates(..) , InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..) , Structure(..), XRefEntry(..), XRefSection , blank, dictionary, directObject, integer, line @@ -107,14 +108,14 @@ occurrence :: SParser Occurrence occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates "comment or object" -populate :: ByteString -> InputStructure -> Content +populate :: ByteString -> InputStructure -> Layer populate input structure = let bodyInput = BS.drop (startOffset structure) input in case evalParser recurseOnOccurrences initialState bodyInput of - Left _ -> Content {occurrences = [], objects = Map.empty, docStructure} + Left _ -> Layer {occurrences = [], objects = Map.empty, docStructure} Right finalState -> let Flow {occurrencesStack, tmpObjects} = flow finalState in - Content { + Layer { occurrences = reverse occurrencesStack, objects = tmpObjects, docStructure } where diff --git a/src/PDF/Layer.hs b/src/PDF/Layer.hs new file mode 100644 index 0000000..97e0f0b --- /dev/null +++ b/src/PDF/Layer.hs @@ -0,0 +1,80 @@ +{-# LANGUAGE NamedFieldPuns #-} +module PDF.Layer ( + Layer(..) + , unify + ) where + +import Data.Map (Map, (!), mapWithKey, member) +import qualified Data.Map as Map (empty, union) +import PDF.Object ( + IndexedObjects, IndirectObjCoordinates(..), Occurrence(..) + , Structure(..), XRefEntry(..), XRefSection, eofMarker, outputBody + ) +import qualified PDF.Output as Output (line) +import PDF.Output ( + Offset, Output(..), Resource(..), byteString, getOffset, getOffsets, newLine + ) +import Text.Printf (printf) + +data Layer = Layer { + occurrences :: [Occurrence] + , objects :: IndexedObjects + , docStructure :: Structure + } deriving Show + +updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset) +updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef) + where + updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)} + updateEntry _ e = e + +instance Output Layer where + output (Layer {occurrences, objects, docStructure}) = + fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects)) + >>= \(body, (xref, startXRef)) -> mconcat [ + body + , Output.line "xref" + , output xref + , Output.line "trailer" + , output trailer, newLine + , Output.line "startxref" + , Output.line (printf "%d" (getOffset startXRef)) + , byteString eofMarker + ] + where + Structure {xRef, trailer} = docStructure + +emptyLayer :: Layer +emptyLayer = Layer { + docStructure = Structure {xRef = Map.empty, trailer = Map.empty} + , objects = Map.empty + , occurrences = [] + } + +unify :: [Layer] -> Layer +unify = foldl complete emptyLayer + where + complete tmpLayer older = + let mergedObjects = Map.union (objects tmpLayer) (objects older) in + Layer { + docStructure = + unifyDocStructure (docStructure tmpLayer) (docStructure older) + , objects = mergedObjects + , occurrences = + unifyOccurrences mergedObjects (occurrences tmpLayer) (occurrences older) + } + +unifyDocStructure :: Structure -> Structure -> Structure +unifyDocStructure update original = Structure { + xRef = Map.union (xRef update) (xRef original) + , trailer = Map.union (trailer update) (trailer original) + } + +unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence] +unifyOccurrences objects update = foldr addOlder update + where + addOlder occurrence@(Comment _) existing = occurrence : existing + addOlder occurrence@(Indirect indirect) existing = + if objectId indirect `member` objects + then occurrence : existing + else existing diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index f7b6400..ce0bcdc 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -2,8 +2,7 @@ {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE FlexibleInstances #-} module PDF.Object ( - Content(..) - , Dictionary + Dictionary , DirectObject(..) , Flow(..) , IndexedObjects @@ -27,6 +26,7 @@ module PDF.Object ( , magicNumber , name , number + , outputBody , regular , stringObject , structure @@ -41,7 +41,7 @@ import qualified Data.ByteString.Char8 as Char8 ( cons, length, pack, singleton, snoc, unpack ) import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape) -import Data.Map (Map, (!), mapWithKey) +import Data.Map (Map, (!)) import qualified Data.Map as Map ( delete, empty, fromList, lookup, minViewWithKey, toList, union ) @@ -50,8 +50,7 @@ import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.Output as Output (line, string) import PDF.Output ( OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..) - , byteString, getObjectId, getOffset, getOffsets, join, newLine - , saveOffset + , byteString, getObjectId, getOffset, join, newLine, saveOffset ) import PDF.Parser (MonadParser(..), Parser, (), octDigit, oneOf) import Text.Printf (printf) @@ -272,6 +271,10 @@ outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNum , Output.line "endobj" ] +outputBody :: ([Occurrence], IndexedObjects) -> OBuilder +outputBody (occurrences, objects) = + output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef + ------------------------------------- -- XREF TABLE ------------------------------------- @@ -369,12 +372,6 @@ structure = <$> xRefSection <*> (string "trailer" *> blank *> dictionary <* EOL.parser) -updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset) -updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef) - where - updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)} - updateEntry _ e = e - -- -- Flow -- @@ -382,32 +379,3 @@ data Flow = Flow { occurrencesStack :: [Occurrence] , tmpObjects :: IndexedObjects } deriving Show - --- --- Content --- -data Content = Content { - occurrences :: [Occurrence] - , objects :: IndexedObjects - , docStructure :: Structure - } deriving Show - -outputBody :: ([Occurrence], IndexedObjects) -> OBuilder -outputBody (occurrences, objects) = - output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef - -instance Output Content where - output (Content {occurrences, objects, docStructure}) = - fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects)) - >>= \(body, (xref, startXRef)) -> mconcat [ - body - , Output.line "xref" - , output xref - , Output.line "trailer" - , output trailer, newLine - , Output.line "startxref" - , Output.line (printf "%d" (getOffset startXRef)) - , byteString eofMarker - ] - where - Structure {xRef, trailer} = docStructure diff --git a/src/PDF/Object/Navigation.hs b/src/PDF/Object/Navigation.hs index 02c0568..92ec370 100644 --- a/src/PDF/Object/Navigation.hs +++ b/src/PDF/Object/Navigation.hs @@ -23,15 +23,16 @@ import Data.ByteString (ByteString) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import Data.Map ((!)) import qualified Data.Map as Map (lookup) +import PDF.Layer (Layer(..)) import PDF.Object ( - Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) + Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Name(..), Object(..), Structure(..) ) import PDF.Output (ObjectId) import Prelude hiding (fail) import Text.Printf (printf) -type PDFContent m = (Alternative m, MonadReader Content m, MonadFail m) +type PDFContent m = (Alternative m, MonadReader Layer m, MonadFail m) newtype Error a = Error { runError :: Either String a } deriving (Alternative, Functor, Applicative, Monad, MonadPlus) diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs index b332f8d..b951a74 100755 --- a/src/PDF/Pages.hs +++ b/src/PDF/Pages.hs @@ -18,8 +18,9 @@ import qualified PDF.Content as Content (parse) import PDF.Content.Text (renderText) import PDF.Encoding (encoding) import PDF.Font (Font, FontSet) +import PDF.Layer (Layer(..)) import PDF.Object ( - Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) + Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Name(..), Object(..) ,) import PDF.Object.Navigation ( @@ -31,7 +32,7 @@ import Prelude hiding (fail) import Text.Printf (printf) type CachedFonts = Map ObjectId Font -type T = RWST Content () CachedFonts Error +type T = RWST Layer () CachedFonts Error data Page = Page { contents :: [Text] , source :: ObjectId @@ -107,13 +108,13 @@ loadPage source = do contents <- extractText =<< objectById source return $ Page {contents, source} -getAll :: Content -> Either String (Map Int Page) +getAll :: Layer -> Either String (Map Int Page) getAll content = runError $ fst <$> evalRWST getPages content Map.empty where numbered = Map.fromList . zip [1..] getPages = numbered <$> (mapM loadPage =<< pagesList) -get :: Content -> Int -> Either String Page +get :: Layer -> Int -> Either String Page get content pageNumber | pageNumber < 1 = Left "Pages start at 1" | otherwise = runError $ fst <$> evalRWST getPage content Map.empty diff --git a/src/PDF/Update.hs b/src/PDF/Update.hs deleted file mode 100644 index 57ced3c..0000000 --- a/src/PDF/Update.hs +++ /dev/null @@ -1,47 +0,0 @@ -{-# LANGUAGE NamedFieldPuns #-} -module PDF.Update ( - unify - ) where - -import Data.Map (member) -import qualified Data.Map as Map (empty, union) -import PDF.Object ( - Content(..), IndexedObjects, IndirectObjCoordinates(..), Occurrence(..) - , Structure(..) - ) - -emptyContent :: Content -emptyContent = Content { - docStructure = Structure {xRef = Map.empty, trailer = Map.empty} - , objects = Map.empty - , occurrences = [] - } - -unify :: [Content] -> Content -unify = foldl complete emptyContent - where - complete tmpContent older = - let mergedObjects = Map.union (objects tmpContent) (objects older) in - Content { - docStructure = - unifyDocStructure (docStructure tmpContent) (docStructure older) - , objects = mergedObjects - , occurrences = - unifyOccurrences mergedObjects (occurrences tmpContent) (occurrences older) - } - -unifyDocStructure :: Structure -> Structure -> Structure -unifyDocStructure update original = Structure { - xRef = Map.union (xRef update) (xRef original) - , trailer = Map.union (trailer update) (trailer original) - } - -unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence] -unifyOccurrences objects update = foldr addOlder update - where - addOlder occurrence@(Comment _) existing = occurrence : existing - addOlder occurrence@(Indirect indirect) existing = - if objectId indirect `member` objects - then occurrence : existing - else existing -