diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index 8279b94..710156b 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -20,12 +20,12 @@ library , PDF.CMap , PDF.Content , PDF.EOL + , PDF.Layer , PDF.Object , PDF.Object.Navigation , PDF.Output , PDF.Parser , PDF.Pages - , PDF.Update other-modules: Data.ByteString.Char8.Util , PDF.Content.Operator , PDF.Content.Operator.Color diff --git a/examples/getContent.hs b/examples/getContent.hs new file mode 100644 index 0000000..a43d48d --- /dev/null +++ b/examples/getContent.hs @@ -0,0 +1,11 @@ +main :: IO () +main = do + hSetBuffering stdout LineBuffering + args <- getArgs + case args of + --[inputFile] -> wholeDoc inputFile + [inputFile, pageNumber] -> do + content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile + get content (read pageNumber) + singlePage inputFile (read pageNumber) + _ -> die "Syntax: getContent INPUT_FILE PAGE_NUMBER" diff --git a/examples/getObj.hs b/examples/getObj.hs index 1b05412..c0a43f0 100644 --- a/examples/getObj.hs +++ b/examples/getObj.hs @@ -6,13 +6,13 @@ import Data.ByteString.Lazy.Char8 (ByteString) import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn) import PDF (Document(..), parseDocument) -import PDF.Object (Content(..), Object(..)) +import PDF.Layer (Layer(..), unify) +import PDF.Object (Object(..)) import PDF.Object.Navigation ( Error(..), (//), objectById, openStream, origin ) import PDF.Output (ObjectId(..), Output) import qualified PDF.Output as Output (render) -import PDF.Update (unify) import System.Environment (getArgs, getProgName) import System.Exit (die) import Text.Printf (printf) @@ -24,9 +24,9 @@ decodedStream object = where replaceContent streamContent = object {streamContent} -display :: Output a => ReaderT Content Error a -> Document -> Either String ByteString -display getter (Document {eolStyle, updates}) = - Output.render eolStyle <$> runError (runReaderT getter (unify updates)) +display :: Output a => ReaderT Layer Error a -> Document -> Either String ByteString +display getter (Document {eolStyle, layers}) = + Output.render eolStyle <$> runError (runReaderT getter (unify layers)) parse :: [String] -> IO (FilePath, Document -> Either String ByteString) parse [inputFile] = return (inputFile, display origin) diff --git a/examples/getText.hs b/examples/getText.hs index 51a2a9f..506b751 100644 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -2,17 +2,16 @@ import qualified Data.ByteString.Char8 as BS (readFile) import qualified Data.Map as Map (toList) import qualified Data.Text.IO as Text (putStrLn) import PDF (Document(..), parseDocument) -import PDF.Object (Content) +import PDF.Layer (Layer, unify) import PDF.Pages (Page(..), get, getAll) -import PDF.Update (unify) import System.Environment (getArgs) import System.Exit (die) import System.IO (BufferMode(..), hSetBuffering, stdout) -onDoc :: FilePath -> (Content -> Either String a) -> IO a +onDoc :: FilePath -> (Layer -> Either String a) -> IO a onDoc inputFile f = do - content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile - case content >>= f of + layer <- fmap (unify . layers) . parseDocument <$> BS.readFile inputFile + case layer >>= f of Left someError -> die someError Right value -> return value diff --git a/src/PDF.hs b/src/PDF.hs index d140e1e..f60991f 100644 --- a/src/PDF.hs +++ b/src/PDF.hs @@ -14,8 +14,9 @@ import qualified Data.ByteString.Lazy as Lazy (ByteString) import qualified Data.Map as Map (lookup) import PDF.Body (populate) import qualified PDF.EOL as EOL (Style(..), charset, parser) +import PDF.Layer (Layer) import PDF.Object ( - Content(..), DirectObject(..), InputStructure(..), Name(..), Number(..) + DirectObject(..), InputStructure(..), Name(..), Number(..) , Structure(..) , eofMarker, magicNumber, structure ) @@ -27,13 +28,13 @@ import Text.Printf (printf) data Document = Document { pdfVersion :: String , eolStyle :: EOL.Style - , updates :: [Content] + , layers :: [Layer] } deriving Show instance Output Document where - output (Document {pdfVersion, updates}) = + output (Document {pdfVersion, layers}) = Output.line (printf "%%PDF-%s" pdfVersion) - `mappend` output updates + `mappend` output layers render :: Document -> Lazy.ByteString render document@(Document {eolStyle}) = Output.render eolStyle document @@ -99,5 +100,5 @@ parseDocument input = do (pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input startXref <- readStartXref eolStyle input structuresRead <- readStructures startXref input - let updates = populate input <$> structuresRead - return $ Document {pdfVersion, eolStyle, updates} + let layers = populate input <$> structuresRead + return $ Document {pdfVersion, eolStyle, layers} diff --git a/src/PDF/Body.hs b/src/PDF/Body.hs index a2d996c..f5d81ed 100644 --- a/src/PDF/Body.hs +++ b/src/PDF/Body.hs @@ -12,8 +12,9 @@ import qualified Data.ByteString.Char8 as BS (cons, drop, unpack) import Data.Map ((!)) import qualified Data.Map as Map (empty, insert, lookup) import qualified PDF.EOL as EOL (charset, parser) +import PDF.Layer (Layer(..)) import PDF.Object ( - Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..) + DirectObject(..), Flow(..), IndirectObjCoordinates(..) , InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..) , Structure(..), XRefEntry(..), XRefSection , blank, dictionary, directObject, integer, line @@ -107,14 +108,14 @@ occurrence :: SParser Occurrence occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates "comment or object" -populate :: ByteString -> InputStructure -> Content +populate :: ByteString -> InputStructure -> Layer populate input structure = let bodyInput = BS.drop (startOffset structure) input in case evalParser recurseOnOccurrences initialState bodyInput of - Left _ -> Content {occurrences = [], objects = Map.empty, docStructure} + Left _ -> Layer {occurrences = [], objects = Map.empty, docStructure} Right finalState -> let Flow {occurrencesStack, tmpObjects} = flow finalState in - Content { + Layer { occurrences = reverse occurrencesStack, objects = tmpObjects, docStructure } where diff --git a/src/PDF/Layer.hs b/src/PDF/Layer.hs new file mode 100644 index 0000000..97e0f0b --- /dev/null +++ b/src/PDF/Layer.hs @@ -0,0 +1,80 @@ +{-# LANGUAGE NamedFieldPuns #-} +module PDF.Layer ( + Layer(..) + , unify + ) where + +import Data.Map (Map, (!), mapWithKey, member) +import qualified Data.Map as Map (empty, union) +import PDF.Object ( + IndexedObjects, IndirectObjCoordinates(..), Occurrence(..) + , Structure(..), XRefEntry(..), XRefSection, eofMarker, outputBody + ) +import qualified PDF.Output as Output (line) +import PDF.Output ( + Offset, Output(..), Resource(..), byteString, getOffset, getOffsets, newLine + ) +import Text.Printf (printf) + +data Layer = Layer { + occurrences :: [Occurrence] + , objects :: IndexedObjects + , docStructure :: Structure + } deriving Show + +updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset) +updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef) + where + updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)} + updateEntry _ e = e + +instance Output Layer where + output (Layer {occurrences, objects, docStructure}) = + fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects)) + >>= \(body, (xref, startXRef)) -> mconcat [ + body + , Output.line "xref" + , output xref + , Output.line "trailer" + , output trailer, newLine + , Output.line "startxref" + , Output.line (printf "%d" (getOffset startXRef)) + , byteString eofMarker + ] + where + Structure {xRef, trailer} = docStructure + +emptyLayer :: Layer +emptyLayer = Layer { + docStructure = Structure {xRef = Map.empty, trailer = Map.empty} + , objects = Map.empty + , occurrences = [] + } + +unify :: [Layer] -> Layer +unify = foldl complete emptyLayer + where + complete tmpLayer older = + let mergedObjects = Map.union (objects tmpLayer) (objects older) in + Layer { + docStructure = + unifyDocStructure (docStructure tmpLayer) (docStructure older) + , objects = mergedObjects + , occurrences = + unifyOccurrences mergedObjects (occurrences tmpLayer) (occurrences older) + } + +unifyDocStructure :: Structure -> Structure -> Structure +unifyDocStructure update original = Structure { + xRef = Map.union (xRef update) (xRef original) + , trailer = Map.union (trailer update) (trailer original) + } + +unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence] +unifyOccurrences objects update = foldr addOlder update + where + addOlder occurrence@(Comment _) existing = occurrence : existing + addOlder occurrence@(Indirect indirect) existing = + if objectId indirect `member` objects + then occurrence : existing + else existing diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index f7b6400..ce0bcdc 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -2,8 +2,7 @@ {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE FlexibleInstances #-} module PDF.Object ( - Content(..) - , Dictionary + Dictionary , DirectObject(..) , Flow(..) , IndexedObjects @@ -27,6 +26,7 @@ module PDF.Object ( , magicNumber , name , number + , outputBody , regular , stringObject , structure @@ -41,7 +41,7 @@ import qualified Data.ByteString.Char8 as Char8 ( cons, length, pack, singleton, snoc, unpack ) import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape) -import Data.Map (Map, (!), mapWithKey) +import Data.Map (Map, (!)) import qualified Data.Map as Map ( delete, empty, fromList, lookup, minViewWithKey, toList, union ) @@ -50,8 +50,7 @@ import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.Output as Output (line, string) import PDF.Output ( OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..) - , byteString, getObjectId, getOffset, getOffsets, join, newLine - , saveOffset + , byteString, getObjectId, getOffset, join, newLine, saveOffset ) import PDF.Parser (MonadParser(..), Parser, (), octDigit, oneOf) import Text.Printf (printf) @@ -272,6 +271,10 @@ outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNum , Output.line "endobj" ] +outputBody :: ([Occurrence], IndexedObjects) -> OBuilder +outputBody (occurrences, objects) = + output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef + ------------------------------------- -- XREF TABLE ------------------------------------- @@ -369,12 +372,6 @@ structure = <$> xRefSection <*> (string "trailer" *> blank *> dictionary <* EOL.parser) -updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset) -updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef) - where - updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)} - updateEntry _ e = e - -- -- Flow -- @@ -382,32 +379,3 @@ data Flow = Flow { occurrencesStack :: [Occurrence] , tmpObjects :: IndexedObjects } deriving Show - --- --- Content --- -data Content = Content { - occurrences :: [Occurrence] - , objects :: IndexedObjects - , docStructure :: Structure - } deriving Show - -outputBody :: ([Occurrence], IndexedObjects) -> OBuilder -outputBody (occurrences, objects) = - output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef - -instance Output Content where - output (Content {occurrences, objects, docStructure}) = - fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects)) - >>= \(body, (xref, startXRef)) -> mconcat [ - body - , Output.line "xref" - , output xref - , Output.line "trailer" - , output trailer, newLine - , Output.line "startxref" - , Output.line (printf "%d" (getOffset startXRef)) - , byteString eofMarker - ] - where - Structure {xRef, trailer} = docStructure diff --git a/src/PDF/Object/Navigation.hs b/src/PDF/Object/Navigation.hs index 02c0568..92ec370 100644 --- a/src/PDF/Object/Navigation.hs +++ b/src/PDF/Object/Navigation.hs @@ -23,15 +23,16 @@ import Data.ByteString (ByteString) import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) import Data.Map ((!)) import qualified Data.Map as Map (lookup) +import PDF.Layer (Layer(..)) import PDF.Object ( - Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) + Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Name(..), Object(..), Structure(..) ) import PDF.Output (ObjectId) import Prelude hiding (fail) import Text.Printf (printf) -type PDFContent m = (Alternative m, MonadReader Content m, MonadFail m) +type PDFContent m = (Alternative m, MonadReader Layer m, MonadFail m) newtype Error a = Error { runError :: Either String a } deriving (Alternative, Functor, Applicative, Monad, MonadPlus) diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs index b332f8d..b951a74 100755 --- a/src/PDF/Pages.hs +++ b/src/PDF/Pages.hs @@ -18,8 +18,9 @@ import qualified PDF.Content as Content (parse) import PDF.Content.Text (renderText) import PDF.Encoding (encoding) import PDF.Font (Font, FontSet) +import PDF.Layer (Layer(..)) import PDF.Object ( - Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) + Dictionary, DirectObject(..), IndirectObjCoordinates(..) , Name(..), Object(..) ,) import PDF.Object.Navigation ( @@ -31,7 +32,7 @@ import Prelude hiding (fail) import Text.Printf (printf) type CachedFonts = Map ObjectId Font -type T = RWST Content () CachedFonts Error +type T = RWST Layer () CachedFonts Error data Page = Page { contents :: [Text] , source :: ObjectId @@ -107,13 +108,13 @@ loadPage source = do contents <- extractText =<< objectById source return $ Page {contents, source} -getAll :: Content -> Either String (Map Int Page) +getAll :: Layer -> Either String (Map Int Page) getAll content = runError $ fst <$> evalRWST getPages content Map.empty where numbered = Map.fromList . zip [1..] getPages = numbered <$> (mapM loadPage =<< pagesList) -get :: Content -> Int -> Either String Page +get :: Layer -> Int -> Either String Page get content pageNumber | pageNumber < 1 = Left "Pages start at 1" | otherwise = runError $ fst <$> evalRWST getPage content Map.empty diff --git a/src/PDF/Update.hs b/src/PDF/Update.hs deleted file mode 100644 index 57ced3c..0000000 --- a/src/PDF/Update.hs +++ /dev/null @@ -1,47 +0,0 @@ -{-# LANGUAGE NamedFieldPuns #-} -module PDF.Update ( - unify - ) where - -import Data.Map (member) -import qualified Data.Map as Map (empty, union) -import PDF.Object ( - Content(..), IndexedObjects, IndirectObjCoordinates(..), Occurrence(..) - , Structure(..) - ) - -emptyContent :: Content -emptyContent = Content { - docStructure = Structure {xRef = Map.empty, trailer = Map.empty} - , objects = Map.empty - , occurrences = [] - } - -unify :: [Content] -> Content -unify = foldl complete emptyContent - where - complete tmpContent older = - let mergedObjects = Map.union (objects tmpContent) (objects older) in - Content { - docStructure = - unifyDocStructure (docStructure tmpContent) (docStructure older) - , objects = mergedObjects - , occurrences = - unifyOccurrences mergedObjects (occurrences tmpContent) (occurrences older) - } - -unifyDocStructure :: Structure -> Structure -> Structure -unifyDocStructure update original = Structure { - xRef = Map.union (xRef update) (xRef original) - , trailer = Map.union (trailer update) (trailer original) - } - -unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence] -unifyOccurrences objects update = foldr addOlder update - where - addOlder occurrence@(Comment _) existing = occurrence : existing - addOlder occurrence@(Indirect indirect) existing = - if objectId indirect `member` objects - then occurrence : existing - else existing -