From bcf2e05bfbd8d8ad96e2ed70c80a63b5f6c62feb Mon Sep 17 00:00:00 2001
From: Tissevert <tissevert+devel@marvid.fr>
Date: Mon, 17 Feb 2020 15:29:59 +0100
Subject: [PATCH] Move Content out of Object module into a separate one
 incorporating PDF.Update (which is actually an operation that is defined only
 on that structure), and rename it Layer to avoid confusion with Content
 streams as defined in the specs (which have their own PDF.Content module
 already)

---
 Hufflepdf.cabal              |  2 +-
 examples/getContent.hs       | 11 +++++
 examples/getObj.hs           | 10 ++---
 examples/getText.hs          |  9 ++--
 src/PDF.hs                   | 13 +++---
 src/PDF/Body.hs              |  9 ++--
 src/PDF/Layer.hs             | 80 ++++++++++++++++++++++++++++++++++++
 src/PDF/Object.hs            | 48 ++++------------------
 src/PDF/Object/Navigation.hs |  5 ++-
 src/PDF/Pages.hs             |  9 ++--
 src/PDF/Update.hs            | 47 ---------------------
 11 files changed, 129 insertions(+), 114 deletions(-)
 create mode 100644 examples/getContent.hs
 create mode 100644 src/PDF/Layer.hs
 delete mode 100644 src/PDF/Update.hs

diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal
index 8279b94..710156b 100644
--- a/Hufflepdf.cabal
+++ b/Hufflepdf.cabal
@@ -20,12 +20,12 @@ library
                      , PDF.CMap
                      , PDF.Content
                      , PDF.EOL
+                     , PDF.Layer
                      , PDF.Object
                      , PDF.Object.Navigation
                      , PDF.Output
                      , PDF.Parser
                      , PDF.Pages
-                     , PDF.Update
   other-modules:       Data.ByteString.Char8.Util
                      , PDF.Content.Operator
                      , PDF.Content.Operator.Color
diff --git a/examples/getContent.hs b/examples/getContent.hs
new file mode 100644
index 0000000..a43d48d
--- /dev/null
+++ b/examples/getContent.hs
@@ -0,0 +1,11 @@
+main :: IO ()
+main = do
+  hSetBuffering stdout LineBuffering
+  args <- getArgs
+  case args of
+    --[inputFile] -> wholeDoc inputFile
+    [inputFile, pageNumber] -> do
+      content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
+      get content (read pageNumber)
+      singlePage inputFile (read pageNumber)
+    _ -> die "Syntax: getContent INPUT_FILE PAGE_NUMBER"
diff --git a/examples/getObj.hs b/examples/getObj.hs
index 1b05412..c0a43f0 100644
--- a/examples/getObj.hs
+++ b/examples/getObj.hs
@@ -6,13 +6,13 @@ import Data.ByteString.Lazy.Char8 (ByteString)
 import qualified Data.ByteString.Char8 as BS (readFile)
 import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn)
 import PDF (Document(..), parseDocument)
-import PDF.Object (Content(..), Object(..))
+import PDF.Layer (Layer(..), unify)
+import PDF.Object (Object(..))
 import PDF.Object.Navigation (
     Error(..), (//), objectById, openStream, origin
   )
 import PDF.Output (ObjectId(..), Output)
 import qualified PDF.Output as Output (render)
-import PDF.Update (unify)
 import System.Environment (getArgs, getProgName)
 import System.Exit (die)
 import Text.Printf (printf)
@@ -24,9 +24,9 @@ decodedStream object =
   where
     replaceContent streamContent = object {streamContent}
 
-display :: Output a => ReaderT Content Error a -> Document -> Either String ByteString
-display getter (Document {eolStyle, updates}) =
-  Output.render eolStyle <$> runError (runReaderT getter (unify updates))
+display :: Output a => ReaderT Layer Error a -> Document -> Either String ByteString
+display getter (Document {eolStyle, layers}) =
+  Output.render eolStyle <$> runError (runReaderT getter (unify layers))
 
 parse :: [String] -> IO (FilePath, Document -> Either String ByteString)
 parse [inputFile] = return (inputFile, display origin)
diff --git a/examples/getText.hs b/examples/getText.hs
index 51a2a9f..506b751 100644
--- a/examples/getText.hs
+++ b/examples/getText.hs
@@ -2,17 +2,16 @@ import qualified Data.ByteString.Char8 as BS (readFile)
 import qualified Data.Map as Map (toList)
 import qualified Data.Text.IO as Text (putStrLn)
 import PDF (Document(..), parseDocument)
-import PDF.Object (Content)
+import PDF.Layer (Layer, unify)
 import PDF.Pages (Page(..), get, getAll)
-import PDF.Update (unify)
 import System.Environment (getArgs)
 import System.Exit (die)
 import System.IO (BufferMode(..), hSetBuffering, stdout)
 
-onDoc :: FilePath -> (Content -> Either String a) -> IO a
+onDoc :: FilePath -> (Layer -> Either String a) -> IO a
 onDoc inputFile f = do
-  content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
-  case content >>= f of
+  layer <- fmap (unify . layers) . parseDocument <$> BS.readFile inputFile
+  case layer >>= f of
     Left someError -> die someError
     Right value -> return value
 
diff --git a/src/PDF.hs b/src/PDF.hs
index d140e1e..f60991f 100644
--- a/src/PDF.hs
+++ b/src/PDF.hs
@@ -14,8 +14,9 @@ import qualified Data.ByteString.Lazy as Lazy (ByteString)
 import qualified Data.Map as Map (lookup)
 import PDF.Body (populate)
 import qualified PDF.EOL as EOL (Style(..), charset, parser)
+import PDF.Layer (Layer)
 import PDF.Object (
-      Content(..), DirectObject(..), InputStructure(..), Name(..), Number(..)
+      DirectObject(..), InputStructure(..), Name(..), Number(..)
     , Structure(..)
     , eofMarker, magicNumber, structure
   )
@@ -27,13 +28,13 @@ import Text.Printf (printf)
 data Document = Document {
       pdfVersion :: String
     , eolStyle :: EOL.Style
-    , updates :: [Content]
+    , layers :: [Layer]
   } deriving Show
 
 instance Output Document where
-  output (Document {pdfVersion, updates}) =
+  output (Document {pdfVersion, layers}) =
       Output.line (printf "%%PDF-%s" pdfVersion)
-    `mappend` output updates
+    `mappend` output layers
 
 render :: Document -> Lazy.ByteString
 render document@(Document {eolStyle}) = Output.render eolStyle document
@@ -99,5 +100,5 @@ parseDocument input = do
   (pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input
   startXref <- readStartXref eolStyle input
   structuresRead <- readStructures startXref input
-  let updates = populate input <$> structuresRead
-  return $ Document {pdfVersion, eolStyle, updates}
+  let layers = populate input <$> structuresRead
+  return $ Document {pdfVersion, eolStyle, layers}
diff --git a/src/PDF/Body.hs b/src/PDF/Body.hs
index a2d996c..f5d81ed 100644
--- a/src/PDF/Body.hs
+++ b/src/PDF/Body.hs
@@ -12,8 +12,9 @@ import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
 import Data.Map ((!))
 import qualified Data.Map as Map (empty, insert, lookup)
 import qualified PDF.EOL as EOL (charset, parser)
+import PDF.Layer (Layer(..))
 import PDF.Object (
-      Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
+      DirectObject(..), Flow(..), IndirectObjCoordinates(..)
     , InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
     , Structure(..), XRefEntry(..), XRefSection
     , blank, dictionary, directObject, integer, line
@@ -107,14 +108,14 @@ occurrence :: SParser Occurrence
 occurrence =
   Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
 
-populate :: ByteString -> InputStructure -> Content
+populate :: ByteString -> InputStructure -> Layer
 populate input structure =
   let bodyInput = BS.drop (startOffset structure) input in
   case evalParser recurseOnOccurrences initialState bodyInput of
-    Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
+    Left _ -> Layer {occurrences = [], objects = Map.empty, docStructure}
     Right finalState ->
       let Flow {occurrencesStack, tmpObjects} = flow finalState in
-      Content {
+      Layer {
           occurrences = reverse occurrencesStack, objects = tmpObjects, docStructure
         }
   where
diff --git a/src/PDF/Layer.hs b/src/PDF/Layer.hs
new file mode 100644
index 0000000..97e0f0b
--- /dev/null
+++ b/src/PDF/Layer.hs
@@ -0,0 +1,80 @@
+{-# LANGUAGE NamedFieldPuns #-}
+module PDF.Layer (
+      Layer(..)
+    , unify
+  ) where
+
+import Data.Map (Map, (!), mapWithKey, member)
+import qualified Data.Map as Map (empty, union)
+import PDF.Object (
+      IndexedObjects, IndirectObjCoordinates(..), Occurrence(..)
+    , Structure(..), XRefEntry(..), XRefSection, eofMarker, outputBody
+  )
+import qualified PDF.Output as Output (line)
+import PDF.Output (
+    Offset, Output(..), Resource(..), byteString, getOffset, getOffsets, newLine
+  )
+import Text.Printf (printf)
+
+data Layer = Layer {
+      occurrences :: [Occurrence]
+    , objects :: IndexedObjects
+    , docStructure :: Structure
+  } deriving Show
+
+updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
+updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
+  where
+    updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
+    updateEntry _ e = e
+
+instance Output Layer where
+  output (Layer {occurrences, objects, docStructure}) =
+    fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
+    >>= \(body, (xref, startXRef)) -> mconcat [
+          body
+        , Output.line "xref"
+        , output xref
+        , Output.line "trailer"
+        , output trailer, newLine
+        , Output.line "startxref"
+        , Output.line (printf "%d" (getOffset startXRef))
+        , byteString eofMarker
+      ]
+    where
+      Structure {xRef, trailer} = docStructure
+
+emptyLayer :: Layer
+emptyLayer = Layer {
+      docStructure = Structure {xRef = Map.empty, trailer = Map.empty}
+    , objects = Map.empty
+    , occurrences = []
+  }
+
+unify :: [Layer] -> Layer
+unify = foldl complete emptyLayer
+  where
+    complete tmpLayer older =
+      let mergedObjects = Map.union (objects tmpLayer) (objects older) in
+      Layer {
+          docStructure =
+            unifyDocStructure (docStructure tmpLayer) (docStructure older)
+        , objects = mergedObjects
+        , occurrences = 
+            unifyOccurrences mergedObjects (occurrences tmpLayer) (occurrences older)
+      }
+
+unifyDocStructure :: Structure -> Structure -> Structure
+unifyDocStructure update original = Structure {
+      xRef = Map.union (xRef update) (xRef original)
+    , trailer = Map.union (trailer update) (trailer original)
+  }
+
+unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence]
+unifyOccurrences objects update = foldr addOlder update
+  where
+    addOlder occurrence@(Comment _) existing = occurrence : existing
+    addOlder occurrence@(Indirect indirect) existing =
+      if objectId indirect `member` objects
+      then occurrence : existing
+      else existing
diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs
index f7b6400..ce0bcdc 100644
--- a/src/PDF/Object.hs
+++ b/src/PDF/Object.hs
@@ -2,8 +2,7 @@
 {-# LANGUAGE NamedFieldPuns #-}
 {-# LANGUAGE FlexibleInstances #-}
 module PDF.Object (
-      Content(..)
-    , Dictionary
+      Dictionary
     , DirectObject(..)
     , Flow(..)
     , IndexedObjects
@@ -27,6 +26,7 @@ module PDF.Object (
     , magicNumber
     , name
     , number
+    , outputBody
     , regular
     , stringObject
     , structure
@@ -41,7 +41,7 @@ import qualified Data.ByteString.Char8 as Char8 (
     cons, length, pack, singleton, snoc, unpack
   )
 import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
-import Data.Map (Map, (!), mapWithKey)
+import Data.Map (Map, (!))
 import qualified Data.Map as Map (
     delete, empty, fromList, lookup, minViewWithKey, toList, union
   )
@@ -50,8 +50,7 @@ import qualified PDF.EOL as EOL (charset, parser)
 import qualified PDF.Output as Output (line, string)
 import PDF.Output (
       OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
-    , byteString, getObjectId, getOffset, getOffsets, join, newLine
-    , saveOffset
+    , byteString, getObjectId, getOffset, join, newLine, saveOffset
   )
 import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
 import Text.Printf (printf)
@@ -272,6 +271,10 @@ outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNum
       , Output.line "endobj"
     ]
 
+outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
+outputBody (occurrences, objects) =
+  output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
+
 -------------------------------------
 --          XREF TABLE
 -------------------------------------
@@ -369,12 +372,6 @@ structure =
   <$> xRefSection
   <*> (string "trailer" *> blank *> dictionary <* EOL.parser)
 
-updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
-updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
-  where
-    updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
-    updateEntry _ e = e
-
 --
 -- Flow
 --
@@ -382,32 +379,3 @@ data Flow = Flow {
       occurrencesStack :: [Occurrence]
     , tmpObjects :: IndexedObjects
   } deriving Show
-
---
--- Content
---
-data Content = Content {
-      occurrences :: [Occurrence]
-    , objects :: IndexedObjects
-    , docStructure :: Structure
-  } deriving Show
-
-outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
-outputBody (occurrences, objects) =
-  output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
-
-instance Output Content where
-  output (Content {occurrences, objects, docStructure}) =
-    fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
-    >>= \(body, (xref, startXRef)) -> mconcat [
-          body
-        , Output.line "xref"
-        , output xref
-        , Output.line "trailer"
-        , output trailer, newLine
-        , Output.line "startxref"
-        , Output.line (printf "%d" (getOffset startXRef))
-        , byteString eofMarker
-      ]
-    where
-      Structure {xRef, trailer} = docStructure
diff --git a/src/PDF/Object/Navigation.hs b/src/PDF/Object/Navigation.hs
index 02c0568..92ec370 100644
--- a/src/PDF/Object/Navigation.hs
+++ b/src/PDF/Object/Navigation.hs
@@ -23,15 +23,16 @@ import Data.ByteString (ByteString)
 import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
 import Data.Map ((!))
 import qualified Data.Map as Map (lookup)
+import PDF.Layer (Layer(..))
 import PDF.Object (
-      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
+      Dictionary, DirectObject(..), IndirectObjCoordinates(..)
     , Name(..), Object(..), Structure(..)
   )
 import PDF.Output (ObjectId)
 import Prelude hiding (fail)
 import Text.Printf (printf)
 
-type PDFContent m = (Alternative m, MonadReader Content m, MonadFail m)
+type PDFContent m = (Alternative m, MonadReader Layer m, MonadFail m)
 newtype Error a = Error {
     runError :: Either String a
   } deriving (Alternative, Functor, Applicative, Monad, MonadPlus)
diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs
index b332f8d..b951a74 100755
--- a/src/PDF/Pages.hs
+++ b/src/PDF/Pages.hs
@@ -18,8 +18,9 @@ import qualified PDF.Content as Content (parse)
 import PDF.Content.Text (renderText)
 import PDF.Encoding (encoding)
 import PDF.Font (Font, FontSet)
+import PDF.Layer (Layer(..))
 import PDF.Object (
-      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
+      Dictionary, DirectObject(..), IndirectObjCoordinates(..)
     , Name(..), Object(..)
   ,)
 import PDF.Object.Navigation (
@@ -31,7 +32,7 @@ import Prelude hiding (fail)
 import Text.Printf (printf)
 
 type CachedFonts = Map ObjectId Font
-type T = RWST Content () CachedFonts Error
+type T = RWST Layer () CachedFonts Error
 data Page = Page {
       contents :: [Text]
     , source :: ObjectId
@@ -107,13 +108,13 @@ loadPage source = do
   contents <- extractText =<< objectById source
   return $ Page {contents, source}
 
-getAll :: Content -> Either String (Map Int Page)
+getAll :: Layer -> Either String (Map Int Page)
 getAll content = runError $ fst <$> evalRWST getPages content Map.empty
   where
     numbered = Map.fromList . zip [1..]
     getPages = numbered <$> (mapM loadPage =<< pagesList)
 
-get :: Content -> Int -> Either String Page
+get :: Layer -> Int -> Either String Page
 get content pageNumber
   | pageNumber < 1 = Left "Pages start at 1"
   | otherwise = runError $ fst <$> evalRWST getPage content Map.empty
diff --git a/src/PDF/Update.hs b/src/PDF/Update.hs
deleted file mode 100644
index 57ced3c..0000000
--- a/src/PDF/Update.hs
+++ /dev/null
@@ -1,47 +0,0 @@
-{-# LANGUAGE NamedFieldPuns #-}
-module PDF.Update (
-    unify
-  ) where
-
-import Data.Map (member)
-import qualified Data.Map as Map (empty, union)
-import PDF.Object (
-      Content(..), IndexedObjects, IndirectObjCoordinates(..), Occurrence(..)
-    , Structure(..)
-  )
-
-emptyContent :: Content
-emptyContent = Content {
-      docStructure = Structure {xRef = Map.empty, trailer = Map.empty}
-    , objects = Map.empty
-    , occurrences = []
-  }
-
-unify :: [Content] -> Content
-unify = foldl complete emptyContent
-  where
-    complete tmpContent older =
-      let mergedObjects = Map.union (objects tmpContent) (objects older) in
-      Content {
-          docStructure =
-            unifyDocStructure (docStructure tmpContent) (docStructure older)
-        , objects = mergedObjects
-        , occurrences = 
-            unifyOccurrences mergedObjects (occurrences tmpContent) (occurrences older)
-      }
-
-unifyDocStructure :: Structure -> Structure -> Structure
-unifyDocStructure update original = Structure {
-      xRef = Map.union (xRef update) (xRef original)
-    , trailer = Map.union (trailer update) (trailer original)
-  }
-
-unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence]
-unifyOccurrences objects update = foldr addOlder update
-  where
-    addOlder occurrence@(Comment _) existing = occurrence : existing
-    addOlder occurrence@(Indirect indirect) existing =
-      if objectId indirect `member` objects
-      then occurrence : existing
-      else existing
-