Move Content out of Object module into a separate one incorporating PDF.Update (which is actually an operation that is defined only on that structure), and rename it Layer to avoid confusion with Content streams as defined in the specs (which have their own PDF.Content module already)

This commit is contained in:
Tissevert 2020-02-17 15:29:59 +01:00
parent 6096a1a237
commit bcf2e05bfb
11 changed files with 129 additions and 114 deletions

View File

@ -20,12 +20,12 @@ library
, PDF.CMap
, PDF.Content
, PDF.EOL
, PDF.Layer
, PDF.Object
, PDF.Object.Navigation
, PDF.Output
, PDF.Parser
, PDF.Pages
, PDF.Update
other-modules: Data.ByteString.Char8.Util
, PDF.Content.Operator
, PDF.Content.Operator.Color

11
examples/getContent.hs Normal file
View File

@ -0,0 +1,11 @@
main :: IO ()
main = do
hSetBuffering stdout LineBuffering
args <- getArgs
case args of
--[inputFile] -> wholeDoc inputFile
[inputFile, pageNumber] -> do
content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
get content (read pageNumber)
singlePage inputFile (read pageNumber)
_ -> die "Syntax: getContent INPUT_FILE PAGE_NUMBER"

View File

@ -6,13 +6,13 @@ import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn)
import PDF (Document(..), parseDocument)
import PDF.Object (Content(..), Object(..))
import PDF.Layer (Layer(..), unify)
import PDF.Object (Object(..))
import PDF.Object.Navigation (
Error(..), (//), objectById, openStream, origin
)
import PDF.Output (ObjectId(..), Output)
import qualified PDF.Output as Output (render)
import PDF.Update (unify)
import System.Environment (getArgs, getProgName)
import System.Exit (die)
import Text.Printf (printf)
@ -24,9 +24,9 @@ decodedStream object =
where
replaceContent streamContent = object {streamContent}
display :: Output a => ReaderT Content Error a -> Document -> Either String ByteString
display getter (Document {eolStyle, updates}) =
Output.render eolStyle <$> runError (runReaderT getter (unify updates))
display :: Output a => ReaderT Layer Error a -> Document -> Either String ByteString
display getter (Document {eolStyle, layers}) =
Output.render eolStyle <$> runError (runReaderT getter (unify layers))
parse :: [String] -> IO (FilePath, Document -> Either String ByteString)
parse [inputFile] = return (inputFile, display origin)

View File

@ -2,17 +2,16 @@ import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.Map as Map (toList)
import qualified Data.Text.IO as Text (putStrLn)
import PDF (Document(..), parseDocument)
import PDF.Object (Content)
import PDF.Layer (Layer, unify)
import PDF.Pages (Page(..), get, getAll)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout)
onDoc :: FilePath -> (Content -> Either String a) -> IO a
onDoc :: FilePath -> (Layer -> Either String a) -> IO a
onDoc inputFile f = do
content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
case content >>= f of
layer <- fmap (unify . layers) . parseDocument <$> BS.readFile inputFile
case layer >>= f of
Left someError -> die someError
Right value -> return value

View File

@ -14,8 +14,9 @@ import qualified Data.ByteString.Lazy as Lazy (ByteString)
import qualified Data.Map as Map (lookup)
import PDF.Body (populate)
import qualified PDF.EOL as EOL (Style(..), charset, parser)
import PDF.Layer (Layer)
import PDF.Object (
Content(..), DirectObject(..), InputStructure(..), Name(..), Number(..)
DirectObject(..), InputStructure(..), Name(..), Number(..)
, Structure(..)
, eofMarker, magicNumber, structure
)
@ -27,13 +28,13 @@ import Text.Printf (printf)
data Document = Document {
pdfVersion :: String
, eolStyle :: EOL.Style
, updates :: [Content]
, layers :: [Layer]
} deriving Show
instance Output Document where
output (Document {pdfVersion, updates}) =
output (Document {pdfVersion, layers}) =
Output.line (printf "%%PDF-%s" pdfVersion)
`mappend` output updates
`mappend` output layers
render :: Document -> Lazy.ByteString
render document@(Document {eolStyle}) = Output.render eolStyle document
@ -99,5 +100,5 @@ parseDocument input = do
(pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input
startXref <- readStartXref eolStyle input
structuresRead <- readStructures startXref input
let updates = populate input <$> structuresRead
return $ Document {pdfVersion, eolStyle, updates}
let layers = populate input <$> structuresRead
return $ Document {pdfVersion, eolStyle, layers}

View File

@ -12,8 +12,9 @@ import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
import Data.Map ((!))
import qualified Data.Map as Map (empty, insert, lookup)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Layer (Layer(..))
import PDF.Object (
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
DirectObject(..), Flow(..), IndirectObjCoordinates(..)
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
, Structure(..), XRefEntry(..), XRefSection
, blank, dictionary, directObject, integer, line
@ -107,14 +108,14 @@ occurrence :: SParser Occurrence
occurrence =
Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
populate :: ByteString -> InputStructure -> Content
populate :: ByteString -> InputStructure -> Layer
populate input structure =
let bodyInput = BS.drop (startOffset structure) input in
case evalParser recurseOnOccurrences initialState bodyInput of
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
Left _ -> Layer {occurrences = [], objects = Map.empty, docStructure}
Right finalState ->
let Flow {occurrencesStack, tmpObjects} = flow finalState in
Content {
Layer {
occurrences = reverse occurrencesStack, objects = tmpObjects, docStructure
}
where

80
src/PDF/Layer.hs Normal file
View File

@ -0,0 +1,80 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.Layer (
Layer(..)
, unify
) where
import Data.Map (Map, (!), mapWithKey, member)
import qualified Data.Map as Map (empty, union)
import PDF.Object (
IndexedObjects, IndirectObjCoordinates(..), Occurrence(..)
, Structure(..), XRefEntry(..), XRefSection, eofMarker, outputBody
)
import qualified PDF.Output as Output (line)
import PDF.Output (
Offset, Output(..), Resource(..), byteString, getOffset, getOffsets, newLine
)
import Text.Printf (printf)
data Layer = Layer {
occurrences :: [Occurrence]
, objects :: IndexedObjects
, docStructure :: Structure
} deriving Show
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
where
updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
updateEntry _ e = e
instance Output Layer where
output (Layer {occurrences, objects, docStructure}) =
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
>>= \(body, (xref, startXRef)) -> mconcat [
body
, Output.line "xref"
, output xref
, Output.line "trailer"
, output trailer, newLine
, Output.line "startxref"
, Output.line (printf "%d" (getOffset startXRef))
, byteString eofMarker
]
where
Structure {xRef, trailer} = docStructure
emptyLayer :: Layer
emptyLayer = Layer {
docStructure = Structure {xRef = Map.empty, trailer = Map.empty}
, objects = Map.empty
, occurrences = []
}
unify :: [Layer] -> Layer
unify = foldl complete emptyLayer
where
complete tmpLayer older =
let mergedObjects = Map.union (objects tmpLayer) (objects older) in
Layer {
docStructure =
unifyDocStructure (docStructure tmpLayer) (docStructure older)
, objects = mergedObjects
, occurrences =
unifyOccurrences mergedObjects (occurrences tmpLayer) (occurrences older)
}
unifyDocStructure :: Structure -> Structure -> Structure
unifyDocStructure update original = Structure {
xRef = Map.union (xRef update) (xRef original)
, trailer = Map.union (trailer update) (trailer original)
}
unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence]
unifyOccurrences objects update = foldr addOlder update
where
addOlder occurrence@(Comment _) existing = occurrence : existing
addOlder occurrence@(Indirect indirect) existing =
if objectId indirect `member` objects
then occurrence : existing
else existing

View File

@ -2,8 +2,7 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleInstances #-}
module PDF.Object (
Content(..)
, Dictionary
Dictionary
, DirectObject(..)
, Flow(..)
, IndexedObjects
@ -27,6 +26,7 @@ module PDF.Object (
, magicNumber
, name
, number
, outputBody
, regular
, stringObject
, structure
@ -41,7 +41,7 @@ import qualified Data.ByteString.Char8 as Char8 (
cons, length, pack, singleton, snoc, unpack
)
import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
import Data.Map (Map, (!), mapWithKey)
import Data.Map (Map, (!))
import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union
)
@ -50,8 +50,7 @@ import qualified PDF.EOL as EOL (charset, parser)
import qualified PDF.Output as Output (line, string)
import PDF.Output (
OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
, byteString, getObjectId, getOffset, getOffsets, join, newLine
, saveOffset
, byteString, getObjectId, getOffset, join, newLine, saveOffset
)
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
import Text.Printf (printf)
@ -272,6 +271,10 @@ outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNum
, Output.line "endobj"
]
outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
outputBody (occurrences, objects) =
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
-------------------------------------
-- XREF TABLE
-------------------------------------
@ -369,12 +372,6 @@ structure =
<$> xRefSection
<*> (string "trailer" *> blank *> dictionary <* EOL.parser)
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
where
updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
updateEntry _ e = e
--
-- Flow
--
@ -382,32 +379,3 @@ data Flow = Flow {
occurrencesStack :: [Occurrence]
, tmpObjects :: IndexedObjects
} deriving Show
--
-- Content
--
data Content = Content {
occurrences :: [Occurrence]
, objects :: IndexedObjects
, docStructure :: Structure
} deriving Show
outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
outputBody (occurrences, objects) =
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
instance Output Content where
output (Content {occurrences, objects, docStructure}) =
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
>>= \(body, (xref, startXRef)) -> mconcat [
body
, Output.line "xref"
, output xref
, Output.line "trailer"
, output trailer, newLine
, Output.line "startxref"
, Output.line (printf "%d" (getOffset startXRef))
, byteString eofMarker
]
where
Structure {xRef, trailer} = docStructure

View File

@ -23,15 +23,16 @@ import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map ((!))
import qualified Data.Map as Map (lookup)
import PDF.Layer (Layer(..))
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..), Object(..), Structure(..)
)
import PDF.Output (ObjectId)
import Prelude hiding (fail)
import Text.Printf (printf)
type PDFContent m = (Alternative m, MonadReader Content m, MonadFail m)
type PDFContent m = (Alternative m, MonadReader Layer m, MonadFail m)
newtype Error a = Error {
runError :: Either String a
} deriving (Alternative, Functor, Applicative, Monad, MonadPlus)

View File

@ -18,8 +18,9 @@ import qualified PDF.Content as Content (parse)
import PDF.Content.Text (renderText)
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Layer (Layer(..))
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..), Object(..)
,)
import PDF.Object.Navigation (
@ -31,7 +32,7 @@ import Prelude hiding (fail)
import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type T = RWST Content () CachedFonts Error
type T = RWST Layer () CachedFonts Error
data Page = Page {
contents :: [Text]
, source :: ObjectId
@ -107,13 +108,13 @@ loadPage source = do
contents <- extractText =<< objectById source
return $ Page {contents, source}
getAll :: Content -> Either String (Map Int Page)
getAll :: Layer -> Either String (Map Int Page)
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
where
numbered = Map.fromList . zip [1..]
getPages = numbered <$> (mapM loadPage =<< pagesList)
get :: Content -> Int -> Either String Page
get :: Layer -> Int -> Either String Page
get content pageNumber
| pageNumber < 1 = Left "Pages start at 1"
| otherwise = runError $ fst <$> evalRWST getPage content Map.empty

View File

@ -1,47 +0,0 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.Update (
unify
) where
import Data.Map (member)
import qualified Data.Map as Map (empty, union)
import PDF.Object (
Content(..), IndexedObjects, IndirectObjCoordinates(..), Occurrence(..)
, Structure(..)
)
emptyContent :: Content
emptyContent = Content {
docStructure = Structure {xRef = Map.empty, trailer = Map.empty}
, objects = Map.empty
, occurrences = []
}
unify :: [Content] -> Content
unify = foldl complete emptyContent
where
complete tmpContent older =
let mergedObjects = Map.union (objects tmpContent) (objects older) in
Content {
docStructure =
unifyDocStructure (docStructure tmpContent) (docStructure older)
, objects = mergedObjects
, occurrences =
unifyOccurrences mergedObjects (occurrences tmpContent) (occurrences older)
}
unifyDocStructure :: Structure -> Structure -> Structure
unifyDocStructure update original = Structure {
xRef = Map.union (xRef update) (xRef original)
, trailer = Map.union (trailer update) (trailer original)
}
unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence]
unifyOccurrences objects update = foldr addOlder update
where
addOlder occurrence@(Comment _) existing = occurrence : existing
addOlder occurrence@(Indirect indirect) existing =
if objectId indirect `member` objects
then occurrence : existing
else existing