Move Content out of Object module into a separate one incorporating PDF.Update (which is actually an operation that is defined only on that structure), and rename it Layer to avoid confusion with Content streams as defined in the specs (which have their own PDF.Content module already)
This commit is contained in:
parent
6096a1a237
commit
bcf2e05bfb
11 changed files with 129 additions and 114 deletions
|
@ -20,12 +20,12 @@ library
|
|||
, PDF.CMap
|
||||
, PDF.Content
|
||||
, PDF.EOL
|
||||
, PDF.Layer
|
||||
, PDF.Object
|
||||
, PDF.Object.Navigation
|
||||
, PDF.Output
|
||||
, PDF.Parser
|
||||
, PDF.Pages
|
||||
, PDF.Update
|
||||
other-modules: Data.ByteString.Char8.Util
|
||||
, PDF.Content.Operator
|
||||
, PDF.Content.Operator.Color
|
||||
|
|
11
examples/getContent.hs
Normal file
11
examples/getContent.hs
Normal file
|
@ -0,0 +1,11 @@
|
|||
main :: IO ()
|
||||
main = do
|
||||
hSetBuffering stdout LineBuffering
|
||||
args <- getArgs
|
||||
case args of
|
||||
--[inputFile] -> wholeDoc inputFile
|
||||
[inputFile, pageNumber] -> do
|
||||
content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
|
||||
get content (read pageNumber)
|
||||
singlePage inputFile (read pageNumber)
|
||||
_ -> die "Syntax: getContent INPUT_FILE PAGE_NUMBER"
|
|
@ -6,13 +6,13 @@ import Data.ByteString.Lazy.Char8 (ByteString)
|
|||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (putStrLn)
|
||||
import PDF (Document(..), parseDocument)
|
||||
import PDF.Object (Content(..), Object(..))
|
||||
import PDF.Layer (Layer(..), unify)
|
||||
import PDF.Object (Object(..))
|
||||
import PDF.Object.Navigation (
|
||||
Error(..), (//), objectById, openStream, origin
|
||||
)
|
||||
import PDF.Output (ObjectId(..), Output)
|
||||
import qualified PDF.Output as Output (render)
|
||||
import PDF.Update (unify)
|
||||
import System.Environment (getArgs, getProgName)
|
||||
import System.Exit (die)
|
||||
import Text.Printf (printf)
|
||||
|
@ -24,9 +24,9 @@ decodedStream object =
|
|||
where
|
||||
replaceContent streamContent = object {streamContent}
|
||||
|
||||
display :: Output a => ReaderT Content Error a -> Document -> Either String ByteString
|
||||
display getter (Document {eolStyle, updates}) =
|
||||
Output.render eolStyle <$> runError (runReaderT getter (unify updates))
|
||||
display :: Output a => ReaderT Layer Error a -> Document -> Either String ByteString
|
||||
display getter (Document {eolStyle, layers}) =
|
||||
Output.render eolStyle <$> runError (runReaderT getter (unify layers))
|
||||
|
||||
parse :: [String] -> IO (FilePath, Document -> Either String ByteString)
|
||||
parse [inputFile] = return (inputFile, display origin)
|
||||
|
|
|
@ -2,17 +2,16 @@ import qualified Data.ByteString.Char8 as BS (readFile)
|
|||
import qualified Data.Map as Map (toList)
|
||||
import qualified Data.Text.IO as Text (putStrLn)
|
||||
import PDF (Document(..), parseDocument)
|
||||
import PDF.Object (Content)
|
||||
import PDF.Layer (Layer, unify)
|
||||
import PDF.Pages (Page(..), get, getAll)
|
||||
import PDF.Update (unify)
|
||||
import System.Environment (getArgs)
|
||||
import System.Exit (die)
|
||||
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||
|
||||
onDoc :: FilePath -> (Content -> Either String a) -> IO a
|
||||
onDoc :: FilePath -> (Layer -> Either String a) -> IO a
|
||||
onDoc inputFile f = do
|
||||
content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
|
||||
case content >>= f of
|
||||
layer <- fmap (unify . layers) . parseDocument <$> BS.readFile inputFile
|
||||
case layer >>= f of
|
||||
Left someError -> die someError
|
||||
Right value -> return value
|
||||
|
||||
|
|
13
src/PDF.hs
13
src/PDF.hs
|
@ -14,8 +14,9 @@ import qualified Data.ByteString.Lazy as Lazy (ByteString)
|
|||
import qualified Data.Map as Map (lookup)
|
||||
import PDF.Body (populate)
|
||||
import qualified PDF.EOL as EOL (Style(..), charset, parser)
|
||||
import PDF.Layer (Layer)
|
||||
import PDF.Object (
|
||||
Content(..), DirectObject(..), InputStructure(..), Name(..), Number(..)
|
||||
DirectObject(..), InputStructure(..), Name(..), Number(..)
|
||||
, Structure(..)
|
||||
, eofMarker, magicNumber, structure
|
||||
)
|
||||
|
@ -27,13 +28,13 @@ import Text.Printf (printf)
|
|||
data Document = Document {
|
||||
pdfVersion :: String
|
||||
, eolStyle :: EOL.Style
|
||||
, updates :: [Content]
|
||||
, layers :: [Layer]
|
||||
} deriving Show
|
||||
|
||||
instance Output Document where
|
||||
output (Document {pdfVersion, updates}) =
|
||||
output (Document {pdfVersion, layers}) =
|
||||
Output.line (printf "%%PDF-%s" pdfVersion)
|
||||
`mappend` output updates
|
||||
`mappend` output layers
|
||||
|
||||
render :: Document -> Lazy.ByteString
|
||||
render document@(Document {eolStyle}) = Output.render eolStyle document
|
||||
|
@ -99,5 +100,5 @@ parseDocument input = do
|
|||
(pdfVersion, eolStyle) <- evalParser ((,) <$> version <*> EOL.parser) () input
|
||||
startXref <- readStartXref eolStyle input
|
||||
structuresRead <- readStructures startXref input
|
||||
let updates = populate input <$> structuresRead
|
||||
return $ Document {pdfVersion, eolStyle, updates}
|
||||
let layers = populate input <$> structuresRead
|
||||
return $ Document {pdfVersion, eolStyle, layers}
|
||||
|
|
|
@ -12,8 +12,9 @@ import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
|
|||
import Data.Map ((!))
|
||||
import qualified Data.Map as Map (empty, insert, lookup)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import PDF.Layer (Layer(..))
|
||||
import PDF.Object (
|
||||
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
|
||||
DirectObject(..), Flow(..), IndirectObjCoordinates(..)
|
||||
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
|
||||
, Structure(..), XRefEntry(..), XRefSection
|
||||
, blank, dictionary, directObject, integer, line
|
||||
|
@ -107,14 +108,14 @@ occurrence :: SParser Occurrence
|
|||
occurrence =
|
||||
Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
|
||||
|
||||
populate :: ByteString -> InputStructure -> Content
|
||||
populate :: ByteString -> InputStructure -> Layer
|
||||
populate input structure =
|
||||
let bodyInput = BS.drop (startOffset structure) input in
|
||||
case evalParser recurseOnOccurrences initialState bodyInput of
|
||||
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
|
||||
Left _ -> Layer {occurrences = [], objects = Map.empty, docStructure}
|
||||
Right finalState ->
|
||||
let Flow {occurrencesStack, tmpObjects} = flow finalState in
|
||||
Content {
|
||||
Layer {
|
||||
occurrences = reverse occurrencesStack, objects = tmpObjects, docStructure
|
||||
}
|
||||
where
|
||||
|
|
80
src/PDF/Layer.hs
Normal file
80
src/PDF/Layer.hs
Normal file
|
@ -0,0 +1,80 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module PDF.Layer (
|
||||
Layer(..)
|
||||
, unify
|
||||
) where
|
||||
|
||||
import Data.Map (Map, (!), mapWithKey, member)
|
||||
import qualified Data.Map as Map (empty, union)
|
||||
import PDF.Object (
|
||||
IndexedObjects, IndirectObjCoordinates(..), Occurrence(..)
|
||||
, Structure(..), XRefEntry(..), XRefSection, eofMarker, outputBody
|
||||
)
|
||||
import qualified PDF.Output as Output (line)
|
||||
import PDF.Output (
|
||||
Offset, Output(..), Resource(..), byteString, getOffset, getOffsets, newLine
|
||||
)
|
||||
import Text.Printf (printf)
|
||||
|
||||
data Layer = Layer {
|
||||
occurrences :: [Occurrence]
|
||||
, objects :: IndexedObjects
|
||||
, docStructure :: Structure
|
||||
} deriving Show
|
||||
|
||||
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
|
||||
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
|
||||
where
|
||||
updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
|
||||
updateEntry _ e = e
|
||||
|
||||
instance Output Layer where
|
||||
output (Layer {occurrences, objects, docStructure}) =
|
||||
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
|
||||
>>= \(body, (xref, startXRef)) -> mconcat [
|
||||
body
|
||||
, Output.line "xref"
|
||||
, output xref
|
||||
, Output.line "trailer"
|
||||
, output trailer, newLine
|
||||
, Output.line "startxref"
|
||||
, Output.line (printf "%d" (getOffset startXRef))
|
||||
, byteString eofMarker
|
||||
]
|
||||
where
|
||||
Structure {xRef, trailer} = docStructure
|
||||
|
||||
emptyLayer :: Layer
|
||||
emptyLayer = Layer {
|
||||
docStructure = Structure {xRef = Map.empty, trailer = Map.empty}
|
||||
, objects = Map.empty
|
||||
, occurrences = []
|
||||
}
|
||||
|
||||
unify :: [Layer] -> Layer
|
||||
unify = foldl complete emptyLayer
|
||||
where
|
||||
complete tmpLayer older =
|
||||
let mergedObjects = Map.union (objects tmpLayer) (objects older) in
|
||||
Layer {
|
||||
docStructure =
|
||||
unifyDocStructure (docStructure tmpLayer) (docStructure older)
|
||||
, objects = mergedObjects
|
||||
, occurrences =
|
||||
unifyOccurrences mergedObjects (occurrences tmpLayer) (occurrences older)
|
||||
}
|
||||
|
||||
unifyDocStructure :: Structure -> Structure -> Structure
|
||||
unifyDocStructure update original = Structure {
|
||||
xRef = Map.union (xRef update) (xRef original)
|
||||
, trailer = Map.union (trailer update) (trailer original)
|
||||
}
|
||||
|
||||
unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence]
|
||||
unifyOccurrences objects update = foldr addOlder update
|
||||
where
|
||||
addOlder occurrence@(Comment _) existing = occurrence : existing
|
||||
addOlder occurrence@(Indirect indirect) existing =
|
||||
if objectId indirect `member` objects
|
||||
then occurrence : existing
|
||||
else existing
|
|
@ -2,8 +2,7 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
module PDF.Object (
|
||||
Content(..)
|
||||
, Dictionary
|
||||
Dictionary
|
||||
, DirectObject(..)
|
||||
, Flow(..)
|
||||
, IndexedObjects
|
||||
|
@ -27,6 +26,7 @@ module PDF.Object (
|
|||
, magicNumber
|
||||
, name
|
||||
, number
|
||||
, outputBody
|
||||
, regular
|
||||
, stringObject
|
||||
, structure
|
||||
|
@ -41,7 +41,7 @@ import qualified Data.ByteString.Char8 as Char8 (
|
|||
cons, length, pack, singleton, snoc, unpack
|
||||
)
|
||||
import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
|
||||
import Data.Map (Map, (!), mapWithKey)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (
|
||||
delete, empty, fromList, lookup, minViewWithKey, toList, union
|
||||
)
|
||||
|
@ -50,8 +50,7 @@ import qualified PDF.EOL as EOL (charset, parser)
|
|||
import qualified PDF.Output as Output (line, string)
|
||||
import PDF.Output (
|
||||
OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
|
||||
, byteString, getObjectId, getOffset, getOffsets, join, newLine
|
||||
, saveOffset
|
||||
, byteString, getObjectId, getOffset, join, newLine, saveOffset
|
||||
)
|
||||
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
|
||||
import Text.Printf (printf)
|
||||
|
@ -272,6 +271,10 @@ outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNum
|
|||
, Output.line "endobj"
|
||||
]
|
||||
|
||||
outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
|
||||
outputBody (occurrences, objects) =
|
||||
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
|
||||
|
||||
-------------------------------------
|
||||
-- XREF TABLE
|
||||
-------------------------------------
|
||||
|
@ -369,12 +372,6 @@ structure =
|
|||
<$> xRefSection
|
||||
<*> (string "trailer" *> blank *> dictionary <* EOL.parser)
|
||||
|
||||
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
|
||||
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
|
||||
where
|
||||
updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
|
||||
updateEntry _ e = e
|
||||
|
||||
--
|
||||
-- Flow
|
||||
--
|
||||
|
@ -382,32 +379,3 @@ data Flow = Flow {
|
|||
occurrencesStack :: [Occurrence]
|
||||
, tmpObjects :: IndexedObjects
|
||||
} deriving Show
|
||||
|
||||
--
|
||||
-- Content
|
||||
--
|
||||
data Content = Content {
|
||||
occurrences :: [Occurrence]
|
||||
, objects :: IndexedObjects
|
||||
, docStructure :: Structure
|
||||
} deriving Show
|
||||
|
||||
outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
|
||||
outputBody (occurrences, objects) =
|
||||
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
|
||||
|
||||
instance Output Content where
|
||||
output (Content {occurrences, objects, docStructure}) =
|
||||
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
|
||||
>>= \(body, (xref, startXRef)) -> mconcat [
|
||||
body
|
||||
, Output.line "xref"
|
||||
, output xref
|
||||
, Output.line "trailer"
|
||||
, output trailer, newLine
|
||||
, Output.line "startxref"
|
||||
, Output.line (printf "%d" (getOffset startXRef))
|
||||
, byteString eofMarker
|
||||
]
|
||||
where
|
||||
Structure {xRef, trailer} = docStructure
|
||||
|
|
|
@ -23,15 +23,16 @@ import Data.ByteString (ByteString)
|
|||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||
import Data.Map ((!))
|
||||
import qualified Data.Map as Map (lookup)
|
||||
import PDF.Layer (Layer(..))
|
||||
import PDF.Object (
|
||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Name(..), Object(..), Structure(..)
|
||||
)
|
||||
import PDF.Output (ObjectId)
|
||||
import Prelude hiding (fail)
|
||||
import Text.Printf (printf)
|
||||
|
||||
type PDFContent m = (Alternative m, MonadReader Content m, MonadFail m)
|
||||
type PDFContent m = (Alternative m, MonadReader Layer m, MonadFail m)
|
||||
newtype Error a = Error {
|
||||
runError :: Either String a
|
||||
} deriving (Alternative, Functor, Applicative, Monad, MonadPlus)
|
||||
|
|
|
@ -18,8 +18,9 @@ import qualified PDF.Content as Content (parse)
|
|||
import PDF.Content.Text (renderText)
|
||||
import PDF.Encoding (encoding)
|
||||
import PDF.Font (Font, FontSet)
|
||||
import PDF.Layer (Layer(..))
|
||||
import PDF.Object (
|
||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Name(..), Object(..)
|
||||
,)
|
||||
import PDF.Object.Navigation (
|
||||
|
@ -31,7 +32,7 @@ import Prelude hiding (fail)
|
|||
import Text.Printf (printf)
|
||||
|
||||
type CachedFonts = Map ObjectId Font
|
||||
type T = RWST Content () CachedFonts Error
|
||||
type T = RWST Layer () CachedFonts Error
|
||||
data Page = Page {
|
||||
contents :: [Text]
|
||||
, source :: ObjectId
|
||||
|
@ -107,13 +108,13 @@ loadPage source = do
|
|||
contents <- extractText =<< objectById source
|
||||
return $ Page {contents, source}
|
||||
|
||||
getAll :: Content -> Either String (Map Int Page)
|
||||
getAll :: Layer -> Either String (Map Int Page)
|
||||
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
|
||||
where
|
||||
numbered = Map.fromList . zip [1..]
|
||||
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
||||
|
||||
get :: Content -> Int -> Either String Page
|
||||
get :: Layer -> Int -> Either String Page
|
||||
get content pageNumber
|
||||
| pageNumber < 1 = Left "Pages start at 1"
|
||||
| otherwise = runError $ fst <$> evalRWST getPage content Map.empty
|
||||
|
|
|
@ -1,47 +0,0 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module PDF.Update (
|
||||
unify
|
||||
) where
|
||||
|
||||
import Data.Map (member)
|
||||
import qualified Data.Map as Map (empty, union)
|
||||
import PDF.Object (
|
||||
Content(..), IndexedObjects, IndirectObjCoordinates(..), Occurrence(..)
|
||||
, Structure(..)
|
||||
)
|
||||
|
||||
emptyContent :: Content
|
||||
emptyContent = Content {
|
||||
docStructure = Structure {xRef = Map.empty, trailer = Map.empty}
|
||||
, objects = Map.empty
|
||||
, occurrences = []
|
||||
}
|
||||
|
||||
unify :: [Content] -> Content
|
||||
unify = foldl complete emptyContent
|
||||
where
|
||||
complete tmpContent older =
|
||||
let mergedObjects = Map.union (objects tmpContent) (objects older) in
|
||||
Content {
|
||||
docStructure =
|
||||
unifyDocStructure (docStructure tmpContent) (docStructure older)
|
||||
, objects = mergedObjects
|
||||
, occurrences =
|
||||
unifyOccurrences mergedObjects (occurrences tmpContent) (occurrences older)
|
||||
}
|
||||
|
||||
unifyDocStructure :: Structure -> Structure -> Structure
|
||||
unifyDocStructure update original = Structure {
|
||||
xRef = Map.union (xRef update) (xRef original)
|
||||
, trailer = Map.union (trailer update) (trailer original)
|
||||
}
|
||||
|
||||
unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence]
|
||||
unifyOccurrences objects update = foldr addOlder update
|
||||
where
|
||||
addOlder occurrence@(Comment _) existing = occurrence : existing
|
||||
addOlder occurrence@(Indirect indirect) existing =
|
||||
if objectId indirect `member` objects
|
||||
then occurrence : existing
|
||||
else existing
|
||||
|
Loading…
Reference in a new issue