Implement retrieving objects in the body of the document and use it to populate the structure previously parsed

This commit is contained in:
Tissevert 2019-05-14 18:42:11 +02:00
parent 8043f84da8
commit 91292d6401
4 changed files with 169 additions and 65 deletions

View file

@ -17,6 +17,7 @@ cabal-version: >=1.10
library
exposed-modules: PDF
, PDF.Body
, PDF.Object
, Data.ByteString.Lazy.Char8.Util
other-modules:

View file

@ -13,6 +13,7 @@ import Data.ByteString.Lazy.Char8.Util (previous, subBS)
import Data.Int (Int64)
import qualified Data.Map as Map (lookup)
import PDF.Object (Content(..), DirectObject(..), EOLStyle(..), content, eol, eolCharset)
import PDF.Body (populate)
import Text.Parsec
import Text.Parsec.ByteString.Lazy (Parser)
import Text.Parsec.Pos (newPos)
@ -60,7 +61,8 @@ parseDocument :: ByteString -> Either ParseError Document
parseDocument input = do
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
startXref <- readStartXref eolStyle input
contents <- iterateContents startXref input
structures <- iterateContents startXref input
let contents = populate input <$> structures
return $ Document {pdfVersion, contents}
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]

119
src/PDF/Body.hs Normal file
View file

@ -0,0 +1,119 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.Body (
populate
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
import Data.Functor.Identity (Identity)
import Data.Int (Int64)
import qualified Data.Map as Map (insert, lookup)
import PDF.Object (
Content(..), DirectObject(..), IndirectObjCoordinates(..), Object(..)
, Occurrence(..), XRefEntry(..), XRefSection, XRefSubSection(..)
, eol, eolCharset, dictionary, directObject, integer, line
)
import Text.Parsec
data UserState = UserState {
input :: ByteString
, content :: Content
}
type SParser = ParsecT ByteString UserState Identity
modifyContent :: (Content -> Content) -> SParser ()
modifyContent f = modifyState $ \state -> state {content = f $ content state}
addObject :: Int -> Object -> SParser ()
addObject objectId newObject = modifyContent $ \content -> content {
objects = Map.insert objectId newObject $ objects content
}
pushOccurrence :: Occurrence -> SParser ()
pushOccurrence newOccurrence = modifyContent $ \content -> content {
body = newOccurrence : (body content)
}
comment :: SParser String
comment = char '%' *> many (noneOf eolCharset) <* eol
lookupOffset :: Int -> XRefSection -> Maybe Int64
lookupOffset _ [] = Nothing
lookupOffset objectId (xrefSubSection:others) =
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
let index = objectId - firstObjectId in
if index >= 0 && index < entriesNumber
then
case Map.lookup index entries of
Just (InUse {offset}) -> Just offset
_ -> Nothing
else lookupOffset objectId others
getOffset :: Int -> SParser Int64
getOffset objectId = do
Content {xrefSection} <- content <$> getState
case lookupOffset objectId xrefSection of
Nothing -> fail $
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
Just offset -> return offset
loadNumber :: Int -> SParser Float
loadNumber objectId = do
offset <- getOffset objectId
state <- getState
let objectStart = BS.drop offset (input state)
case runParser object state ("object@" ++ show offset) objectStart of
Left _ -> fail $ "Could not parse length object@" ++ show offset
Right number@(Direct (Number n)) ->
addObject objectId number >> return n
Right obj -> fail $
"Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
invalidValue :: Object -> String
invalidValue v = "Invalid value " ++ show v
getSize :: Maybe DirectObject -> SParser Float
getSize Nothing = fail "Missing '/Length' key on stream"
getSize (Just (Number size)) = return size
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
Content {objects} <- content <$> getState
case Map.lookup objectId objects of
Nothing -> loadNumber objectId
Just (Direct (Number size)) -> return size
Just v -> fail $
invalidValue v ++ " for obj " ++ show objectId ++ "used as /Length"
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
object :: SParser Object
object =
Direct <$> directObject
<|> do
header <- dictionary
size <- getSize (Map.lookup "Length" header)
streamContent <- BS.pack <$> stream (truncate size)
return $ Stream {header, streamContent}
where
stream size = line "stream" *> count size anyChar <* eol <* line "endstream"
occurrence :: SParser Occurrence
occurrence = Comment <$> comment <|> indirectObj
where
indirectObj = do
objectId <- integer
coordinates <- IndirectObjCoordinates objectId <$> integer
objectValue <- line "obj" *> object <* eol <* line "endobj"
addObject objectId objectValue
return $ Indirect coordinates
populate :: ByteString -> Content -> Content
populate input initialContent =
case runParser recurseOnOccurrences initialState "" input of
Left _ -> initialContent
Right finalState -> content finalState
where
initialState = UserState {input, content = initialContent}
recurseOnOccurrences :: SParser UserState
recurseOnOccurrences =
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState

View file

@ -3,23 +3,29 @@ module PDF.Object (
Content(..)
, DirectObject(..)
, EOLStyle(..)
, IndirectObjCoordinates(..)
, Object(..)
, Occurrence(..)
, XRefEntry(..)
, XRefSection
, XRefSubSection(..)
, content
, dictionary
, directObject
, eol
, eolCharset
, occurrence
, integer
, line
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (pack)
import Data.Int (Int64)
import Data.Map (Map)
import qualified Data.Map as Map (fromList, lookup)
import qualified Data.Map as Map (empty, fromList)
import Text.Parsec
import Text.Parsec.ByteString.Lazy (Parser)
--import Text.Parsec.ByteString.Lazy (Parser)
type Parser u = Parsec ByteString u
data EOLStyle = CR | LF | CRLF
@ -35,7 +41,7 @@ data DirectObject =
| Array [DirectObject]
| Dictionary Dictionary
| Null
| Reference (Int, Int)
| Reference IndirectObjCoordinates
deriving Show
data Object =
@ -46,14 +52,12 @@ data Object =
}
deriving Show
data Occurrence =
Comment String
| Indirect {
objId :: Int
, versionNumber :: Int
, objectContent :: Object
}
deriving Show
data IndirectObjCoordinates = IndirectObjCoordinates {
objectId :: Int
, versionNumber :: Int
} deriving Show
data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
data XRefEntry = InUse {
offset :: Int64
@ -65,12 +69,16 @@ data XRefEntry = InUse {
data XRefSubSection = XRefSubSection {
firstObjectId :: Int
, entriesNumber :: Int
, entries :: Map Int XRefEntry
} deriving Show
type XRefSection = [XRefSubSection]
data Content = Content {
body :: [Occurrence]
, xrefSection :: [XRefSubSection]
, objects :: Map Int Object
, xrefSection :: XRefSection
, trailer :: Dictionary
, startXrefPosition :: Int64
} deriving Show
@ -78,39 +86,39 @@ data Content = Content {
eolCharset :: String
eolCharset = "\r\n"
eol :: Parser EOLStyle
eol :: Parser u EOLStyle
eol =
try (string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR)
<|> (string "\n" >> return LF)
line :: String -> Parser ()
line :: String -> Parser u ()
line l = string l *> eol *> return ()
whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 "
whiteSpace :: Parser ()
whiteSpace :: Parser u ()
whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return ()
blank :: Parser ()
blank :: Parser u ()
blank = skipMany whiteSpace
delimiterCharset :: String
delimiterCharset = "()<>[]{}/%"
{-
delimiter :: Parser Char
delimiter :: Parser u Char
delimiter = oneOf delimiterCharset
-}
regular :: Parser Char
regular :: Parser u Char
regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset
integer :: (Read a, Num a) => Parser a
integer :: (Read a, Num a) => Parser u a
integer = read <$> many1 digit <* whiteSpace
directObject :: Parser DirectObject
directObject :: Parser u DirectObject
directObject =
Boolean <$> try boolean
<|> Reference <$> try reference {- defined before Number because Number is a prefix of it -}
@ -121,17 +129,17 @@ directObject =
<|> Dictionary <$> try dictionary
<|> const Null <$> try nullObject
boolean :: Parser Bool
boolean :: Parser u Bool
boolean = (string "true" *> return True) <|> (string "false" *> return False)
number :: Parser Float
number :: Parser u Float
number = read <$> (mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart))
where
sign = string "-" <|> option "" (char '+' >> return "")
integerPart = mappend <$> many1 digit <*> option "" floatPart
floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit)
stringObj :: Parser StringObj
stringObj :: Parser u StringObj
stringObj =
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
@ -142,70 +150,44 @@ stringObj =
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
name :: Parser String
name :: Parser u String
name = char '/' *> many regular
array :: Parser [DirectObject]
array :: Parser u [DirectObject]
array = char '[' *> blank *> directObject `endBy` blank <* char ']'
dictionary :: Parser Dictionary
dictionary :: Parser u Dictionary
dictionary =
try (string "<<" *> blank *> keyValPairs <* string ">>")
where
keyVal = (,) <$> name <* blank <*> directObject
keyValPairs = Map.fromList <$> keyVal `endBy` blank
nullObject :: Parser ()
nullObject :: Parser u ()
nullObject = string "null" *> return ()
comment :: Parser String
comment = char '%' *> many (noneOf eolCharset) <* eol
reference :: Parser u IndirectObjCoordinates
reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R'
reference :: Parser (Int, Int)
reference = (,) <$> integer <*> integer <* char 'R'
object :: Parser Object
object =
Direct <$> directObject
<|> do
header <- dictionary
streamContent <- BS.pack <$> stream (Map.lookup "Length" header)
return $ Stream header streamContent
where
stream value =
case value of
Nothing -> fail "Missing 'Length' key on stream"
Just (Number size) ->
line "stream" *> count (truncate size) anyChar <* eol <* line "endstream"
_ -> fail "Expected number"
occurrence :: Parser Occurrence
occurrence = Comment <$> comment <|> indirectObj
where
indirectObj =
Indirect <$> integer <*> integer <*> (line "obj"
*> object
<* eol <* line "endobj")
entry :: Parser XRefEntry
entry :: Parser u XRefEntry
entry = do
(big, small) <- (,) <$> integer <*> integer
(inUse big small <|> free big small) <* blank
where
inUse :: Int64 -> Int -> Parser XRefEntry
inUse :: Int64 -> Int -> Parser u XRefEntry
inUse offset generation = char 'n' *> return (InUse {offset, generation})
free :: Int64 -> Int -> Parser XRefEntry
free :: Int64 -> Int -> Parser u XRefEntry
free nextFree generation = char 'f' *> return (Free {nextFree, generation})
xrefSubSection :: Parser XRefSubSection
xrefSubSection :: Parser u XRefSubSection
xrefSubSection = do
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
return $ XRefSubSection {firstObjectId, entries}
return $ XRefSubSection {firstObjectId, entriesNumber, entries}
content :: Parser Content
content :: Parser u Content
content =
Content []
Content [] Map.empty
<$> (line "xref" *> xrefSubSection `sepBy` eol)
<*> (line "trailer" *> dictionary <* eol)
<*> (line "startxref" *> integer)