Implement retrieving objects in the body of the document and use it to populate the structure previously parsed

This commit is contained in:
Tissevert 2019-05-14 18:42:11 +02:00
parent 8043f84da8
commit 91292d6401
4 changed files with 169 additions and 65 deletions

View file

@ -17,6 +17,7 @@ cabal-version: >=1.10
library library
exposed-modules: PDF exposed-modules: PDF
, PDF.Body
, PDF.Object , PDF.Object
, Data.ByteString.Lazy.Char8.Util , Data.ByteString.Lazy.Char8.Util
other-modules: other-modules:

View file

@ -13,6 +13,7 @@ import Data.ByteString.Lazy.Char8.Util (previous, subBS)
import Data.Int (Int64) import Data.Int (Int64)
import qualified Data.Map as Map (lookup) import qualified Data.Map as Map (lookup)
import PDF.Object (Content(..), DirectObject(..), EOLStyle(..), content, eol, eolCharset) import PDF.Object (Content(..), DirectObject(..), EOLStyle(..), content, eol, eolCharset)
import PDF.Body (populate)
import Text.Parsec import Text.Parsec
import Text.Parsec.ByteString.Lazy (Parser) import Text.Parsec.ByteString.Lazy (Parser)
import Text.Parsec.Pos (newPos) import Text.Parsec.Pos (newPos)
@ -60,7 +61,8 @@ parseDocument :: ByteString -> Either ParseError Document
parseDocument input = do parseDocument input = do
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input (pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
startXref <- readStartXref eolStyle input startXref <- readStartXref eolStyle input
contents <- iterateContents startXref input structures <- iterateContents startXref input
let contents = populate input <$> structures
return $ Document {pdfVersion, contents} return $ Document {pdfVersion, contents}
iterateContents :: Int64 -> ByteString -> Either ParseError [Content] iterateContents :: Int64 -> ByteString -> Either ParseError [Content]

119
src/PDF/Body.hs Normal file
View file

@ -0,0 +1,119 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.Body (
populate
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
import Data.Functor.Identity (Identity)
import Data.Int (Int64)
import qualified Data.Map as Map (insert, lookup)
import PDF.Object (
Content(..), DirectObject(..), IndirectObjCoordinates(..), Object(..)
, Occurrence(..), XRefEntry(..), XRefSection, XRefSubSection(..)
, eol, eolCharset, dictionary, directObject, integer, line
)
import Text.Parsec
data UserState = UserState {
input :: ByteString
, content :: Content
}
type SParser = ParsecT ByteString UserState Identity
modifyContent :: (Content -> Content) -> SParser ()
modifyContent f = modifyState $ \state -> state {content = f $ content state}
addObject :: Int -> Object -> SParser ()
addObject objectId newObject = modifyContent $ \content -> content {
objects = Map.insert objectId newObject $ objects content
}
pushOccurrence :: Occurrence -> SParser ()
pushOccurrence newOccurrence = modifyContent $ \content -> content {
body = newOccurrence : (body content)
}
comment :: SParser String
comment = char '%' *> many (noneOf eolCharset) <* eol
lookupOffset :: Int -> XRefSection -> Maybe Int64
lookupOffset _ [] = Nothing
lookupOffset objectId (xrefSubSection:others) =
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
let index = objectId - firstObjectId in
if index >= 0 && index < entriesNumber
then
case Map.lookup index entries of
Just (InUse {offset}) -> Just offset
_ -> Nothing
else lookupOffset objectId others
getOffset :: Int -> SParser Int64
getOffset objectId = do
Content {xrefSection} <- content <$> getState
case lookupOffset objectId xrefSection of
Nothing -> fail $
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
Just offset -> return offset
loadNumber :: Int -> SParser Float
loadNumber objectId = do
offset <- getOffset objectId
state <- getState
let objectStart = BS.drop offset (input state)
case runParser object state ("object@" ++ show offset) objectStart of
Left _ -> fail $ "Could not parse length object@" ++ show offset
Right number@(Direct (Number n)) ->
addObject objectId number >> return n
Right obj -> fail $
"Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
invalidValue :: Object -> String
invalidValue v = "Invalid value " ++ show v
getSize :: Maybe DirectObject -> SParser Float
getSize Nothing = fail "Missing '/Length' key on stream"
getSize (Just (Number size)) = return size
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
Content {objects} <- content <$> getState
case Map.lookup objectId objects of
Nothing -> loadNumber objectId
Just (Direct (Number size)) -> return size
Just v -> fail $
invalidValue v ++ " for obj " ++ show objectId ++ "used as /Length"
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
object :: SParser Object
object =
Direct <$> directObject
<|> do
header <- dictionary
size <- getSize (Map.lookup "Length" header)
streamContent <- BS.pack <$> stream (truncate size)
return $ Stream {header, streamContent}
where
stream size = line "stream" *> count size anyChar <* eol <* line "endstream"
occurrence :: SParser Occurrence
occurrence = Comment <$> comment <|> indirectObj
where
indirectObj = do
objectId <- integer
coordinates <- IndirectObjCoordinates objectId <$> integer
objectValue <- line "obj" *> object <* eol <* line "endobj"
addObject objectId objectValue
return $ Indirect coordinates
populate :: ByteString -> Content -> Content
populate input initialContent =
case runParser recurseOnOccurrences initialState "" input of
Left _ -> initialContent
Right finalState -> content finalState
where
initialState = UserState {input, content = initialContent}
recurseOnOccurrences :: SParser UserState
recurseOnOccurrences =
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState

View file

@ -3,23 +3,29 @@ module PDF.Object (
Content(..) Content(..)
, DirectObject(..) , DirectObject(..)
, EOLStyle(..) , EOLStyle(..)
, IndirectObjCoordinates(..)
, Object(..) , Object(..)
, Occurrence(..) , Occurrence(..)
, XRefEntry(..) , XRefEntry(..)
, XRefSection
, XRefSubSection(..) , XRefSubSection(..)
, content , content
, dictionary
, directObject
, eol , eol
, eolCharset , eolCharset
, occurrence , integer
, line
) where ) where
import Data.ByteString.Lazy.Char8 (ByteString) import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (pack)
import Data.Int (Int64) import Data.Int (Int64)
import Data.Map (Map) import Data.Map (Map)
import qualified Data.Map as Map (fromList, lookup) import qualified Data.Map as Map (empty, fromList)
import Text.Parsec import Text.Parsec
import Text.Parsec.ByteString.Lazy (Parser) --import Text.Parsec.ByteString.Lazy (Parser)
type Parser u = Parsec ByteString u
data EOLStyle = CR | LF | CRLF data EOLStyle = CR | LF | CRLF
@ -35,7 +41,7 @@ data DirectObject =
| Array [DirectObject] | Array [DirectObject]
| Dictionary Dictionary | Dictionary Dictionary
| Null | Null
| Reference (Int, Int) | Reference IndirectObjCoordinates
deriving Show deriving Show
data Object = data Object =
@ -46,14 +52,12 @@ data Object =
} }
deriving Show deriving Show
data Occurrence = data IndirectObjCoordinates = IndirectObjCoordinates {
Comment String objectId :: Int
| Indirect { , versionNumber :: Int
objId :: Int } deriving Show
, versionNumber :: Int
, objectContent :: Object data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
}
deriving Show
data XRefEntry = InUse { data XRefEntry = InUse {
offset :: Int64 offset :: Int64
@ -65,12 +69,16 @@ data XRefEntry = InUse {
data XRefSubSection = XRefSubSection { data XRefSubSection = XRefSubSection {
firstObjectId :: Int firstObjectId :: Int
, entriesNumber :: Int
, entries :: Map Int XRefEntry , entries :: Map Int XRefEntry
} deriving Show } deriving Show
type XRefSection = [XRefSubSection]
data Content = Content { data Content = Content {
body :: [Occurrence] body :: [Occurrence]
, xrefSection :: [XRefSubSection] , objects :: Map Int Object
, xrefSection :: XRefSection
, trailer :: Dictionary , trailer :: Dictionary
, startXrefPosition :: Int64 , startXrefPosition :: Int64
} deriving Show } deriving Show
@ -78,39 +86,39 @@ data Content = Content {
eolCharset :: String eolCharset :: String
eolCharset = "\r\n" eolCharset = "\r\n"
eol :: Parser EOLStyle eol :: Parser u EOLStyle
eol = eol =
try (string "\r\n" >> return CRLF) try (string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR) <|> (string "\r" >> return CR)
<|> (string "\n" >> return LF) <|> (string "\n" >> return LF)
line :: String -> Parser () line :: String -> Parser u ()
line l = string l *> eol *> return () line l = string l *> eol *> return ()
whiteSpaceCharset :: String whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 " whiteSpaceCharset = "\0\t\12 "
whiteSpace :: Parser () whiteSpace :: Parser u ()
whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return () whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return ()
blank :: Parser () blank :: Parser u ()
blank = skipMany whiteSpace blank = skipMany whiteSpace
delimiterCharset :: String delimiterCharset :: String
delimiterCharset = "()<>[]{}/%" delimiterCharset = "()<>[]{}/%"
{- {-
delimiter :: Parser Char delimiter :: Parser u Char
delimiter = oneOf delimiterCharset delimiter = oneOf delimiterCharset
-} -}
regular :: Parser Char regular :: Parser u Char
regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset
integer :: (Read a, Num a) => Parser a integer :: (Read a, Num a) => Parser u a
integer = read <$> many1 digit <* whiteSpace integer = read <$> many1 digit <* whiteSpace
directObject :: Parser DirectObject directObject :: Parser u DirectObject
directObject = directObject =
Boolean <$> try boolean Boolean <$> try boolean
<|> Reference <$> try reference {- defined before Number because Number is a prefix of it -} <|> Reference <$> try reference {- defined before Number because Number is a prefix of it -}
@ -121,17 +129,17 @@ directObject =
<|> Dictionary <$> try dictionary <|> Dictionary <$> try dictionary
<|> const Null <$> try nullObject <|> const Null <$> try nullObject
boolean :: Parser Bool boolean :: Parser u Bool
boolean = (string "true" *> return True) <|> (string "false" *> return False) boolean = (string "true" *> return True) <|> (string "false" *> return False)
number :: Parser Float number :: Parser u Float
number = read <$> (mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart)) number = read <$> (mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart))
where where
sign = string "-" <|> option "" (char '+' >> return "") sign = string "-" <|> option "" (char '+' >> return "")
integerPart = mappend <$> many1 digit <*> option "" floatPart integerPart = mappend <$> many1 digit <*> option "" floatPart
floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit) floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit)
stringObj :: Parser StringObj stringObj :: Parser u StringObj
stringObj = stringObj =
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')') Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>') <|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
@ -142,70 +150,44 @@ stringObj =
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode) escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> count n octDigit) <$> [1..3] octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
name :: Parser String name :: Parser u String
name = char '/' *> many regular name = char '/' *> many regular
array :: Parser [DirectObject] array :: Parser u [DirectObject]
array = char '[' *> blank *> directObject `endBy` blank <* char ']' array = char '[' *> blank *> directObject `endBy` blank <* char ']'
dictionary :: Parser Dictionary dictionary :: Parser u Dictionary
dictionary = dictionary =
try (string "<<" *> blank *> keyValPairs <* string ">>") try (string "<<" *> blank *> keyValPairs <* string ">>")
where where
keyVal = (,) <$> name <* blank <*> directObject keyVal = (,) <$> name <* blank <*> directObject
keyValPairs = Map.fromList <$> keyVal `endBy` blank keyValPairs = Map.fromList <$> keyVal `endBy` blank
nullObject :: Parser () nullObject :: Parser u ()
nullObject = string "null" *> return () nullObject = string "null" *> return ()
comment :: Parser String reference :: Parser u IndirectObjCoordinates
comment = char '%' *> many (noneOf eolCharset) <* eol reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R'
reference :: Parser (Int, Int) entry :: Parser u XRefEntry
reference = (,) <$> integer <*> integer <* char 'R'
object :: Parser Object
object =
Direct <$> directObject
<|> do
header <- dictionary
streamContent <- BS.pack <$> stream (Map.lookup "Length" header)
return $ Stream header streamContent
where
stream value =
case value of
Nothing -> fail "Missing 'Length' key on stream"
Just (Number size) ->
line "stream" *> count (truncate size) anyChar <* eol <* line "endstream"
_ -> fail "Expected number"
occurrence :: Parser Occurrence
occurrence = Comment <$> comment <|> indirectObj
where
indirectObj =
Indirect <$> integer <*> integer <*> (line "obj"
*> object
<* eol <* line "endobj")
entry :: Parser XRefEntry
entry = do entry = do
(big, small) <- (,) <$> integer <*> integer (big, small) <- (,) <$> integer <*> integer
(inUse big small <|> free big small) <* blank (inUse big small <|> free big small) <* blank
where where
inUse :: Int64 -> Int -> Parser XRefEntry inUse :: Int64 -> Int -> Parser u XRefEntry
inUse offset generation = char 'n' *> return (InUse {offset, generation}) inUse offset generation = char 'n' *> return (InUse {offset, generation})
free :: Int64 -> Int -> Parser XRefEntry free :: Int64 -> Int -> Parser u XRefEntry
free nextFree generation = char 'f' *> return (Free {nextFree, generation}) free nextFree generation = char 'f' *> return (Free {nextFree, generation})
xrefSubSection :: Parser XRefSubSection xrefSubSection :: Parser u XRefSubSection
xrefSubSection = do xrefSubSection = do
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer (firstObjectId, entriesNumber) <- (,) <$> integer <*> integer
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
return $ XRefSubSection {firstObjectId, entries} return $ XRefSubSection {firstObjectId, entriesNumber, entries}
content :: Parser Content content :: Parser u Content
content = content =
Content [] Content [] Map.empty
<$> (line "xref" *> xrefSubSection `sepBy` eol) <$> (line "xref" *> xrefSubSection `sepBy` eol)
<*> (line "trailer" *> dictionary <* eol) <*> (line "trailer" *> dictionary <* eol)
<*> (line "startxref" *> integer) <*> (line "startxref" *> integer)