From 91292d640193d4ee89a4373bc849c5cbc846143a Mon Sep 17 00:00:00 2001 From: Tissevert Date: Tue, 14 May 2019 18:42:11 +0200 Subject: [PATCH] Implement retrieving objects in the body of the document and use it to populate the structure previously parsed --- hufflepdf.cabal | 1 + src/PDF.hs | 4 +- src/PDF/Body.hs | 119 ++++++++++++++++++++++++++++++++++++++++++++++ src/PDF/Object.hs | 110 ++++++++++++++++++------------------------ 4 files changed, 169 insertions(+), 65 deletions(-) create mode 100644 src/PDF/Body.hs diff --git a/hufflepdf.cabal b/hufflepdf.cabal index c7f4485..9685fc0 100644 --- a/hufflepdf.cabal +++ b/hufflepdf.cabal @@ -17,6 +17,7 @@ cabal-version: >=1.10 library exposed-modules: PDF + , PDF.Body , PDF.Object , Data.ByteString.Lazy.Char8.Util other-modules: diff --git a/src/PDF.hs b/src/PDF.hs index f28847e..57c90a3 100644 --- a/src/PDF.hs +++ b/src/PDF.hs @@ -13,6 +13,7 @@ import Data.ByteString.Lazy.Char8.Util (previous, subBS) import Data.Int (Int64) import qualified Data.Map as Map (lookup) import PDF.Object (Content(..), DirectObject(..), EOLStyle(..), content, eol, eolCharset) +import PDF.Body (populate) import Text.Parsec import Text.Parsec.ByteString.Lazy (Parser) import Text.Parsec.Pos (newPos) @@ -60,7 +61,8 @@ parseDocument :: ByteString -> Either ParseError Document parseDocument input = do (pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input startXref <- readStartXref eolStyle input - contents <- iterateContents startXref input + structures <- iterateContents startXref input + let contents = populate input <$> structures return $ Document {pdfVersion, contents} iterateContents :: Int64 -> ByteString -> Either ParseError [Content] diff --git a/src/PDF/Body.hs b/src/PDF/Body.hs new file mode 100644 index 0000000..77bf8e2 --- /dev/null +++ b/src/PDF/Body.hs @@ -0,0 +1,119 @@ +{-# LANGUAGE NamedFieldPuns #-} +module PDF.Body ( + populate + ) where + +import Data.ByteString.Lazy.Char8 (ByteString) +import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack) +import Data.Functor.Identity (Identity) +import Data.Int (Int64) +import qualified Data.Map as Map (insert, lookup) +import PDF.Object ( + Content(..), DirectObject(..), IndirectObjCoordinates(..), Object(..) + , Occurrence(..), XRefEntry(..), XRefSection, XRefSubSection(..) + , eol, eolCharset, dictionary, directObject, integer, line + ) +import Text.Parsec + +data UserState = UserState { + input :: ByteString + , content :: Content + } + +type SParser = ParsecT ByteString UserState Identity + +modifyContent :: (Content -> Content) -> SParser () +modifyContent f = modifyState $ \state -> state {content = f $ content state} + +addObject :: Int -> Object -> SParser () +addObject objectId newObject = modifyContent $ \content -> content { + objects = Map.insert objectId newObject $ objects content + } + +pushOccurrence :: Occurrence -> SParser () +pushOccurrence newOccurrence = modifyContent $ \content -> content { + body = newOccurrence : (body content) + } + +comment :: SParser String +comment = char '%' *> many (noneOf eolCharset) <* eol + +lookupOffset :: Int -> XRefSection -> Maybe Int64 +lookupOffset _ [] = Nothing +lookupOffset objectId (xrefSubSection:others) = + let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in + let index = objectId - firstObjectId in + if index >= 0 && index < entriesNumber + then + case Map.lookup index entries of + Just (InUse {offset}) -> Just offset + _ -> Nothing + else lookupOffset objectId others + +getOffset :: Int -> SParser Int64 +getOffset objectId = do + Content {xrefSection} <- content <$> getState + case lookupOffset objectId xrefSection of + Nothing -> fail $ + "obj " ++ show objectId ++ " is referenced but missing in XRef table" + Just offset -> return offset + +loadNumber :: Int -> SParser Float +loadNumber objectId = do + offset <- getOffset objectId + state <- getState + let objectStart = BS.drop offset (input state) + case runParser object state ("object@" ++ show offset) objectStart of + Left _ -> fail $ "Could not parse length object@" ++ show offset + Right number@(Direct (Number n)) -> + addObject objectId number >> return n + Right obj -> fail $ + "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number" + +invalidValue :: Object -> String +invalidValue v = "Invalid value " ++ show v + +getSize :: Maybe DirectObject -> SParser Float +getSize Nothing = fail "Missing '/Length' key on stream" +getSize (Just (Number size)) = return size +getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do + Content {objects} <- content <$> getState + case Map.lookup objectId objects of + Nothing -> loadNumber objectId + Just (Direct (Number size)) -> return size + Just v -> fail $ + invalidValue v ++ " for obj " ++ show objectId ++ "used as /Length" +getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length" + +object :: SParser Object +object = + Direct <$> directObject + <|> do + header <- dictionary + size <- getSize (Map.lookup "Length" header) + streamContent <- BS.pack <$> stream (truncate size) + return $ Stream {header, streamContent} + where + stream size = line "stream" *> count size anyChar <* eol <* line "endstream" + +occurrence :: SParser Occurrence +occurrence = Comment <$> comment <|> indirectObj + where + indirectObj = do + objectId <- integer + coordinates <- IndirectObjCoordinates objectId <$> integer + objectValue <- line "obj" *> object <* eol <* line "endobj" + addObject objectId objectValue + return $ Indirect coordinates + +populate :: ByteString -> Content -> Content +populate input initialContent = + case runParser recurseOnOccurrences initialState "" input of + Left _ -> initialContent + Right finalState -> content finalState + where + initialState = UserState {input, content = initialContent} + +recurseOnOccurrences :: SParser UserState +recurseOnOccurrences = + (occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index aea9297..9de40c5 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -3,23 +3,29 @@ module PDF.Object ( Content(..) , DirectObject(..) , EOLStyle(..) + , IndirectObjCoordinates(..) , Object(..) , Occurrence(..) , XRefEntry(..) + , XRefSection , XRefSubSection(..) , content + , dictionary + , directObject , eol , eolCharset - , occurrence + , integer + , line ) where import Data.ByteString.Lazy.Char8 (ByteString) -import qualified Data.ByteString.Lazy.Char8 as BS (pack) import Data.Int (Int64) import Data.Map (Map) -import qualified Data.Map as Map (fromList, lookup) +import qualified Data.Map as Map (empty, fromList) import Text.Parsec -import Text.Parsec.ByteString.Lazy (Parser) +--import Text.Parsec.ByteString.Lazy (Parser) + +type Parser u = Parsec ByteString u data EOLStyle = CR | LF | CRLF @@ -35,7 +41,7 @@ data DirectObject = | Array [DirectObject] | Dictionary Dictionary | Null - | Reference (Int, Int) + | Reference IndirectObjCoordinates deriving Show data Object = @@ -46,14 +52,12 @@ data Object = } deriving Show -data Occurrence = - Comment String - | Indirect { - objId :: Int - , versionNumber :: Int - , objectContent :: Object - } - deriving Show +data IndirectObjCoordinates = IndirectObjCoordinates { + objectId :: Int + , versionNumber :: Int + } deriving Show + +data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show data XRefEntry = InUse { offset :: Int64 @@ -65,12 +69,16 @@ data XRefEntry = InUse { data XRefSubSection = XRefSubSection { firstObjectId :: Int + , entriesNumber :: Int , entries :: Map Int XRefEntry } deriving Show +type XRefSection = [XRefSubSection] + data Content = Content { body :: [Occurrence] - , xrefSection :: [XRefSubSection] + , objects :: Map Int Object + , xrefSection :: XRefSection , trailer :: Dictionary , startXrefPosition :: Int64 } deriving Show @@ -78,39 +86,39 @@ data Content = Content { eolCharset :: String eolCharset = "\r\n" -eol :: Parser EOLStyle +eol :: Parser u EOLStyle eol = try (string "\r\n" >> return CRLF) <|> (string "\r" >> return CR) <|> (string "\n" >> return LF) -line :: String -> Parser () +line :: String -> Parser u () line l = string l *> eol *> return () whiteSpaceCharset :: String whiteSpaceCharset = "\0\t\12 " -whiteSpace :: Parser () +whiteSpace :: Parser u () whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return () -blank :: Parser () +blank :: Parser u () blank = skipMany whiteSpace delimiterCharset :: String delimiterCharset = "()<>[]{}/%" {- -delimiter :: Parser Char +delimiter :: Parser u Char delimiter = oneOf delimiterCharset -} -regular :: Parser Char +regular :: Parser u Char regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset -integer :: (Read a, Num a) => Parser a +integer :: (Read a, Num a) => Parser u a integer = read <$> many1 digit <* whiteSpace -directObject :: Parser DirectObject +directObject :: Parser u DirectObject directObject = Boolean <$> try boolean <|> Reference <$> try reference {- defined before Number because Number is a prefix of it -} @@ -121,17 +129,17 @@ directObject = <|> Dictionary <$> try dictionary <|> const Null <$> try nullObject -boolean :: Parser Bool +boolean :: Parser u Bool boolean = (string "true" *> return True) <|> (string "false" *> return False) -number :: Parser Float +number :: Parser u Float number = read <$> (mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart)) where sign = string "-" <|> option "" (char '+' >> return "") integerPart = mappend <$> many1 digit <*> option "" floatPart floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit) -stringObj :: Parser StringObj +stringObj :: Parser u StringObj stringObj = Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')') <|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>') @@ -142,70 +150,44 @@ stringObj = escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode) octalCode = choice $ (\n -> count n octDigit) <$> [1..3] -name :: Parser String +name :: Parser u String name = char '/' *> many regular -array :: Parser [DirectObject] +array :: Parser u [DirectObject] array = char '[' *> blank *> directObject `endBy` blank <* char ']' -dictionary :: Parser Dictionary +dictionary :: Parser u Dictionary dictionary = try (string "<<" *> blank *> keyValPairs <* string ">>") where keyVal = (,) <$> name <* blank <*> directObject keyValPairs = Map.fromList <$> keyVal `endBy` blank -nullObject :: Parser () +nullObject :: Parser u () nullObject = string "null" *> return () -comment :: Parser String -comment = char '%' *> many (noneOf eolCharset) <* eol +reference :: Parser u IndirectObjCoordinates +reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R' -reference :: Parser (Int, Int) -reference = (,) <$> integer <*> integer <* char 'R' - -object :: Parser Object -object = - Direct <$> directObject - <|> do - header <- dictionary - streamContent <- BS.pack <$> stream (Map.lookup "Length" header) - return $ Stream header streamContent - where - stream value = - case value of - Nothing -> fail "Missing 'Length' key on stream" - Just (Number size) -> - line "stream" *> count (truncate size) anyChar <* eol <* line "endstream" - _ -> fail "Expected number" - -occurrence :: Parser Occurrence -occurrence = Comment <$> comment <|> indirectObj - where - indirectObj = - Indirect <$> integer <*> integer <*> (line "obj" - *> object - <* eol <* line "endobj") - -entry :: Parser XRefEntry +entry :: Parser u XRefEntry entry = do (big, small) <- (,) <$> integer <*> integer (inUse big small <|> free big small) <* blank where - inUse :: Int64 -> Int -> Parser XRefEntry + inUse :: Int64 -> Int -> Parser u XRefEntry inUse offset generation = char 'n' *> return (InUse {offset, generation}) - free :: Int64 -> Int -> Parser XRefEntry + free :: Int64 -> Int -> Parser u XRefEntry free nextFree generation = char 'f' *> return (Free {nextFree, generation}) -xrefSubSection :: Parser XRefSubSection +xrefSubSection :: Parser u XRefSubSection xrefSubSection = do (firstObjectId, entriesNumber) <- (,) <$> integer <*> integer entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry - return $ XRefSubSection {firstObjectId, entries} + return $ XRefSubSection {firstObjectId, entriesNumber, entries} -content :: Parser Content +content :: Parser u Content content = - Content [] + Content [] Map.empty <$> (line "xref" *> xrefSubSection `sepBy` eol) <*> (line "trailer" *> dictionary <* eol) <*> (line "startxref" *> integer)