Implement retrieving objects in the body of the document and use it to populate the structure previously parsed
This commit is contained in:
parent
8043f84da8
commit
91292d6401
4 changed files with 169 additions and 65 deletions
|
@ -17,6 +17,7 @@ cabal-version: >=1.10
|
|||
|
||||
library
|
||||
exposed-modules: PDF
|
||||
, PDF.Body
|
||||
, PDF.Object
|
||||
, Data.ByteString.Lazy.Char8.Util
|
||||
other-modules:
|
||||
|
|
|
@ -13,6 +13,7 @@ import Data.ByteString.Lazy.Char8.Util (previous, subBS)
|
|||
import Data.Int (Int64)
|
||||
import qualified Data.Map as Map (lookup)
|
||||
import PDF.Object (Content(..), DirectObject(..), EOLStyle(..), content, eol, eolCharset)
|
||||
import PDF.Body (populate)
|
||||
import Text.Parsec
|
||||
import Text.Parsec.ByteString.Lazy (Parser)
|
||||
import Text.Parsec.Pos (newPos)
|
||||
|
@ -60,7 +61,8 @@ parseDocument :: ByteString -> Either ParseError Document
|
|||
parseDocument input = do
|
||||
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
|
||||
startXref <- readStartXref eolStyle input
|
||||
contents <- iterateContents startXref input
|
||||
structures <- iterateContents startXref input
|
||||
let contents = populate input <$> structures
|
||||
return $ Document {pdfVersion, contents}
|
||||
|
||||
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
|
||||
|
|
119
src/PDF/Body.hs
Normal file
119
src/PDF/Body.hs
Normal file
|
@ -0,0 +1,119 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module PDF.Body (
|
||||
populate
|
||||
) where
|
||||
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
|
||||
import Data.Functor.Identity (Identity)
|
||||
import Data.Int (Int64)
|
||||
import qualified Data.Map as Map (insert, lookup)
|
||||
import PDF.Object (
|
||||
Content(..), DirectObject(..), IndirectObjCoordinates(..), Object(..)
|
||||
, Occurrence(..), XRefEntry(..), XRefSection, XRefSubSection(..)
|
||||
, eol, eolCharset, dictionary, directObject, integer, line
|
||||
)
|
||||
import Text.Parsec
|
||||
|
||||
data UserState = UserState {
|
||||
input :: ByteString
|
||||
, content :: Content
|
||||
}
|
||||
|
||||
type SParser = ParsecT ByteString UserState Identity
|
||||
|
||||
modifyContent :: (Content -> Content) -> SParser ()
|
||||
modifyContent f = modifyState $ \state -> state {content = f $ content state}
|
||||
|
||||
addObject :: Int -> Object -> SParser ()
|
||||
addObject objectId newObject = modifyContent $ \content -> content {
|
||||
objects = Map.insert objectId newObject $ objects content
|
||||
}
|
||||
|
||||
pushOccurrence :: Occurrence -> SParser ()
|
||||
pushOccurrence newOccurrence = modifyContent $ \content -> content {
|
||||
body = newOccurrence : (body content)
|
||||
}
|
||||
|
||||
comment :: SParser String
|
||||
comment = char '%' *> many (noneOf eolCharset) <* eol
|
||||
|
||||
lookupOffset :: Int -> XRefSection -> Maybe Int64
|
||||
lookupOffset _ [] = Nothing
|
||||
lookupOffset objectId (xrefSubSection:others) =
|
||||
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
|
||||
let index = objectId - firstObjectId in
|
||||
if index >= 0 && index < entriesNumber
|
||||
then
|
||||
case Map.lookup index entries of
|
||||
Just (InUse {offset}) -> Just offset
|
||||
_ -> Nothing
|
||||
else lookupOffset objectId others
|
||||
|
||||
getOffset :: Int -> SParser Int64
|
||||
getOffset objectId = do
|
||||
Content {xrefSection} <- content <$> getState
|
||||
case lookupOffset objectId xrefSection of
|
||||
Nothing -> fail $
|
||||
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
|
||||
Just offset -> return offset
|
||||
|
||||
loadNumber :: Int -> SParser Float
|
||||
loadNumber objectId = do
|
||||
offset <- getOffset objectId
|
||||
state <- getState
|
||||
let objectStart = BS.drop offset (input state)
|
||||
case runParser object state ("object@" ++ show offset) objectStart of
|
||||
Left _ -> fail $ "Could not parse length object@" ++ show offset
|
||||
Right number@(Direct (Number n)) ->
|
||||
addObject objectId number >> return n
|
||||
Right obj -> fail $
|
||||
"Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
|
||||
|
||||
invalidValue :: Object -> String
|
||||
invalidValue v = "Invalid value " ++ show v
|
||||
|
||||
getSize :: Maybe DirectObject -> SParser Float
|
||||
getSize Nothing = fail "Missing '/Length' key on stream"
|
||||
getSize (Just (Number size)) = return size
|
||||
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
|
||||
Content {objects} <- content <$> getState
|
||||
case Map.lookup objectId objects of
|
||||
Nothing -> loadNumber objectId
|
||||
Just (Direct (Number size)) -> return size
|
||||
Just v -> fail $
|
||||
invalidValue v ++ " for obj " ++ show objectId ++ "used as /Length"
|
||||
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
|
||||
|
||||
object :: SParser Object
|
||||
object =
|
||||
Direct <$> directObject
|
||||
<|> do
|
||||
header <- dictionary
|
||||
size <- getSize (Map.lookup "Length" header)
|
||||
streamContent <- BS.pack <$> stream (truncate size)
|
||||
return $ Stream {header, streamContent}
|
||||
where
|
||||
stream size = line "stream" *> count size anyChar <* eol <* line "endstream"
|
||||
|
||||
occurrence :: SParser Occurrence
|
||||
occurrence = Comment <$> comment <|> indirectObj
|
||||
where
|
||||
indirectObj = do
|
||||
objectId <- integer
|
||||
coordinates <- IndirectObjCoordinates objectId <$> integer
|
||||
objectValue <- line "obj" *> object <* eol <* line "endobj"
|
||||
addObject objectId objectValue
|
||||
return $ Indirect coordinates
|
||||
|
||||
populate :: ByteString -> Content -> Content
|
||||
populate input initialContent =
|
||||
case runParser recurseOnOccurrences initialState "" input of
|
||||
Left _ -> initialContent
|
||||
Right finalState -> content finalState
|
||||
where
|
||||
initialState = UserState {input, content = initialContent}
|
||||
|
||||
recurseOnOccurrences :: SParser UserState
|
||||
recurseOnOccurrences =
|
||||
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState
|
|
@ -3,23 +3,29 @@ module PDF.Object (
|
|||
Content(..)
|
||||
, DirectObject(..)
|
||||
, EOLStyle(..)
|
||||
, IndirectObjCoordinates(..)
|
||||
, Object(..)
|
||||
, Occurrence(..)
|
||||
, XRefEntry(..)
|
||||
, XRefSection
|
||||
, XRefSubSection(..)
|
||||
, content
|
||||
, dictionary
|
||||
, directObject
|
||||
, eol
|
||||
, eolCharset
|
||||
, occurrence
|
||||
, integer
|
||||
, line
|
||||
) where
|
||||
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (pack)
|
||||
import Data.Int (Int64)
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (fromList, lookup)
|
||||
import qualified Data.Map as Map (empty, fromList)
|
||||
import Text.Parsec
|
||||
import Text.Parsec.ByteString.Lazy (Parser)
|
||||
--import Text.Parsec.ByteString.Lazy (Parser)
|
||||
|
||||
type Parser u = Parsec ByteString u
|
||||
|
||||
data EOLStyle = CR | LF | CRLF
|
||||
|
||||
|
@ -35,7 +41,7 @@ data DirectObject =
|
|||
| Array [DirectObject]
|
||||
| Dictionary Dictionary
|
||||
| Null
|
||||
| Reference (Int, Int)
|
||||
| Reference IndirectObjCoordinates
|
||||
deriving Show
|
||||
|
||||
data Object =
|
||||
|
@ -46,14 +52,12 @@ data Object =
|
|||
}
|
||||
deriving Show
|
||||
|
||||
data Occurrence =
|
||||
Comment String
|
||||
| Indirect {
|
||||
objId :: Int
|
||||
data IndirectObjCoordinates = IndirectObjCoordinates {
|
||||
objectId :: Int
|
||||
, versionNumber :: Int
|
||||
, objectContent :: Object
|
||||
}
|
||||
deriving Show
|
||||
} deriving Show
|
||||
|
||||
data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
|
||||
|
||||
data XRefEntry = InUse {
|
||||
offset :: Int64
|
||||
|
@ -65,12 +69,16 @@ data XRefEntry = InUse {
|
|||
|
||||
data XRefSubSection = XRefSubSection {
|
||||
firstObjectId :: Int
|
||||
, entriesNumber :: Int
|
||||
, entries :: Map Int XRefEntry
|
||||
} deriving Show
|
||||
|
||||
type XRefSection = [XRefSubSection]
|
||||
|
||||
data Content = Content {
|
||||
body :: [Occurrence]
|
||||
, xrefSection :: [XRefSubSection]
|
||||
, objects :: Map Int Object
|
||||
, xrefSection :: XRefSection
|
||||
, trailer :: Dictionary
|
||||
, startXrefPosition :: Int64
|
||||
} deriving Show
|
||||
|
@ -78,39 +86,39 @@ data Content = Content {
|
|||
eolCharset :: String
|
||||
eolCharset = "\r\n"
|
||||
|
||||
eol :: Parser EOLStyle
|
||||
eol :: Parser u EOLStyle
|
||||
eol =
|
||||
try (string "\r\n" >> return CRLF)
|
||||
<|> (string "\r" >> return CR)
|
||||
<|> (string "\n" >> return LF)
|
||||
|
||||
line :: String -> Parser ()
|
||||
line :: String -> Parser u ()
|
||||
line l = string l *> eol *> return ()
|
||||
|
||||
whiteSpaceCharset :: String
|
||||
whiteSpaceCharset = "\0\t\12 "
|
||||
|
||||
whiteSpace :: Parser ()
|
||||
whiteSpace :: Parser u ()
|
||||
whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return ()
|
||||
|
||||
blank :: Parser ()
|
||||
blank :: Parser u ()
|
||||
blank = skipMany whiteSpace
|
||||
|
||||
delimiterCharset :: String
|
||||
delimiterCharset = "()<>[]{}/%"
|
||||
|
||||
{-
|
||||
delimiter :: Parser Char
|
||||
delimiter :: Parser u Char
|
||||
delimiter = oneOf delimiterCharset
|
||||
-}
|
||||
|
||||
regular :: Parser Char
|
||||
regular :: Parser u Char
|
||||
regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset
|
||||
|
||||
integer :: (Read a, Num a) => Parser a
|
||||
integer :: (Read a, Num a) => Parser u a
|
||||
integer = read <$> many1 digit <* whiteSpace
|
||||
|
||||
directObject :: Parser DirectObject
|
||||
directObject :: Parser u DirectObject
|
||||
directObject =
|
||||
Boolean <$> try boolean
|
||||
<|> Reference <$> try reference {- defined before Number because Number is a prefix of it -}
|
||||
|
@ -121,17 +129,17 @@ directObject =
|
|||
<|> Dictionary <$> try dictionary
|
||||
<|> const Null <$> try nullObject
|
||||
|
||||
boolean :: Parser Bool
|
||||
boolean :: Parser u Bool
|
||||
boolean = (string "true" *> return True) <|> (string "false" *> return False)
|
||||
|
||||
number :: Parser Float
|
||||
number :: Parser u Float
|
||||
number = read <$> (mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart))
|
||||
where
|
||||
sign = string "-" <|> option "" (char '+' >> return "")
|
||||
integerPart = mappend <$> many1 digit <*> option "" floatPart
|
||||
floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit)
|
||||
|
||||
stringObj :: Parser StringObj
|
||||
stringObj :: Parser u StringObj
|
||||
stringObj =
|
||||
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
|
||||
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
|
||||
|
@ -142,70 +150,44 @@ stringObj =
|
|||
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
|
||||
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
|
||||
|
||||
name :: Parser String
|
||||
name :: Parser u String
|
||||
name = char '/' *> many regular
|
||||
|
||||
array :: Parser [DirectObject]
|
||||
array :: Parser u [DirectObject]
|
||||
array = char '[' *> blank *> directObject `endBy` blank <* char ']'
|
||||
|
||||
dictionary :: Parser Dictionary
|
||||
dictionary :: Parser u Dictionary
|
||||
dictionary =
|
||||
try (string "<<" *> blank *> keyValPairs <* string ">>")
|
||||
where
|
||||
keyVal = (,) <$> name <* blank <*> directObject
|
||||
keyValPairs = Map.fromList <$> keyVal `endBy` blank
|
||||
|
||||
nullObject :: Parser ()
|
||||
nullObject :: Parser u ()
|
||||
nullObject = string "null" *> return ()
|
||||
|
||||
comment :: Parser String
|
||||
comment = char '%' *> many (noneOf eolCharset) <* eol
|
||||
reference :: Parser u IndirectObjCoordinates
|
||||
reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R'
|
||||
|
||||
reference :: Parser (Int, Int)
|
||||
reference = (,) <$> integer <*> integer <* char 'R'
|
||||
|
||||
object :: Parser Object
|
||||
object =
|
||||
Direct <$> directObject
|
||||
<|> do
|
||||
header <- dictionary
|
||||
streamContent <- BS.pack <$> stream (Map.lookup "Length" header)
|
||||
return $ Stream header streamContent
|
||||
where
|
||||
stream value =
|
||||
case value of
|
||||
Nothing -> fail "Missing 'Length' key on stream"
|
||||
Just (Number size) ->
|
||||
line "stream" *> count (truncate size) anyChar <* eol <* line "endstream"
|
||||
_ -> fail "Expected number"
|
||||
|
||||
occurrence :: Parser Occurrence
|
||||
occurrence = Comment <$> comment <|> indirectObj
|
||||
where
|
||||
indirectObj =
|
||||
Indirect <$> integer <*> integer <*> (line "obj"
|
||||
*> object
|
||||
<* eol <* line "endobj")
|
||||
|
||||
entry :: Parser XRefEntry
|
||||
entry :: Parser u XRefEntry
|
||||
entry = do
|
||||
(big, small) <- (,) <$> integer <*> integer
|
||||
(inUse big small <|> free big small) <* blank
|
||||
where
|
||||
inUse :: Int64 -> Int -> Parser XRefEntry
|
||||
inUse :: Int64 -> Int -> Parser u XRefEntry
|
||||
inUse offset generation = char 'n' *> return (InUse {offset, generation})
|
||||
free :: Int64 -> Int -> Parser XRefEntry
|
||||
free :: Int64 -> Int -> Parser u XRefEntry
|
||||
free nextFree generation = char 'f' *> return (Free {nextFree, generation})
|
||||
|
||||
xrefSubSection :: Parser XRefSubSection
|
||||
xrefSubSection :: Parser u XRefSubSection
|
||||
xrefSubSection = do
|
||||
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer
|
||||
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
|
||||
return $ XRefSubSection {firstObjectId, entries}
|
||||
return $ XRefSubSection {firstObjectId, entriesNumber, entries}
|
||||
|
||||
content :: Parser Content
|
||||
content :: Parser u Content
|
||||
content =
|
||||
Content []
|
||||
Content [] Map.empty
|
||||
<$> (line "xref" *> xrefSubSection `sepBy` eol)
|
||||
<*> (line "trailer" *> dictionary <* eol)
|
||||
<*> (line "startxref" *> integer)
|
||||
|
|
Loading…
Reference in a new issue