Cut PDF module in two, implement basic parsing up to reading XRef table

This commit is contained in:
Tissevert 2019-05-13 18:22:05 +02:00
parent 6eacb55fc4
commit 8043f84da8
4 changed files with 242 additions and 156 deletions

View file

@ -17,6 +17,7 @@ cabal-version: >=1.10
library
exposed-modules: PDF
, PDF.Object
, Data.ByteString.Lazy.Char8.Util
other-modules:
-- other-extensions:
@ -25,4 +26,5 @@ library
, containers
, parsec
hs-source-dirs: src
ghc-options: -Wall
default-language: Haskell2010

View file

@ -4,7 +4,7 @@ module Data.ByteString.Lazy.Char8.Util (
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, pack, take)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
import Data.Int (Int64)
import Prelude hiding (length)

View file

@ -1,14 +1,18 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NamedFieldPuns #-}
module PDF (
parseDocument
, Document(..)
, Content(..)
, DirectObject(..)
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, pack, unpack)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, unpack)
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
import Data.Int (Int64)
import Data.Map (Map, lookup)
import qualified Data.Map as Map (empty, fromList)
import qualified Data.Map as Map (lookup)
import PDF.Object (Content(..), DirectObject(..), EOLStyle(..), content, eol, eolCharset)
import Text.Parsec
import Text.Parsec.ByteString.Lazy (Parser)
import Text.Parsec.Pos (newPos)
@ -16,157 +20,24 @@ import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
data Document = Document {
pdfVersion :: String
, objectsById :: Map Int Object
, flow :: [Occurrence]
, xref :: [ByteString]
, trailer :: ByteString
, startXref :: Int64
, contents :: [Content]
} deriving Show
type Dictionary = Map String DirectObject
data DirectObject =
Boolean Bool
| Number Float
| String StringObj
| Name String
| Array [DirectObject]
| Dictionary Dictionary
| Null
| Reference (Int, Int)
deriving Show
data Object =
Direct DirectObject
| Stream {
header :: Dictionary
, content :: ByteString
}
deriving Show
data Occurrence =
Comment String
| Indirect {
objId :: Int
, versionNumber :: Int
, objectContent :: Object
}
deriving Show
data StringObj = Literal String | Hexadecimal String deriving Show
data EOLStyle = CR | LF | CRLF
eolCharset :: String
eolCharset = "\r\n"
eol :: Parser EOLStyle
eol =
try (string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR)
<|> (string "\n" >> return LF)
whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 "
whiteSpace :: Parser ()
whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return ()
blank :: Parser ()
blank = skipMany whiteSpace
delimiterCharset :: String
delimiterCharset = "()<>[]{}/%"
delimiter :: Parser Char
delimiter = oneOf delimiterCharset
regular :: Parser Char
regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset
int :: Parser Int
int = read <$> many1 digit <* whiteSpace
directObject :: Parser DirectObject
directObject =
Boolean <$> boolean
<|> Number <$> number
<|> String <$> stringObj
<|> Name <$> name
<|> Array <$> array
<|> const Null <$> nullObject
<|> Reference <$> reference
boolean :: Parser Bool
boolean = (string "true" *> return True) <|> (string "false" *> return False)
number :: Parser Float
number = read <$> (mappend <$> (mappend <$> sign <*> integerPart) <*> floatPart)
where
sign = string "-" <|> option "" (char '+' >> return "")
integerPart = option "0" $ many1 digit
floatPart = option "" $ (:) <$> char '.' <*> integerPart
stringObj :: Parser StringObj
stringObj =
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
where
literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar
matchingParenthesis =
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
name :: Parser String
name = char '/' *> many regular
array :: Parser [DirectObject]
array = char '[' *> directObject `sepBy` whiteSpace <* char ']'
dictionary :: Parser Dictionary
dictionary =
string "<<" *> blank *> keyValPairs <* blank <* string ">>"
where
keyValPairs = Map.fromList <$> many ((,) <$> name <*> directObject)
nullObject :: Parser ()
nullObject = string "null" *> return ()
comment :: Parser String
comment = char '%' *> many (noneOf eolCharset) <* eol
reference :: Parser (Int, Int)
reference = (,) <$> int <*> int <* char 'R'
object :: Parser Object
object =
Direct <$> directObject
<|> Stream <$> dictionary <*> (BS.pack <$> stream)
where
stream = string "stream" *> eol *> many anyChar <* eol <* string "endstream"
occurrence :: Parser Occurrence
occurrence = Comment <$> comment <|> indirectObj
where
indirectObj =
Indirect <$> int <*> int <*> (string "obj" *> eol
*> object
<* eol <* string "endobj")
version :: Parser String
version = string magicNumber *> many (noneOf eolCharset)
magicNumber :: String
magicNumber = "%PDF-"
version :: Parser String
version = string magicNumber *> many (noneOf eolCharset)
eofMarker :: ByteString
eofMarker = "%%EOF"
parseError :: String -> Either ParseError a
parseError errorMessage =
Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0)
check :: Bool -> String -> Either ParseError ()
check test errorMessage = if test then return () else Left parseError
where
parseError = newErrorMessage (Message errorMessage) (newPos "" 0 0)
check test errorMessage = if test then return () else parseError errorMessage
readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64
readStartXref eolStyle input =
@ -189,14 +60,16 @@ parseDocument :: ByteString -> Either ParseError Document
parseDocument input = do
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
startXref <- readStartXref eolStyle input
return . fillObjects input $ Document {
pdfVersion
, objectsById = Map.empty
, flow = []
, xref = []
, trailer = ""
, startXref
}
contents <- iterateContents startXref input
return $ Document {pdfVersion, contents}
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
iterateContents startXref input =
parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
where
stopOrFollow c@(Content {trailer}) =
case Map.lookup "Prev" trailer of
Nothing -> Right [c]
Just (Number f) -> (c:) <$> (iterateContents (truncate f) input)
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
fillObjects :: ByteString -> Document -> Document
fillObjects input document = document

211
src/PDF/Object.hs Normal file
View file

@ -0,0 +1,211 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.Object (
Content(..)
, DirectObject(..)
, EOLStyle(..)
, Object(..)
, Occurrence(..)
, XRefEntry(..)
, XRefSubSection(..)
, content
, eol
, eolCharset
, occurrence
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (pack)
import Data.Int (Int64)
import Data.Map (Map)
import qualified Data.Map as Map (fromList, lookup)
import Text.Parsec
import Text.Parsec.ByteString.Lazy (Parser)
data EOLStyle = CR | LF | CRLF
type Dictionary = Map String DirectObject
data StringObj = Literal String | Hexadecimal String deriving Show
data DirectObject =
Boolean Bool
| Number Float
| String StringObj
| Name String
| Array [DirectObject]
| Dictionary Dictionary
| Null
| Reference (Int, Int)
deriving Show
data Object =
Direct DirectObject
| Stream {
header :: Dictionary
, streamContent :: ByteString
}
deriving Show
data Occurrence =
Comment String
| Indirect {
objId :: Int
, versionNumber :: Int
, objectContent :: Object
}
deriving Show
data XRefEntry = InUse {
offset :: Int64
, generation :: Int
} | Free {
nextFree :: Int64
, generation :: Int
} deriving Show
data XRefSubSection = XRefSubSection {
firstObjectId :: Int
, entries :: Map Int XRefEntry
} deriving Show
data Content = Content {
body :: [Occurrence]
, xrefSection :: [XRefSubSection]
, trailer :: Dictionary
, startXrefPosition :: Int64
} deriving Show
eolCharset :: String
eolCharset = "\r\n"
eol :: Parser EOLStyle
eol =
try (string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR)
<|> (string "\n" >> return LF)
line :: String -> Parser ()
line l = string l *> eol *> return ()
whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 "
whiteSpace :: Parser ()
whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return ()
blank :: Parser ()
blank = skipMany whiteSpace
delimiterCharset :: String
delimiterCharset = "()<>[]{}/%"
{-
delimiter :: Parser Char
delimiter = oneOf delimiterCharset
-}
regular :: Parser Char
regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset
integer :: (Read a, Num a) => Parser a
integer = read <$> many1 digit <* whiteSpace
directObject :: Parser DirectObject
directObject =
Boolean <$> try boolean
<|> Reference <$> try reference {- defined before Number because Number is a prefix of it -}
<|> Number <$> try number
<|> String <$> try stringObj
<|> Name <$> try name
<|> Array <$> try array
<|> Dictionary <$> try dictionary
<|> const Null <$> try nullObject
boolean :: Parser Bool
boolean = (string "true" *> return True) <|> (string "false" *> return False)
number :: Parser Float
number = read <$> (mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart))
where
sign = string "-" <|> option "" (char '+' >> return "")
integerPart = mappend <$> many1 digit <*> option "" floatPart
floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit)
stringObj :: Parser StringObj
stringObj =
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
where
literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar
matchingParenthesis =
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
name :: Parser String
name = char '/' *> many regular
array :: Parser [DirectObject]
array = char '[' *> blank *> directObject `endBy` blank <* char ']'
dictionary :: Parser Dictionary
dictionary =
try (string "<<" *> blank *> keyValPairs <* string ">>")
where
keyVal = (,) <$> name <* blank <*> directObject
keyValPairs = Map.fromList <$> keyVal `endBy` blank
nullObject :: Parser ()
nullObject = string "null" *> return ()
comment :: Parser String
comment = char '%' *> many (noneOf eolCharset) <* eol
reference :: Parser (Int, Int)
reference = (,) <$> integer <*> integer <* char 'R'
object :: Parser Object
object =
Direct <$> directObject
<|> do
header <- dictionary
streamContent <- BS.pack <$> stream (Map.lookup "Length" header)
return $ Stream header streamContent
where
stream value =
case value of
Nothing -> fail "Missing 'Length' key on stream"
Just (Number size) ->
line "stream" *> count (truncate size) anyChar <* eol <* line "endstream"
_ -> fail "Expected number"
occurrence :: Parser Occurrence
occurrence = Comment <$> comment <|> indirectObj
where
indirectObj =
Indirect <$> integer <*> integer <*> (line "obj"
*> object
<* eol <* line "endobj")
entry :: Parser XRefEntry
entry = do
(big, small) <- (,) <$> integer <*> integer
(inUse big small <|> free big small) <* blank
where
inUse :: Int64 -> Int -> Parser XRefEntry
inUse offset generation = char 'n' *> return (InUse {offset, generation})
free :: Int64 -> Int -> Parser XRefEntry
free nextFree generation = char 'f' *> return (Free {nextFree, generation})
xrefSubSection :: Parser XRefSubSection
xrefSubSection = do
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
return $ XRefSubSection {firstObjectId, entries}
content :: Parser Content
content =
Content []
<$> (line "xref" *> xrefSubSection `sepBy` eol)
<*> (line "trailer" *> dictionary <* eol)
<*> (line "startxref" *> integer)