Cut PDF module in two, implement basic parsing up to reading XRef table
This commit is contained in:
parent
6eacb55fc4
commit
8043f84da8
4 changed files with 242 additions and 156 deletions
|
@ -17,6 +17,7 @@ cabal-version: >=1.10
|
||||||
|
|
||||||
library
|
library
|
||||||
exposed-modules: PDF
|
exposed-modules: PDF
|
||||||
|
, PDF.Object
|
||||||
, Data.ByteString.Lazy.Char8.Util
|
, Data.ByteString.Lazy.Char8.Util
|
||||||
other-modules:
|
other-modules:
|
||||||
-- other-extensions:
|
-- other-extensions:
|
||||||
|
@ -25,4 +26,5 @@ library
|
||||||
, containers
|
, containers
|
||||||
, parsec
|
, parsec
|
||||||
hs-source-dirs: src
|
hs-source-dirs: src
|
||||||
|
ghc-options: -Wall
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
|
@ -4,7 +4,7 @@ module Data.ByteString.Lazy.Char8.Util (
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, pack, take)
|
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
|
||||||
import Data.Int (Int64)
|
import Data.Int (Int64)
|
||||||
import Prelude hiding (length)
|
import Prelude hiding (length)
|
||||||
|
|
||||||
|
|
183
src/PDF.hs
183
src/PDF.hs
|
@ -1,14 +1,18 @@
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
module PDF (
|
module PDF (
|
||||||
|
parseDocument
|
||||||
|
, Document(..)
|
||||||
|
, Content(..)
|
||||||
|
, DirectObject(..)
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, pack, unpack)
|
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, unpack)
|
||||||
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
|
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
|
||||||
import Data.Int (Int64)
|
import Data.Int (Int64)
|
||||||
import Data.Map (Map, lookup)
|
import qualified Data.Map as Map (lookup)
|
||||||
import qualified Data.Map as Map (empty, fromList)
|
import PDF.Object (Content(..), DirectObject(..), EOLStyle(..), content, eol, eolCharset)
|
||||||
import Text.Parsec
|
import Text.Parsec
|
||||||
import Text.Parsec.ByteString.Lazy (Parser)
|
import Text.Parsec.ByteString.Lazy (Parser)
|
||||||
import Text.Parsec.Pos (newPos)
|
import Text.Parsec.Pos (newPos)
|
||||||
|
@ -16,157 +20,24 @@ import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
|
||||||
|
|
||||||
data Document = Document {
|
data Document = Document {
|
||||||
pdfVersion :: String
|
pdfVersion :: String
|
||||||
, objectsById :: Map Int Object
|
, contents :: [Content]
|
||||||
, flow :: [Occurrence]
|
|
||||||
, xref :: [ByteString]
|
|
||||||
, trailer :: ByteString
|
|
||||||
, startXref :: Int64
|
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
type Dictionary = Map String DirectObject
|
|
||||||
|
|
||||||
data DirectObject =
|
|
||||||
Boolean Bool
|
|
||||||
| Number Float
|
|
||||||
| String StringObj
|
|
||||||
| Name String
|
|
||||||
| Array [DirectObject]
|
|
||||||
| Dictionary Dictionary
|
|
||||||
| Null
|
|
||||||
| Reference (Int, Int)
|
|
||||||
deriving Show
|
|
||||||
|
|
||||||
data Object =
|
|
||||||
Direct DirectObject
|
|
||||||
| Stream {
|
|
||||||
header :: Dictionary
|
|
||||||
, content :: ByteString
|
|
||||||
}
|
|
||||||
deriving Show
|
|
||||||
|
|
||||||
data Occurrence =
|
|
||||||
Comment String
|
|
||||||
| Indirect {
|
|
||||||
objId :: Int
|
|
||||||
, versionNumber :: Int
|
|
||||||
, objectContent :: Object
|
|
||||||
}
|
|
||||||
deriving Show
|
|
||||||
|
|
||||||
data StringObj = Literal String | Hexadecimal String deriving Show
|
|
||||||
|
|
||||||
data EOLStyle = CR | LF | CRLF
|
|
||||||
|
|
||||||
eolCharset :: String
|
|
||||||
eolCharset = "\r\n"
|
|
||||||
|
|
||||||
eol :: Parser EOLStyle
|
|
||||||
eol =
|
|
||||||
try (string "\r\n" >> return CRLF)
|
|
||||||
<|> (string "\r" >> return CR)
|
|
||||||
<|> (string "\n" >> return LF)
|
|
||||||
|
|
||||||
whiteSpaceCharset :: String
|
|
||||||
whiteSpaceCharset = "\0\t\12 "
|
|
||||||
|
|
||||||
whiteSpace :: Parser ()
|
|
||||||
whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return ()
|
|
||||||
|
|
||||||
blank :: Parser ()
|
|
||||||
blank = skipMany whiteSpace
|
|
||||||
|
|
||||||
delimiterCharset :: String
|
|
||||||
delimiterCharset = "()<>[]{}/%"
|
|
||||||
|
|
||||||
delimiter :: Parser Char
|
|
||||||
delimiter = oneOf delimiterCharset
|
|
||||||
|
|
||||||
regular :: Parser Char
|
|
||||||
regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset
|
|
||||||
|
|
||||||
int :: Parser Int
|
|
||||||
int = read <$> many1 digit <* whiteSpace
|
|
||||||
|
|
||||||
directObject :: Parser DirectObject
|
|
||||||
directObject =
|
|
||||||
Boolean <$> boolean
|
|
||||||
<|> Number <$> number
|
|
||||||
<|> String <$> stringObj
|
|
||||||
<|> Name <$> name
|
|
||||||
<|> Array <$> array
|
|
||||||
<|> const Null <$> nullObject
|
|
||||||
<|> Reference <$> reference
|
|
||||||
|
|
||||||
boolean :: Parser Bool
|
|
||||||
boolean = (string "true" *> return True) <|> (string "false" *> return False)
|
|
||||||
|
|
||||||
number :: Parser Float
|
|
||||||
number = read <$> (mappend <$> (mappend <$> sign <*> integerPart) <*> floatPart)
|
|
||||||
where
|
|
||||||
sign = string "-" <|> option "" (char '+' >> return "")
|
|
||||||
integerPart = option "0" $ many1 digit
|
|
||||||
floatPart = option "" $ (:) <$> char '.' <*> integerPart
|
|
||||||
|
|
||||||
stringObj :: Parser StringObj
|
|
||||||
stringObj =
|
|
||||||
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
|
|
||||||
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
|
|
||||||
where
|
|
||||||
literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar
|
|
||||||
matchingParenthesis =
|
|
||||||
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
|
|
||||||
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
|
|
||||||
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
|
|
||||||
|
|
||||||
name :: Parser String
|
|
||||||
name = char '/' *> many regular
|
|
||||||
|
|
||||||
array :: Parser [DirectObject]
|
|
||||||
array = char '[' *> directObject `sepBy` whiteSpace <* char ']'
|
|
||||||
|
|
||||||
dictionary :: Parser Dictionary
|
|
||||||
dictionary =
|
|
||||||
string "<<" *> blank *> keyValPairs <* blank <* string ">>"
|
|
||||||
where
|
|
||||||
keyValPairs = Map.fromList <$> many ((,) <$> name <*> directObject)
|
|
||||||
|
|
||||||
nullObject :: Parser ()
|
|
||||||
nullObject = string "null" *> return ()
|
|
||||||
|
|
||||||
comment :: Parser String
|
|
||||||
comment = char '%' *> many (noneOf eolCharset) <* eol
|
|
||||||
|
|
||||||
reference :: Parser (Int, Int)
|
|
||||||
reference = (,) <$> int <*> int <* char 'R'
|
|
||||||
|
|
||||||
object :: Parser Object
|
|
||||||
object =
|
|
||||||
Direct <$> directObject
|
|
||||||
<|> Stream <$> dictionary <*> (BS.pack <$> stream)
|
|
||||||
where
|
|
||||||
stream = string "stream" *> eol *> many anyChar <* eol <* string "endstream"
|
|
||||||
|
|
||||||
occurrence :: Parser Occurrence
|
|
||||||
occurrence = Comment <$> comment <|> indirectObj
|
|
||||||
where
|
|
||||||
indirectObj =
|
|
||||||
Indirect <$> int <*> int <*> (string "obj" *> eol
|
|
||||||
*> object
|
|
||||||
<* eol <* string "endobj")
|
|
||||||
|
|
||||||
version :: Parser String
|
|
||||||
version = string magicNumber *> many (noneOf eolCharset)
|
|
||||||
|
|
||||||
magicNumber :: String
|
magicNumber :: String
|
||||||
magicNumber = "%PDF-"
|
magicNumber = "%PDF-"
|
||||||
|
|
||||||
|
version :: Parser String
|
||||||
|
version = string magicNumber *> many (noneOf eolCharset)
|
||||||
|
|
||||||
eofMarker :: ByteString
|
eofMarker :: ByteString
|
||||||
eofMarker = "%%EOF"
|
eofMarker = "%%EOF"
|
||||||
|
|
||||||
|
parseError :: String -> Either ParseError a
|
||||||
|
parseError errorMessage =
|
||||||
|
Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0)
|
||||||
|
|
||||||
check :: Bool -> String -> Either ParseError ()
|
check :: Bool -> String -> Either ParseError ()
|
||||||
check test errorMessage = if test then return () else Left parseError
|
check test errorMessage = if test then return () else parseError errorMessage
|
||||||
where
|
|
||||||
parseError = newErrorMessage (Message errorMessage) (newPos "" 0 0)
|
|
||||||
|
|
||||||
readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64
|
readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64
|
||||||
readStartXref eolStyle input =
|
readStartXref eolStyle input =
|
||||||
|
@ -189,14 +60,16 @@ parseDocument :: ByteString -> Either ParseError Document
|
||||||
parseDocument input = do
|
parseDocument input = do
|
||||||
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
|
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
|
||||||
startXref <- readStartXref eolStyle input
|
startXref <- readStartXref eolStyle input
|
||||||
return . fillObjects input $ Document {
|
contents <- iterateContents startXref input
|
||||||
pdfVersion
|
return $ Document {pdfVersion, contents}
|
||||||
, objectsById = Map.empty
|
|
||||||
, flow = []
|
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
|
||||||
, xref = []
|
iterateContents startXref input =
|
||||||
, trailer = ""
|
parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
|
||||||
, startXref
|
where
|
||||||
}
|
stopOrFollow c@(Content {trailer}) =
|
||||||
|
case Map.lookup "Prev" trailer of
|
||||||
|
Nothing -> Right [c]
|
||||||
|
Just (Number f) -> (c:) <$> (iterateContents (truncate f) input)
|
||||||
|
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
|
||||||
|
|
||||||
fillObjects :: ByteString -> Document -> Document
|
|
||||||
fillObjects input document = document
|
|
||||||
|
|
211
src/PDF/Object.hs
Normal file
211
src/PDF/Object.hs
Normal file
|
@ -0,0 +1,211 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
module PDF.Object (
|
||||||
|
Content(..)
|
||||||
|
, DirectObject(..)
|
||||||
|
, EOLStyle(..)
|
||||||
|
, Object(..)
|
||||||
|
, Occurrence(..)
|
||||||
|
, XRefEntry(..)
|
||||||
|
, XRefSubSection(..)
|
||||||
|
, content
|
||||||
|
, eol
|
||||||
|
, eolCharset
|
||||||
|
, occurrence
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
|
import qualified Data.ByteString.Lazy.Char8 as BS (pack)
|
||||||
|
import Data.Int (Int64)
|
||||||
|
import Data.Map (Map)
|
||||||
|
import qualified Data.Map as Map (fromList, lookup)
|
||||||
|
import Text.Parsec
|
||||||
|
import Text.Parsec.ByteString.Lazy (Parser)
|
||||||
|
|
||||||
|
data EOLStyle = CR | LF | CRLF
|
||||||
|
|
||||||
|
type Dictionary = Map String DirectObject
|
||||||
|
|
||||||
|
data StringObj = Literal String | Hexadecimal String deriving Show
|
||||||
|
|
||||||
|
data DirectObject =
|
||||||
|
Boolean Bool
|
||||||
|
| Number Float
|
||||||
|
| String StringObj
|
||||||
|
| Name String
|
||||||
|
| Array [DirectObject]
|
||||||
|
| Dictionary Dictionary
|
||||||
|
| Null
|
||||||
|
| Reference (Int, Int)
|
||||||
|
deriving Show
|
||||||
|
|
||||||
|
data Object =
|
||||||
|
Direct DirectObject
|
||||||
|
| Stream {
|
||||||
|
header :: Dictionary
|
||||||
|
, streamContent :: ByteString
|
||||||
|
}
|
||||||
|
deriving Show
|
||||||
|
|
||||||
|
data Occurrence =
|
||||||
|
Comment String
|
||||||
|
| Indirect {
|
||||||
|
objId :: Int
|
||||||
|
, versionNumber :: Int
|
||||||
|
, objectContent :: Object
|
||||||
|
}
|
||||||
|
deriving Show
|
||||||
|
|
||||||
|
data XRefEntry = InUse {
|
||||||
|
offset :: Int64
|
||||||
|
, generation :: Int
|
||||||
|
} | Free {
|
||||||
|
nextFree :: Int64
|
||||||
|
, generation :: Int
|
||||||
|
} deriving Show
|
||||||
|
|
||||||
|
data XRefSubSection = XRefSubSection {
|
||||||
|
firstObjectId :: Int
|
||||||
|
, entries :: Map Int XRefEntry
|
||||||
|
} deriving Show
|
||||||
|
|
||||||
|
data Content = Content {
|
||||||
|
body :: [Occurrence]
|
||||||
|
, xrefSection :: [XRefSubSection]
|
||||||
|
, trailer :: Dictionary
|
||||||
|
, startXrefPosition :: Int64
|
||||||
|
} deriving Show
|
||||||
|
|
||||||
|
eolCharset :: String
|
||||||
|
eolCharset = "\r\n"
|
||||||
|
|
||||||
|
eol :: Parser EOLStyle
|
||||||
|
eol =
|
||||||
|
try (string "\r\n" >> return CRLF)
|
||||||
|
<|> (string "\r" >> return CR)
|
||||||
|
<|> (string "\n" >> return LF)
|
||||||
|
|
||||||
|
line :: String -> Parser ()
|
||||||
|
line l = string l *> eol *> return ()
|
||||||
|
|
||||||
|
whiteSpaceCharset :: String
|
||||||
|
whiteSpaceCharset = "\0\t\12 "
|
||||||
|
|
||||||
|
whiteSpace :: Parser ()
|
||||||
|
whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return ()
|
||||||
|
|
||||||
|
blank :: Parser ()
|
||||||
|
blank = skipMany whiteSpace
|
||||||
|
|
||||||
|
delimiterCharset :: String
|
||||||
|
delimiterCharset = "()<>[]{}/%"
|
||||||
|
|
||||||
|
{-
|
||||||
|
delimiter :: Parser Char
|
||||||
|
delimiter = oneOf delimiterCharset
|
||||||
|
-}
|
||||||
|
|
||||||
|
regular :: Parser Char
|
||||||
|
regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset
|
||||||
|
|
||||||
|
integer :: (Read a, Num a) => Parser a
|
||||||
|
integer = read <$> many1 digit <* whiteSpace
|
||||||
|
|
||||||
|
directObject :: Parser DirectObject
|
||||||
|
directObject =
|
||||||
|
Boolean <$> try boolean
|
||||||
|
<|> Reference <$> try reference {- defined before Number because Number is a prefix of it -}
|
||||||
|
<|> Number <$> try number
|
||||||
|
<|> String <$> try stringObj
|
||||||
|
<|> Name <$> try name
|
||||||
|
<|> Array <$> try array
|
||||||
|
<|> Dictionary <$> try dictionary
|
||||||
|
<|> const Null <$> try nullObject
|
||||||
|
|
||||||
|
boolean :: Parser Bool
|
||||||
|
boolean = (string "true" *> return True) <|> (string "false" *> return False)
|
||||||
|
|
||||||
|
number :: Parser Float
|
||||||
|
number = read <$> (mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart))
|
||||||
|
where
|
||||||
|
sign = string "-" <|> option "" (char '+' >> return "")
|
||||||
|
integerPart = mappend <$> many1 digit <*> option "" floatPart
|
||||||
|
floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit)
|
||||||
|
|
||||||
|
stringObj :: Parser StringObj
|
||||||
|
stringObj =
|
||||||
|
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
|
||||||
|
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
|
||||||
|
where
|
||||||
|
literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar
|
||||||
|
matchingParenthesis =
|
||||||
|
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
|
||||||
|
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
|
||||||
|
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
|
||||||
|
|
||||||
|
name :: Parser String
|
||||||
|
name = char '/' *> many regular
|
||||||
|
|
||||||
|
array :: Parser [DirectObject]
|
||||||
|
array = char '[' *> blank *> directObject `endBy` blank <* char ']'
|
||||||
|
|
||||||
|
dictionary :: Parser Dictionary
|
||||||
|
dictionary =
|
||||||
|
try (string "<<" *> blank *> keyValPairs <* string ">>")
|
||||||
|
where
|
||||||
|
keyVal = (,) <$> name <* blank <*> directObject
|
||||||
|
keyValPairs = Map.fromList <$> keyVal `endBy` blank
|
||||||
|
|
||||||
|
nullObject :: Parser ()
|
||||||
|
nullObject = string "null" *> return ()
|
||||||
|
|
||||||
|
comment :: Parser String
|
||||||
|
comment = char '%' *> many (noneOf eolCharset) <* eol
|
||||||
|
|
||||||
|
reference :: Parser (Int, Int)
|
||||||
|
reference = (,) <$> integer <*> integer <* char 'R'
|
||||||
|
|
||||||
|
object :: Parser Object
|
||||||
|
object =
|
||||||
|
Direct <$> directObject
|
||||||
|
<|> do
|
||||||
|
header <- dictionary
|
||||||
|
streamContent <- BS.pack <$> stream (Map.lookup "Length" header)
|
||||||
|
return $ Stream header streamContent
|
||||||
|
where
|
||||||
|
stream value =
|
||||||
|
case value of
|
||||||
|
Nothing -> fail "Missing 'Length' key on stream"
|
||||||
|
Just (Number size) ->
|
||||||
|
line "stream" *> count (truncate size) anyChar <* eol <* line "endstream"
|
||||||
|
_ -> fail "Expected number"
|
||||||
|
|
||||||
|
occurrence :: Parser Occurrence
|
||||||
|
occurrence = Comment <$> comment <|> indirectObj
|
||||||
|
where
|
||||||
|
indirectObj =
|
||||||
|
Indirect <$> integer <*> integer <*> (line "obj"
|
||||||
|
*> object
|
||||||
|
<* eol <* line "endobj")
|
||||||
|
|
||||||
|
entry :: Parser XRefEntry
|
||||||
|
entry = do
|
||||||
|
(big, small) <- (,) <$> integer <*> integer
|
||||||
|
(inUse big small <|> free big small) <* blank
|
||||||
|
where
|
||||||
|
inUse :: Int64 -> Int -> Parser XRefEntry
|
||||||
|
inUse offset generation = char 'n' *> return (InUse {offset, generation})
|
||||||
|
free :: Int64 -> Int -> Parser XRefEntry
|
||||||
|
free nextFree generation = char 'f' *> return (Free {nextFree, generation})
|
||||||
|
|
||||||
|
xrefSubSection :: Parser XRefSubSection
|
||||||
|
xrefSubSection = do
|
||||||
|
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer
|
||||||
|
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
|
||||||
|
return $ XRefSubSection {firstObjectId, entries}
|
||||||
|
|
||||||
|
content :: Parser Content
|
||||||
|
content =
|
||||||
|
Content []
|
||||||
|
<$> (line "xref" *> xrefSubSection `sepBy` eol)
|
||||||
|
<*> (line "trailer" *> dictionary <* eol)
|
||||||
|
<*> (line "startxref" *> integer)
|
Loading…
Reference in a new issue