212 lines
5.6 KiB
Haskell
212 lines
5.6 KiB
Haskell
{-# LANGUAGE NamedFieldPuns #-}
|
||
module PDF.Object (
|
||
Content(..)
|
||
, DirectObject(..)
|
||
, EOLStyle(..)
|
||
, Object(..)
|
||
, Occurrence(..)
|
||
, XRefEntry(..)
|
||
, XRefSubSection(..)
|
||
, content
|
||
, eol
|
||
, eolCharset
|
||
, occurrence
|
||
) where
|
||
|
||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||
import qualified Data.ByteString.Lazy.Char8 as BS (pack)
|
||
import Data.Int (Int64)
|
||
import Data.Map (Map)
|
||
import qualified Data.Map as Map (fromList, lookup)
|
||
import Text.Parsec
|
||
import Text.Parsec.ByteString.Lazy (Parser)
|
||
|
||
data EOLStyle = CR | LF | CRLF
|
||
|
||
type Dictionary = Map String DirectObject
|
||
|
||
data StringObj = Literal String | Hexadecimal String deriving Show
|
||
|
||
data DirectObject =
|
||
Boolean Bool
|
||
| Number Float
|
||
| String StringObj
|
||
| Name String
|
||
| Array [DirectObject]
|
||
| Dictionary Dictionary
|
||
| Null
|
||
| Reference (Int, Int)
|
||
deriving Show
|
||
|
||
data Object =
|
||
Direct DirectObject
|
||
| Stream {
|
||
header :: Dictionary
|
||
, streamContent :: ByteString
|
||
}
|
||
deriving Show
|
||
|
||
data Occurrence =
|
||
Comment String
|
||
| Indirect {
|
||
objId :: Int
|
||
, versionNumber :: Int
|
||
, objectContent :: Object
|
||
}
|
||
deriving Show
|
||
|
||
data XRefEntry = InUse {
|
||
offset :: Int64
|
||
, generation :: Int
|
||
} | Free {
|
||
nextFree :: Int64
|
||
, generation :: Int
|
||
} deriving Show
|
||
|
||
data XRefSubSection = XRefSubSection {
|
||
firstObjectId :: Int
|
||
, entries :: Map Int XRefEntry
|
||
} deriving Show
|
||
|
||
data Content = Content {
|
||
body :: [Occurrence]
|
||
, xrefSection :: [XRefSubSection]
|
||
, trailer :: Dictionary
|
||
, startXrefPosition :: Int64
|
||
} deriving Show
|
||
|
||
eolCharset :: String
|
||
eolCharset = "\r\n"
|
||
|
||
eol :: Parser EOLStyle
|
||
eol =
|
||
try (string "\r\n" >> return CRLF)
|
||
<|> (string "\r" >> return CR)
|
||
<|> (string "\n" >> return LF)
|
||
|
||
line :: String -> Parser ()
|
||
line l = string l *> eol *> return ()
|
||
|
||
whiteSpaceCharset :: String
|
||
whiteSpaceCharset = "\0\t\12 "
|
||
|
||
whiteSpace :: Parser ()
|
||
whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return ()
|
||
|
||
blank :: Parser ()
|
||
blank = skipMany whiteSpace
|
||
|
||
delimiterCharset :: String
|
||
delimiterCharset = "()<>[]{}/%"
|
||
|
||
{-
|
||
delimiter :: Parser Char
|
||
delimiter = oneOf delimiterCharset
|
||
-}
|
||
|
||
regular :: Parser Char
|
||
regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset
|
||
|
||
integer :: (Read a, Num a) => Parser a
|
||
integer = read <$> many1 digit <* whiteSpace
|
||
|
||
directObject :: Parser DirectObject
|
||
directObject =
|
||
Boolean <$> try boolean
|
||
<|> Reference <$> try reference {- defined before Number because Number is a prefix of it -}
|
||
<|> Number <$> try number
|
||
<|> String <$> try stringObj
|
||
<|> Name <$> try name
|
||
<|> Array <$> try array
|
||
<|> Dictionary <$> try dictionary
|
||
<|> const Null <$> try nullObject
|
||
|
||
boolean :: Parser Bool
|
||
boolean = (string "true" *> return True) <|> (string "false" *> return False)
|
||
|
||
number :: Parser Float
|
||
number = read <$> (mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart))
|
||
where
|
||
sign = string "-" <|> option "" (char '+' >> return "")
|
||
integerPart = mappend <$> many1 digit <*> option "" floatPart
|
||
floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit)
|
||
|
||
stringObj :: Parser StringObj
|
||
stringObj =
|
||
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
|
||
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
|
||
where
|
||
literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar
|
||
matchingParenthesis =
|
||
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
|
||
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
|
||
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
|
||
|
||
name :: Parser String
|
||
name = char '/' *> many regular
|
||
|
||
array :: Parser [DirectObject]
|
||
array = char '[' *> blank *> directObject `endBy` blank <* char ']'
|
||
|
||
dictionary :: Parser Dictionary
|
||
dictionary =
|
||
try (string "<<" *> blank *> keyValPairs <* string ">>")
|
||
where
|
||
keyVal = (,) <$> name <* blank <*> directObject
|
||
keyValPairs = Map.fromList <$> keyVal `endBy` blank
|
||
|
||
nullObject :: Parser ()
|
||
nullObject = string "null" *> return ()
|
||
|
||
comment :: Parser String
|
||
comment = char '%' *> many (noneOf eolCharset) <* eol
|
||
|
||
reference :: Parser (Int, Int)
|
||
reference = (,) <$> integer <*> integer <* char 'R'
|
||
|
||
object :: Parser Object
|
||
object =
|
||
Direct <$> directObject
|
||
<|> do
|
||
header <- dictionary
|
||
streamContent <- BS.pack <$> stream (Map.lookup "Length" header)
|
||
return $ Stream header streamContent
|
||
where
|
||
stream value =
|
||
case value of
|
||
Nothing -> fail "Missing 'Length' key on stream"
|
||
Just (Number size) ->
|
||
line "stream" *> count (truncate size) anyChar <* eol <* line "endstream"
|
||
_ -> fail "Expected number"
|
||
|
||
occurrence :: Parser Occurrence
|
||
occurrence = Comment <$> comment <|> indirectObj
|
||
where
|
||
indirectObj =
|
||
Indirect <$> integer <*> integer <*> (line "obj"
|
||
*> object
|
||
<* eol <* line "endobj")
|
||
|
||
entry :: Parser XRefEntry
|
||
entry = do
|
||
(big, small) <- (,) <$> integer <*> integer
|
||
(inUse big small <|> free big small) <* blank
|
||
where
|
||
inUse :: Int64 -> Int -> Parser XRefEntry
|
||
inUse offset generation = char 'n' *> return (InUse {offset, generation})
|
||
free :: Int64 -> Int -> Parser XRefEntry
|
||
free nextFree generation = char 'f' *> return (Free {nextFree, generation})
|
||
|
||
xrefSubSection :: Parser XRefSubSection
|
||
xrefSubSection = do
|
||
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer
|
||
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
|
||
return $ XRefSubSection {firstObjectId, entries}
|
||
|
||
content :: Parser Content
|
||
content =
|
||
Content []
|
||
<$> (line "xref" *> xrefSubSection `sepBy` eol)
|
||
<*> (line "trailer" *> dictionary <* eol)
|
||
<*> (line "startxref" *> integer)
|