diff --git a/hufflepdf.cabal b/hufflepdf.cabal index 2e29e1c..c7f4485 100644 --- a/hufflepdf.cabal +++ b/hufflepdf.cabal @@ -17,6 +17,7 @@ cabal-version: >=1.10 library exposed-modules: PDF + , PDF.Object , Data.ByteString.Lazy.Char8.Util other-modules: -- other-extensions: @@ -25,4 +26,5 @@ library , containers , parsec hs-source-dirs: src + ghc-options: -Wall default-language: Haskell2010 diff --git a/src/Data/ByteString/Lazy/Char8/Util.hs b/src/Data/ByteString/Lazy/Char8/Util.hs index 9f8c78c..831be25 100644 --- a/src/Data/ByteString/Lazy/Char8/Util.hs +++ b/src/Data/ByteString/Lazy/Char8/Util.hs @@ -4,7 +4,7 @@ module Data.ByteString.Lazy.Char8.Util ( ) where import Data.ByteString.Lazy.Char8 (ByteString) -import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, pack, take) +import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take) import Data.Int (Int64) import Prelude hiding (length) diff --git a/src/PDF.hs b/src/PDF.hs index 0f18b5f..f28847e 100644 --- a/src/PDF.hs +++ b/src/PDF.hs @@ -1,14 +1,18 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE NamedFieldPuns #-} module PDF ( + parseDocument + , Document(..) + , Content(..) + , DirectObject(..) ) where import Data.ByteString.Lazy.Char8 (ByteString) -import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, pack, unpack) +import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, unpack) import Data.ByteString.Lazy.Char8.Util (previous, subBS) import Data.Int (Int64) -import Data.Map (Map, lookup) -import qualified Data.Map as Map (empty, fromList) +import qualified Data.Map as Map (lookup) +import PDF.Object (Content(..), DirectObject(..), EOLStyle(..), content, eol, eolCharset) import Text.Parsec import Text.Parsec.ByteString.Lazy (Parser) import Text.Parsec.Pos (newPos) @@ -16,157 +20,24 @@ import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage) data Document = Document { pdfVersion :: String - , objectsById :: Map Int Object - , flow :: [Occurrence] - , xref :: [ByteString] - , trailer :: ByteString - , startXref :: Int64 + , contents :: [Content] } deriving Show -type Dictionary = Map String DirectObject - -data DirectObject = - Boolean Bool - | Number Float - | String StringObj - | Name String - | Array [DirectObject] - | Dictionary Dictionary - | Null - | Reference (Int, Int) - deriving Show - -data Object = - Direct DirectObject - | Stream { - header :: Dictionary - , content :: ByteString - } - deriving Show - -data Occurrence = - Comment String - | Indirect { - objId :: Int - , versionNumber :: Int - , objectContent :: Object - } - deriving Show - -data StringObj = Literal String | Hexadecimal String deriving Show - -data EOLStyle = CR | LF | CRLF - -eolCharset :: String -eolCharset = "\r\n" - -eol :: Parser EOLStyle -eol = - try (string "\r\n" >> return CRLF) - <|> (string "\r" >> return CR) - <|> (string "\n" >> return LF) - -whiteSpaceCharset :: String -whiteSpaceCharset = "\0\t\12 " - -whiteSpace :: Parser () -whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return () - -blank :: Parser () -blank = skipMany whiteSpace - -delimiterCharset :: String -delimiterCharset = "()<>[]{}/%" - -delimiter :: Parser Char -delimiter = oneOf delimiterCharset - -regular :: Parser Char -regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset - -int :: Parser Int -int = read <$> many1 digit <* whiteSpace - -directObject :: Parser DirectObject -directObject = - Boolean <$> boolean - <|> Number <$> number - <|> String <$> stringObj - <|> Name <$> name - <|> Array <$> array - <|> const Null <$> nullObject - <|> Reference <$> reference - -boolean :: Parser Bool -boolean = (string "true" *> return True) <|> (string "false" *> return False) - -number :: Parser Float -number = read <$> (mappend <$> (mappend <$> sign <*> integerPart) <*> floatPart) - where - sign = string "-" <|> option "" (char '+' >> return "") - integerPart = option "0" $ many1 digit - floatPart = option "" $ (:) <$> char '.' <*> integerPart - -stringObj :: Parser StringObj -stringObj = - Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')') - <|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>') - where - literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar - matchingParenthesis = - (++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")" - escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode) - octalCode = choice $ (\n -> count n octDigit) <$> [1..3] - -name :: Parser String -name = char '/' *> many regular - -array :: Parser [DirectObject] -array = char '[' *> directObject `sepBy` whiteSpace <* char ']' - -dictionary :: Parser Dictionary -dictionary = - string "<<" *> blank *> keyValPairs <* blank <* string ">>" - where - keyValPairs = Map.fromList <$> many ((,) <$> name <*> directObject) - -nullObject :: Parser () -nullObject = string "null" *> return () - -comment :: Parser String -comment = char '%' *> many (noneOf eolCharset) <* eol - -reference :: Parser (Int, Int) -reference = (,) <$> int <*> int <* char 'R' - -object :: Parser Object -object = - Direct <$> directObject - <|> Stream <$> dictionary <*> (BS.pack <$> stream) - where - stream = string "stream" *> eol *> many anyChar <* eol <* string "endstream" - -occurrence :: Parser Occurrence -occurrence = Comment <$> comment <|> indirectObj - where - indirectObj = - Indirect <$> int <*> int <*> (string "obj" *> eol - *> object - <* eol <* string "endobj") - -version :: Parser String -version = string magicNumber *> many (noneOf eolCharset) - magicNumber :: String magicNumber = "%PDF-" +version :: Parser String +version = string magicNumber *> many (noneOf eolCharset) + eofMarker :: ByteString eofMarker = "%%EOF" +parseError :: String -> Either ParseError a +parseError errorMessage = + Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0) + check :: Bool -> String -> Either ParseError () -check test errorMessage = if test then return () else Left parseError - where - parseError = newErrorMessage (Message errorMessage) (newPos "" 0 0) +check test errorMessage = if test then return () else parseError errorMessage readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64 readStartXref eolStyle input = @@ -189,14 +60,16 @@ parseDocument :: ByteString -> Either ParseError Document parseDocument input = do (pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input startXref <- readStartXref eolStyle input - return . fillObjects input $ Document { - pdfVersion - , objectsById = Map.empty - , flow = [] - , xref = [] - , trailer = "" - , startXref - } + contents <- iterateContents startXref input + return $ Document {pdfVersion, contents} + +iterateContents :: Int64 -> ByteString -> Either ParseError [Content] +iterateContents startXref input = + parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow + where + stopOrFollow c@(Content {trailer}) = + case Map.lookup "Prev" trailer of + Nothing -> Right [c] + Just (Number f) -> (c:) <$> (iterateContents (truncate f) input) + Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v -fillObjects :: ByteString -> Document -> Document -fillObjects input document = document diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs new file mode 100644 index 0000000..aea9297 --- /dev/null +++ b/src/PDF/Object.hs @@ -0,0 +1,211 @@ +{-# LANGUAGE NamedFieldPuns #-} +module PDF.Object ( + Content(..) + , DirectObject(..) + , EOLStyle(..) + , Object(..) + , Occurrence(..) + , XRefEntry(..) + , XRefSubSection(..) + , content + , eol + , eolCharset + , occurrence + ) where + +import Data.ByteString.Lazy.Char8 (ByteString) +import qualified Data.ByteString.Lazy.Char8 as BS (pack) +import Data.Int (Int64) +import Data.Map (Map) +import qualified Data.Map as Map (fromList, lookup) +import Text.Parsec +import Text.Parsec.ByteString.Lazy (Parser) + +data EOLStyle = CR | LF | CRLF + +type Dictionary = Map String DirectObject + +data StringObj = Literal String | Hexadecimal String deriving Show + +data DirectObject = + Boolean Bool + | Number Float + | String StringObj + | Name String + | Array [DirectObject] + | Dictionary Dictionary + | Null + | Reference (Int, Int) + deriving Show + +data Object = + Direct DirectObject + | Stream { + header :: Dictionary + , streamContent :: ByteString + } + deriving Show + +data Occurrence = + Comment String + | Indirect { + objId :: Int + , versionNumber :: Int + , objectContent :: Object + } + deriving Show + +data XRefEntry = InUse { + offset :: Int64 + , generation :: Int + } | Free { + nextFree :: Int64 + , generation :: Int + } deriving Show + +data XRefSubSection = XRefSubSection { + firstObjectId :: Int + , entries :: Map Int XRefEntry + } deriving Show + +data Content = Content { + body :: [Occurrence] + , xrefSection :: [XRefSubSection] + , trailer :: Dictionary + , startXrefPosition :: Int64 + } deriving Show + +eolCharset :: String +eolCharset = "\r\n" + +eol :: Parser EOLStyle +eol = + try (string "\r\n" >> return CRLF) + <|> (string "\r" >> return CR) + <|> (string "\n" >> return LF) + +line :: String -> Parser () +line l = string l *> eol *> return () + +whiteSpaceCharset :: String +whiteSpaceCharset = "\0\t\12 " + +whiteSpace :: Parser () +whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return () + +blank :: Parser () +blank = skipMany whiteSpace + +delimiterCharset :: String +delimiterCharset = "()<>[]{}/%" + +{- +delimiter :: Parser Char +delimiter = oneOf delimiterCharset +-} + +regular :: Parser Char +regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset + +integer :: (Read a, Num a) => Parser a +integer = read <$> many1 digit <* whiteSpace + +directObject :: Parser DirectObject +directObject = + Boolean <$> try boolean + <|> Reference <$> try reference {- defined before Number because Number is a prefix of it -} + <|> Number <$> try number + <|> String <$> try stringObj + <|> Name <$> try name + <|> Array <$> try array + <|> Dictionary <$> try dictionary + <|> const Null <$> try nullObject + +boolean :: Parser Bool +boolean = (string "true" *> return True) <|> (string "false" *> return False) + +number :: Parser Float +number = read <$> (mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart)) + where + sign = string "-" <|> option "" (char '+' >> return "") + integerPart = mappend <$> many1 digit <*> option "" floatPart + floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit) + +stringObj :: Parser StringObj +stringObj = + Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')') + <|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>') + where + literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar + matchingParenthesis = + (++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")" + escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode) + octalCode = choice $ (\n -> count n octDigit) <$> [1..3] + +name :: Parser String +name = char '/' *> many regular + +array :: Parser [DirectObject] +array = char '[' *> blank *> directObject `endBy` blank <* char ']' + +dictionary :: Parser Dictionary +dictionary = + try (string "<<" *> blank *> keyValPairs <* string ">>") + where + keyVal = (,) <$> name <* blank <*> directObject + keyValPairs = Map.fromList <$> keyVal `endBy` blank + +nullObject :: Parser () +nullObject = string "null" *> return () + +comment :: Parser String +comment = char '%' *> many (noneOf eolCharset) <* eol + +reference :: Parser (Int, Int) +reference = (,) <$> integer <*> integer <* char 'R' + +object :: Parser Object +object = + Direct <$> directObject + <|> do + header <- dictionary + streamContent <- BS.pack <$> stream (Map.lookup "Length" header) + return $ Stream header streamContent + where + stream value = + case value of + Nothing -> fail "Missing 'Length' key on stream" + Just (Number size) -> + line "stream" *> count (truncate size) anyChar <* eol <* line "endstream" + _ -> fail "Expected number" + +occurrence :: Parser Occurrence +occurrence = Comment <$> comment <|> indirectObj + where + indirectObj = + Indirect <$> integer <*> integer <*> (line "obj" + *> object + <* eol <* line "endobj") + +entry :: Parser XRefEntry +entry = do + (big, small) <- (,) <$> integer <*> integer + (inUse big small <|> free big small) <* blank + where + inUse :: Int64 -> Int -> Parser XRefEntry + inUse offset generation = char 'n' *> return (InUse {offset, generation}) + free :: Int64 -> Int -> Parser XRefEntry + free nextFree generation = char 'f' *> return (Free {nextFree, generation}) + +xrefSubSection :: Parser XRefSubSection +xrefSubSection = do + (firstObjectId, entriesNumber) <- (,) <$> integer <*> integer + entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry + return $ XRefSubSection {firstObjectId, entries} + +content :: Parser Content +content = + Content [] + <$> (line "xref" *> xrefSubSection `sepBy` eol) + <*> (line "trailer" *> dictionary <* eol) + <*> (line "startxref" *> integer)