{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE NamedFieldPuns #-} module PDF ( ) where import Data.ByteString.Lazy.Char8 (ByteString) import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, pack, unpack) import Data.ByteString.Lazy.Char8.Util (previous, subBS) import Data.Int (Int64) import Data.Map (Map, lookup) import qualified Data.Map as Map (empty, fromList) import Text.Parsec import Text.Parsec.ByteString.Lazy (Parser) import Text.Parsec.Pos (newPos) import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage) data Document = Document { pdfVersion :: String , objectsById :: Map Int Object , flow :: [Occurrence] , xref :: [ByteString] , trailer :: ByteString , startXref :: Int64 } deriving Show type Dictionary = Map String DirectObject data DirectObject = Boolean Bool | Number Float | String StringObj | Name String | Array [DirectObject] | Dictionary Dictionary | Null | Reference (Int, Int) deriving Show data Object = Direct DirectObject | Stream { header :: Dictionary , content :: ByteString } deriving Show data Occurrence = Comment String | Indirect { objId :: Int , versionNumber :: Int , objectContent :: Object } deriving Show data StringObj = Literal String | Hexadecimal String deriving Show data EOLStyle = CR | LF | CRLF eolCharset :: String eolCharset = "\r\n" eol :: Parser EOLStyle eol = try (string "\r\n" >> return CRLF) <|> (string "\r" >> return CR) <|> (string "\n" >> return LF) whiteSpaceCharset :: String whiteSpaceCharset = "\0\t\12 " whiteSpace :: Parser () whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return () blank :: Parser () blank = skipMany whiteSpace delimiterCharset :: String delimiterCharset = "()<>[]{}/%" delimiter :: Parser Char delimiter = oneOf delimiterCharset regular :: Parser Char regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset int :: Parser Int int = read <$> many1 digit <* whiteSpace directObject :: Parser DirectObject directObject = Boolean <$> boolean <|> Number <$> number <|> String <$> stringObj <|> Name <$> name <|> Array <$> array <|> const Null <$> nullObject <|> Reference <$> reference boolean :: Parser Bool boolean = (string "true" *> return True) <|> (string "false" *> return False) number :: Parser Float number = read <$> (mappend <$> (mappend <$> sign <*> integerPart) <*> floatPart) where sign = string "-" <|> option "" (char '+' >> return "") integerPart = option "0" $ many1 digit floatPart = option "" $ (:) <$> char '.' <*> integerPart stringObj :: Parser StringObj stringObj = Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')') <|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>') where literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar matchingParenthesis = (++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")" escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode) octalCode = choice $ (\n -> count n octDigit) <$> [1..3] name :: Parser String name = char '/' *> many regular array :: Parser [DirectObject] array = char '[' *> directObject `sepBy` whiteSpace <* char ']' dictionary :: Parser Dictionary dictionary = string "<<" *> blank *> keyValPairs <* blank <* string ">>" where keyValPairs = Map.fromList <$> many ((,) <$> name <*> directObject) nullObject :: Parser () nullObject = string "null" *> return () comment :: Parser String comment = char '%' *> many (noneOf eolCharset) <* eol reference :: Parser (Int, Int) reference = (,) <$> int <*> int <* char 'R' object :: Parser Object object = Direct <$> directObject <|> Stream <$> dictionary <*> (BS.pack <$> stream) where stream = string "stream" *> eol *> many anyChar <* eol <* string "endstream" occurrence :: Parser Occurrence occurrence = Comment <$> comment <|> indirectObj where indirectObj = Indirect <$> int <*> int <*> (string "obj" *> eol *> object <* eol <* string "endobj") version :: Parser String version = string magicNumber *> many (noneOf eolCharset) magicNumber :: String magicNumber = "%PDF-" eofMarker :: ByteString eofMarker = "%%EOF" check :: Bool -> String -> Either ParseError () check test errorMessage = if test then return () else Left parseError where parseError = newErrorMessage (Message errorMessage) (newPos "" 0 0) readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64 readStartXref eolStyle input = check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input)) "Badly formed document : missing EOF marker at the end" >> return (read . BS.unpack $ subBS startXrefPosition startXrefLength input) where (eolOffset, eolLastByte) = case eolStyle of CRLF -> (2, '\n') CR -> (1, '\r') _ -> (1, '\n') eofMarkerPosition = BS.length input - BS.length eofMarker - if BS.last input == BS.last eofMarker then 0 else eolOffset startXrefPosition = previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1 startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition parseDocument :: ByteString -> Either ParseError Document parseDocument input = do (pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input startXref <- readStartXref eolStyle input return . fillObjects input $ Document { pdfVersion , objectsById = Map.empty , flow = [] , xref = [] , trailer = "" , startXref } fillObjects :: ByteString -> Document -> Document fillObjects input document = document