From c036334b6f3b12f2299c5787d311fe2f8cf41a71 Mon Sep 17 00:00:00 2001 From: Tissevert Date: Mon, 13 May 2019 08:05:28 +0200 Subject: [PATCH] Prototype successfully parsing (only last) startxref --- .gitignore | 2 + ChangeLog.md | 5 + LICENSE | 30 ++++ Setup.hs | 2 + hufflepdf.cabal | 28 ++++ src/Data/ByteString/Lazy/Char8/Util.hs | 17 +++ src/PDF.hs | 202 +++++++++++++++++++++++++ 7 files changed, 286 insertions(+) create mode 100644 .gitignore create mode 100644 ChangeLog.md create mode 100644 LICENSE create mode 100644 Setup.hs create mode 100644 hufflepdf.cabal create mode 100644 src/Data/ByteString/Lazy/Char8/Util.hs create mode 100644 src/PDF.hs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ebadcab --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +dist*/ +.ghc.environment.* diff --git a/ChangeLog.md b/ChangeLog.md new file mode 100644 index 0000000..0a5b001 --- /dev/null +++ b/ChangeLog.md @@ -0,0 +1,5 @@ +# Revision history for hufflepdf + +## 0.1.0.0 -- YYYY-mm-dd + +* First version. Released on an unsuspecting world. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..42f34d0 --- /dev/null +++ b/LICENSE @@ -0,0 +1,30 @@ +Copyright (c) 2019, Tissevert + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Tissevert nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Setup.hs b/Setup.hs new file mode 100644 index 0000000..9a994af --- /dev/null +++ b/Setup.hs @@ -0,0 +1,2 @@ +import Distribution.Simple +main = defaultMain diff --git a/hufflepdf.cabal b/hufflepdf.cabal new file mode 100644 index 0000000..2e29e1c --- /dev/null +++ b/hufflepdf.cabal @@ -0,0 +1,28 @@ +-- Initial hufflepdf.cabal generated by cabal init. For further +-- documentation, see http://haskell.org/cabal/users-guide/ + +name: hufflepdf +version: 0.1.0.0 +synopsis: A PDF parser +-- description: +license: BSD3 +license-file: LICENSE +author: Tissevert +maintainer: tissevert+devel@marvid.fr +-- copyright: +category: Data +build-type: Simple +extra-source-files: ChangeLog.md +cabal-version: >=1.10 + +library + exposed-modules: PDF + , Data.ByteString.Lazy.Char8.Util + other-modules: + -- other-extensions: + build-depends: base >=4.9 && <4.13 + , bytestring + , containers + , parsec + hs-source-dirs: src + default-language: Haskell2010 diff --git a/src/Data/ByteString/Lazy/Char8/Util.hs b/src/Data/ByteString/Lazy/Char8/Util.hs new file mode 100644 index 0000000..9f8c78c --- /dev/null +++ b/src/Data/ByteString/Lazy/Char8/Util.hs @@ -0,0 +1,17 @@ +module Data.ByteString.Lazy.Char8.Util ( + previous + , subBS + ) where + +import Data.ByteString.Lazy.Char8 (ByteString) +import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, pack, take) +import Data.Int (Int64) +import Prelude hiding (length) + +previous :: Char -> Int64 -> ByteString -> Int64 +previous char position byteString + | BS.index byteString position == char = position + | otherwise = previous char (position - 1) byteString + +subBS :: Int64 -> Int64 -> ByteString -> ByteString +subBS offset length = BS.take length . BS.drop offset diff --git a/src/PDF.hs b/src/PDF.hs new file mode 100644 index 0000000..e1e7421 --- /dev/null +++ b/src/PDF.hs @@ -0,0 +1,202 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE NamedFieldPuns #-} +module PDF ( + ) where + +import Data.ByteString.Lazy.Char8 (ByteString) +import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, pack, unpack) +import Data.ByteString.Lazy.Char8.Util (previous, subBS) +import Data.Int (Int64) +import Data.Map (Map, lookup) +import qualified Data.Map as Map (empty, fromList) +import Text.Parsec +import Text.Parsec.ByteString.Lazy (Parser) +import Text.Parsec.Pos (newPos) +import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage) + +data Document = Document { + pdfVersion :: String + , objectsById :: Map Int Object + , flow :: [Occurrence] + , xref :: [ByteString] + , trailer :: ByteString + , startXref :: Int64 + } deriving Show + +type Dictionary = Map String DirectObject + +data DirectObject = + Boolean Bool + | Number Float + | String StringObj + | Name String + | Array [DirectObject] + | Dictionary Dictionary + | Null + | Reference (Int, Int) + deriving Show + +data Object = + Direct DirectObject + | Stream { + header :: Dictionary + , content :: ByteString + } + deriving Show + +data Occurrence = + Comment String + | Indirect { + objId :: Int + , versionNumber :: Int + , objectContent :: Object + } + deriving Show + +data StringObj = Literal String | Hexadecimal String deriving Show + +data EOLStyle = CR | LF | CRLF + +eolCharset :: String +eolCharset = "\r\n" + +eol :: Parser EOLStyle +eol = + try (string "\r\n" >> return CRLF) + <|> (string "\r" >> return CR) + <|> (string "\n" >> return LF) + +whiteSpaceCharset :: String +whiteSpaceCharset = "\0\t\12 " + +whiteSpace :: Parser () +whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return () + +blank :: Parser () +blank = skipMany whiteSpace + +delimiterCharset :: String +delimiterCharset = "()<>[]{}/%" + +delimiter :: Parser Char +delimiter = oneOf delimiterCharset + +regular :: Parser Char +regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset + +int :: Parser Int +int = read <$> many1 digit <* whiteSpace + +directObject :: Parser DirectObject +directObject = + Boolean <$> boolean + <|> Number <$> number + <|> String <$> stringObj + <|> Name <$> name + <|> Array <$> array + <|> const Null <$> nullObject + <|> Reference <$> reference + +boolean :: Parser Bool +boolean = (string "true" *> return True) <|> (string "false" *> return False) + +number :: Parser Float +number = read <$> (mappend <$> (mappend <$> sign <*> integerPart) <*> floatPart) + where + sign = string "-" <|> option "" (char '+' >> return "") + integerPart = option "0" $ many1 digit + floatPart = option "" $ (:) <$> char '.' <*> integerPart + +stringObj :: Parser StringObj +stringObj = + Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')') + <|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>') + where + literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar + matchingParenthesis = + (++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")" + escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode) + octalCode = choice $ (\n -> count n octDigit) <$> [1..3] + +name :: Parser String +name = char '/' *> many regular + +array :: Parser [DirectObject] +array = char '[' *> directObject `sepBy` whiteSpace <* char ']' + +dictionary :: Parser Dictionary +dictionary = + string "<<" *> blank *> keyValPairs <* blank <* string ">>" + where + keyValPairs = Map.fromList <$> many ((,) <$> name <*> directObject) + +nullObject :: Parser () +nullObject = string "null" *> return () + +comment :: Parser String +comment = char '%' *> many (noneOf eolCharset) <* eol + +reference :: Parser (Int, Int) +reference = (,) <$> int <*> int <* char 'R' + +object :: Parser Object +object = + Direct <$> directObject + <|> Stream <$> dictionary <*> (BS.pack <$> stream) + where + stream = string "stream" *> eol *> many anyChar <* eol <* string "endstream" + +occurrence :: Parser Occurrence +occurrence = Comment <$> comment <|> indirectObj + where + indirectObj = + Indirect <$> int <*> int <*> (string "obj" *> eol + *> object + <* eol <* string "endobj") + +version :: Parser String +version = string magicNumber *> many (noneOf eolCharset) + +magicNumber :: String +magicNumber = "%PDF-" + +eofMarker :: ByteString +eofMarker = "%%EOF" + +check :: Bool -> String -> Either ParseError () +check test errorMessage = if test then return () else Left parseError + where + parseError = newErrorMessage (Message errorMessage) (newPos "" 0 0) + +readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64 +readStartXref eolStyle input = + check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input)) + "Badly formed document : missing EOF marker at the end" + >> return (read . BS.unpack $ subBS startXrefPosition startXrefLength input) + where + (eolOffset, eolLastByte) = case eolStyle of + CRLF -> (2, '\n') + CR -> (1, '\r') + _ -> (1, '\n') + eofMarkerPosition = + BS.length input - BS.length eofMarker + - if BS.last input == BS.last eofMarker then 0 else eolOffset + startXrefPosition = + previous eolLastByte (eofMarkerPosition - eolOffset) input + 1 + startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition + +parseDocument :: ByteString -> Either ParseError Document +parseDocument input = do + (pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input + startXref <- readStartXref eolStyle input + return . fillObjects input $ Document { + pdfVersion + , objectsById = Map.empty + , flow = [] + , xref = [] + , trailer = "" + , startXref + } + +fillObjects :: ByteString -> Document -> Document +fillObjects input document = document