Prototype successfully parsing (only last) startxref

2019-05-13 08:05:28 +02:00 · 2019-05-13 08:05:28 +02:00 · c036334b6f
commit c036334b6f
7 changed files with 286 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+dist*/
+.ghc.environment.*
--- a/ChangeLog.md
+++ b/ChangeLog.md
@ -0,0 +1,5 @@
+# Revision history for hufflepdf
+
+## 0.1.0.0  -- YYYY-mm-dd
+
+* First version. Released on an unsuspecting world.
--- a/LICENSE
+++ b/LICENSE
@ -0,0 +1,30 @@
+Copyright (c) 2019, Tissevert
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * Neither the name of Tissevert nor the names of other
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/Setup.hs
+++ b/Setup.hs
@ -0,0 +1,2 @@
+import Distribution.Simple
+-- | Entry point of the package's build script: hands control over to
+-- 'defaultMain' from "Distribution.Simple".
+main :: IO ()
+main = defaultMain
--- a/hufflepdf.cabal
+++ b/hufflepdf.cabal
@ -0,0 +1,28 @@
+-- Initial hufflepdf.cabal generated by cabal init.  For further 
+-- documentation, see http://haskell.org/cabal/users-guide/
+
+name:                hufflepdf
+version:             0.1.0.0
+synopsis:            A PDF parser
+-- description:         
+license:             BSD3
+license-file:        LICENSE
+author:              Tissevert
+maintainer:          tissevert+devel@marvid.fr
+-- copyright:           
+category:            Data
+build-type:          Simple
+extra-source-files:  ChangeLog.md
+cabal-version:       >=1.10
+
+library
+  exposed-modules:     PDF
+                     , Data.ByteString.Lazy.Char8.Util
+  other-modules:       
+  -- other-extensions:    
+  build-depends:       base >=4.9 && <4.13
+                     , bytestring
+                     , containers
+                     , parsec
+  hs-source-dirs:      src
+  default-language:    Haskell2010
--- a/src/Data/ByteString/Lazy/Char8/Util.hs
+++ b/src/Data/ByteString/Lazy/Char8/Util.hs
@ -0,0 +1,17 @@
+module Data.ByteString.Lazy.Char8.Util (
+      previous
+    , subBS
+  ) where
+
+import Data.ByteString.Lazy.Char8 (ByteString)
+import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, pack, take)
+import Data.Int (Int64)
+import Prelude hiding (length)
+
+-- | Search backwards in @byteString@ for @char@, starting at index
+-- @position@ (inclusive), and return the index of the first match found.
+--
+-- When @char@ does not occur at or before @position@ the result is @-1@
+-- rather than a crash on a negative index; @previous c p bs + 1@ is then
+-- the very start of the input. The caller must still keep @position@ below
+-- the length of @byteString@, since 'BS.index' rejects larger indices.
+previous :: Char -> Int64 -> ByteString -> Int64
+previous char position byteString
+  | position < 0 = -1
+  | BS.index byteString position == char = position
+  | otherwise = previous char (position - 1) byteString
+
+-- | Extract the slice made of (at most) @length@ bytes found @offset@ bytes
+-- into the given ByteString.
+subBS :: Int64 -> Int64 -> ByteString -> ByteString
+subBS offset length byteString = BS.take length (BS.drop offset byteString)
--- a/src/PDF.hs
+++ b/src/PDF.hs
@ -0,0 +1,202 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE NamedFieldPuns #-}
+module PDF (
+  ) where
+
+import Data.ByteString.Lazy.Char8 (ByteString)
+import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, pack, unpack)
+import Data.ByteString.Lazy.Char8.Util (previous, subBS)
+import Data.Int (Int64)
+import Data.Map (Map, lookup)
+import qualified Data.Map as Map (empty, fromList)
+import Text.Parsec
+import Text.Parsec.ByteString.Lazy (Parser)
+import Text.Parsec.Pos (newPos)
+import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
+
+data Document = Document { -- Result record built by parseDocument
+      pdfVersion :: String -- ^ text following the "%PDF-" magic number on the first line
+    , objectsById :: Map Int Object -- ^ objects indexed by an Int key; parseDocument leaves it empty for now
+    , flow :: [Occurrence] -- ^ list of occurrences; parseDocument leaves it empty for now
+    , xref :: [ByteString] -- ^ parseDocument leaves it empty for now
+    , trailer :: ByteString -- ^ parseDocument leaves it empty for now
+    , startXref :: Int64 -- ^ number read by readStartXref on the line above the final "%%EOF"
+  } deriving Show
+
+type Dictionary = Map String DirectObject -- Key/value map; the 'dictionary' parser uses parsed names as keys
+
+data DirectObject = -- Values an object can hold directly
+    Boolean Bool -- ^ from the keywords "true" / "false" (see 'boolean')
+  | Number Float -- ^ numeric value, stored as a Float (see 'number')
+  | String StringObj -- ^ literal or hexadecimal string (see 'stringObj')
+  | Name String -- ^ text following a '/' (see 'name')
+  | Array [DirectObject] -- ^ objects listed between '[' and ']' (see 'array')
+  | Dictionary Dictionary -- ^ key/value map (see the 'Dictionary' alias)
+  | Null -- ^ the "null" keyword (see 'nullObject')
+  | Reference (Int, Int) -- ^ two integers followed by 'R' (see 'reference')
+  deriving Show
+
+data Object = -- What the 'object' parser yields
+    Direct DirectObject -- ^ a plain direct object
+  | Stream { -- a dictionary paired with raw bytes
+        header :: Dictionary -- ^ dictionary parsed before the "stream" keyword
+      , content :: ByteString -- ^ bytes read after the "stream" keyword
+    }
+  deriving Show
+
+data Occurrence = -- One item recognized by the 'occurrence' parser
+    Comment String -- ^ text of a '%' comment line, without the '%' and the EOL
+  | Indirect { -- an "obj" ... "endobj" block
+        objId :: Int -- ^ first integer read before the "obj" keyword
+      , versionNumber :: Int -- ^ second integer read before the "obj" keyword
+      , objectContent :: Object -- ^ object parsed between "obj" and "endobj"
+    }
+  deriving Show
+
+data StringObj = Literal String | Hexadecimal String deriving Show -- Literal: written between parentheses; Hexadecimal: written between '<' and '>'
+
+data EOLStyle = CR | LF | CRLF -- End-of-line conventions returned by 'eol': "\r", "\n" or "\r\n"
+
+-- | Characters that can take part in an end-of-line marker.
+eolCharset :: String
+eolCharset = ['\r', '\n']
+
+-- | Recognize one end-of-line marker and report which style it uses.
+-- The two-character form is attempted first, under 'try', so that a lone
+-- carriage return is still accepted when no line feed follows it.
+eol :: Parser EOLStyle
+eol =
+    try (CRLF <$ string "\r\n")
+  <|> (CR <$ string "\r")
+  <|> (LF <$ string "\n")
+
+-- | White-space characters other than end-of-line ones: NUL, horizontal
+-- tab, form feed (decimal code 12) and space.
+whiteSpaceCharset :: String
+whiteSpaceCharset = ['\NUL', '\t', '\FF', ' ']
+
+-- | Consume exactly one white-space item: either a character from
+-- 'whiteSpaceCharset' or a whole end-of-line marker.
+whiteSpace :: Parser ()
+whiteSpace = (() <$ oneOf whiteSpaceCharset) <|> (() <$ eol)
+
+-- | Skip a possibly empty run of white space.
+blank :: Parser ()
+blank = skipMany whiteSpace
+
+-- | Characters treated as delimiters by the parsers of this module.
+delimiterCharset :: String
+delimiterCharset = ['(', ')', '<', '>', '[', ']', '{', '}', '/', '%']
+
+-- | Accept a single delimiter character.
+delimiter :: Parser Char
+delimiter = oneOf delimiterCharset
+
+-- | Accept a single character that is neither an end-of-line character,
+-- nor white space, nor a delimiter.
+regular :: Parser Char
+regular = noneOf (concat [eolCharset, whiteSpaceCharset, delimiterCharset])
+
+-- | Parse a non-empty run of decimal digits as an 'Int', then consume the
+-- single white-space item that must follow it.
+int :: Parser Int
+int = do
+  digits <- many1 digit
+  whiteSpace
+  return (read digits)
+
+-- | Parse any direct object.
+--
+-- The order of the alternatives matters:
+--
+-- * 'reference' starts with digits just like 'number', so it is tried
+--   first and wrapped in 'try' to backtrack when no @R@ keyword follows;
+-- * 'dictionary' starts with the same @<@ as a hexadecimal string, so it is
+--   tried (with backtracking) before 'stringObj';
+-- * 'number' comes last so that it can never shadow another alternative.
+directObject :: Parser DirectObject
+directObject =
+    Boolean <$> try boolean
+  <|> Reference <$> try reference
+  <|> Dictionary <$> try dictionary
+  <|> String <$> stringObj
+  <|> Name <$> name
+  <|> Array <$> array
+  <|> const Null <$> try nullObject
+  <|> Number <$> number
+
+-- | Parse one of the keywords @true@ and @false@ into the matching 'Bool'.
+boolean :: Parser Bool
+boolean = (True <$ string "true") <|> (False <$ string "false")
+
+-- | Parse a number with an optional sign, such as @12@, @-3.5@, @4.@ or
+-- @+.25@.
+--
+-- At least one digit is required, either before or after the dot, so this
+-- parser fails (instead of returning 0) on input that is not a number. The
+-- text is completed with a @0@ on the empty side of the dot before being
+-- handed to 'read', which only understands digits on both sides.
+number :: Parser Float
+number = read <$> (mappend <$> sign <*> (leadingDigits <|> leadingDot))
+  where
+    -- A '+' sign is dropped because 'read' does not accept it.
+    sign = string "-" <|> option "" (char '+' >> return "")
+    digits = many1 digit
+    -- "12", "12." or "12.5"
+    leadingDigits = mappend <$> digits <*> option "" fractionalPart
+    fractionalPart = char '.' *> (('.' :) <$> option "0" digits)
+    -- ".5"
+    leadingDot = char '.' *> (("0." ++) <$> digits)
+
+-- | Parse a string object, either literal (between parentheses) or
+-- hexadecimal (between @<@ and @>@).
+--
+-- In a literal string, balanced parentheses and backslash escapes are kept
+-- verbatim in the result.
+stringObj :: Parser StringObj
+stringObj =
+    Literal <$> (char '(' *> literalString <* char ')')
+  <|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
+  where
+    literalString = concat <$> many literalStringBlock
+    -- 'many1' is required here: a block accepting the empty string would
+    -- make the surrounding 'many' fail at run time. The closing parenthesis
+    -- is excluded too so that it is left for the enclosing parser.
+    literalStringBlock =
+      many1 (noneOf "\\()") <|> matchingParenthesis <|> escapedChar
+    -- A nested pair of parentheses may itself hold any number of blocks.
+    matchingParenthesis =
+      (\inner -> '(' : inner ++ ")") <$> (char '(' *> literalString <* char ')')
+    escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
+    octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
+
+-- | Parse a name: a @/@ followed by a possibly empty run of 'regular'
+-- characters. The leading slash is not part of the result.
+name :: Parser String
+name = do
+  _ <- char '/'
+  many regular
+
+-- | Parse the direct objects listed between square brackets, each one
+-- separated from the next by a single white-space item.
+array :: Parser [DirectObject]
+array = between (char '[') (char ']') (directObject `sepBy` whiteSpace)
+
+-- | Parse a dictionary: name/value pairs enclosed between @<<@ and @>>@.
+--
+-- White space is skipped after the opening marker, after each key and after
+-- each value, so entries such as @/Type /Catalog@ may be laid out freely,
+-- including one per line.
+dictionary :: Parser Dictionary
+dictionary =
+  string "<<" *> blank *> keyValPairs <* blank <* string ">>"
+  where
+    keyValPairs = Map.fromList <$> many keyValPair
+    keyValPair = (,) <$> (name <* blank) <*> (directObject <* blank)
+
+-- | Recognize the @null@ keyword.
+nullObject :: Parser ()
+nullObject = () <$ string "null"
+
+-- | Parse a comment line: a @%@, then everything up to the end of the line.
+-- The result holds neither the @%@ nor the end-of-line marker, which is
+-- consumed.
+comment :: Parser String
+comment = do
+  _ <- char '%'
+  text <- many (noneOf eolCharset)
+  _ <- eol
+  return text
+
+-- | Parse a reference: two integers (see 'int') followed by the letter @R@.
+-- The pair of integers is returned in reading order.
+reference :: Parser (Int, Int)
+reference = do
+  firstNumber <- int
+  secondNumber <- int
+  _ <- char 'R'
+  return (firstNumber, secondNumber)
+
+-- | Parse the content of an indirect object: either a stream (a dictionary
+-- followed by raw data between the @stream@ and @endstream@ keywords) or a
+-- direct object.
+--
+-- The stream alternative is tried first, with backtracking, because it
+-- starts with a dictionary.
+object :: Parser Object
+object =
+    try (Stream <$> (dictionary <* blank) <*> (BS.pack <$> stream))
+  <|> Direct <$> directObject
+  where
+    -- The data stops at the first end-of-line marker directly followed by
+    -- "endstream"; a greedy 'many anyChar' would swallow the keyword too.
+    -- NOTE(review): the dictionary's declared length is not consulted, so
+    -- data containing that exact sequence would be cut short.
+    stream = string "stream" *> eol *> manyTill anyChar endOfStream
+    endOfStream = try (eol *> string "endstream")
+
+-- | Parse one occurrence: either a comment line or an indirect object, that
+-- is two integers, the @obj@ keyword, an end of line, an object, another
+-- end of line and the @endobj@ keyword.
+occurrence :: Parser Occurrence
+occurrence = (Comment <$> comment) <|> indirectObj
+  where
+    indirectObj = do
+      identifier <- int
+      revision <- int
+      _ <- string "obj"
+      _ <- eol
+      body <- object
+      _ <- eol
+      _ <- string "endobj"
+      return (Indirect identifier revision body)
+
+-- | Parse the header line of a document: the magic number, then the version
+-- text running up to (and excluding) the end of the line.
+version :: Parser String
+version = do
+  _ <- string magicNumber
+  many (noneOf eolCharset)
+
+-- | Prefix expected at the very start of a document.
+magicNumber :: String
+magicNumber = "%PDF-"
+
+-- | Marker expected at the very end of a document.
+eofMarker :: ByteString
+eofMarker = BS.pack "%%EOF"
+
+-- | Turn a boolean test into an 'Either': succeed with @()@ when the test
+-- holds, otherwise fail with a 'ParseError' carrying @errorMessage@ (at a
+-- dummy source position, since no parser was running).
+check :: Bool -> String -> Either ParseError ()
+check test errorMessage
+  | test = Right ()
+  | otherwise = Left (newErrorMessage (Message errorMessage) (newPos "" 0 0))
+
+-- | Read the decimal number written on the line just above the final
+-- @%%EOF@ marker of @input@.
+--
+-- @eolStyle@ is the end-of-line convention assumed around the marker. The
+-- input must end with @%%EOF@, optionally followed by one end-of-line
+-- marker of that style. A 'ParseError' is returned when the marker is
+-- missing or when the line above it does not hold a number.
+--
+-- NOTE(review): this relies on 'previous' finding an end-of-line byte
+-- somewhere before the number.
+readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64
+readStartXref eolStyle input =
+  check (eofMarker `BS.isPrefixOf` BS.drop eofMarkerPosition input)
+    "Badly formed document : missing EOF marker at the end"
+  -- Same acceptance rule as 'read' (trailing white space allowed), but a
+  -- malformed number yields an error value instead of an exception.
+  >> case [value | (value, rest) <- reads offsetText, ("", "") <- lex rest] of
+       [value] -> return value
+       _ -> Left $ newErrorMessage
+         (Message "Badly formed document : invalid startxref offset")
+         (newPos "" 0 0)
+  where
+    (eolOffset, eolLastByte) = case eolStyle of
+      CRLF -> (2, '\n')
+      CR -> (1, '\r')
+      _ -> (1, '\n')
+    -- 'BS.last' is undefined on empty input, hence the length test.
+    endsWithMarker = BS.length input > 0 && BS.last input == BS.last eofMarker
+    eofMarkerPosition =
+      BS.length input - BS.length eofMarker
+      - if endsWithMarker then 0 else eolOffset
+    -- Index of the first byte of the end-of-line marker preceding "%%EOF",
+    -- i.e. one past the last byte of the number.
+    lineEnd = eofMarkerPosition - eolOffset
+    -- The backward search starts on the last byte of the number: starting
+    -- at 'lineEnd' would immediately match the end-of-line marker that
+    -- terminates the number when that marker is a single byte.
+    startXrefPosition = previous eolLastByte (lineEnd - 1) input + 1
+    startXrefLength = lineEnd - startXrefPosition
+    offsetText = BS.unpack $ subBS startXrefPosition startXrefLength input
+
+-- | Parse a whole document held in memory.
+--
+-- The version and the end-of-line style are read from the first line, then
+-- the number above the final @%%EOF@ marker is read with 'readStartXref'.
+-- Every other field starts empty and the record is passed to 'fillObjects'.
+parseDocument :: ByteString -> Either ParseError Document
+parseDocument input = do
+  (pdfVersion, eolStyle) <- parse firstLine "" input
+  startXref <- readStartXref eolStyle input
+  let emptyDocument = Document {
+            pdfVersion
+          , objectsById = Map.empty
+          , flow = []
+          , xref = []
+          , trailer = ""
+          , startXref
+        }
+  return (fillObjects input emptyDocument)
+  where
+    firstLine = (,) <$> version <*> eol
+
+-- | Placeholder: the raw input is ignored for now and the document is
+-- returned unchanged.
+fillObjects :: ByteString -> Document -> Document
+fillObjects _ = id