Prototype successfully parsing (only last) startxref

2019-05-13 08:05:28 +02:00 · 2019-05-13 08:05:28 +02:00 · c036334b6f
commit c036334b6f
7 changed files with 286 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
 dist*/
 .ghc.environment.*
--- a/ChangeLog.md
+++ b/ChangeLog.md
@ -0,0 +1,5 @@
 # Revision history for hufflepdf
 ## 0.1.0.0  -- YYYY-mm-dd
 * First version. Released on an unsuspecting world.
--- a/30
+++ b/30
@ -0,0 +1,30 @@
 Copyright (c) 2019, Tissevert
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the following
      disclaimer in the documentation and/or other materials provided
      with the distribution.
    * Neither the name of Tissevert nor the names of other
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/Setup.hs
+++ b/Setup.hs
@ -0,0 +1,2 @@
 import Distribution.Simple
 main = defaultMain
--- a/hufflepdf.cabal
+++ b/hufflepdf.cabal
@ -0,0 +1,28 @@
 -- Initial hufflepdf.cabal generated by cabal init.  For further 
 -- documentation, see http://haskell.org/cabal/users-guide/
 name:                hufflepdf
 version:             0.1.0.0
 synopsis:            A PDF parser
 -- description:         
 license:             BSD3
 license-file:        LICENSE
 author:              Tissevert
 maintainer:          tissevert+devel@marvid.fr
 -- copyright:           
 category:            Data
 build-type:          Simple
 extra-source-files:  ChangeLog.md
 cabal-version:       >=1.10
 library
  exposed-modules:     PDF
                     , Data.ByteString.Lazy.Char8.Util
  other-modules:       
  -- other-extensions:    
  build-depends:       base >=4.9 && <4.13
                     , bytestring
                     , containers
                     , parsec
  hs-source-dirs:      src
  default-language:    Haskell2010
--- a/src/Data/ByteString/Lazy/Char8/Util.hs
+++ b/src/Data/ByteString/Lazy/Char8/Util.hs
@ -0,0 +1,17 @@
 module Data.ByteString.Lazy.Char8.Util (
      previous
    , subBS
  ) where
 import Data.ByteString.Lazy.Char8 (ByteString)
 import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, pack, take)
 import Data.Int (Int64)
 import Prelude hiding (length)
 previous :: Char -> Int64 -> ByteString -> Int64
 previous char position byteString
  | BS.index byteString position == char = position
  | otherwise = previous char (position - 1) byteString
 subBS :: Int64 -> Int64 -> ByteString -> ByteString
 subBS offset length = BS.take length . BS.drop offset
--- a/src/PDF.hs
+++ b/src/PDF.hs
@ -0,0 +1,202 @@
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE NamedFieldPuns #-}
 module PDF (
  ) where
 import Data.ByteString.Lazy.Char8 (ByteString)
 import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, pack, unpack)
 import Data.ByteString.Lazy.Char8.Util (previous, subBS)
 import Data.Int (Int64)
 import Data.Map (Map, lookup)
 import qualified Data.Map as Map (empty, fromList)
 import Text.Parsec
 import Text.Parsec.ByteString.Lazy (Parser)
 import Text.Parsec.Pos (newPos)
 import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
 data Document = Document {
      pdfVersion :: String
    , objectsById :: Map Int Object
    , flow :: [Occurrence]
    , xref :: [ByteString]
    , trailer :: ByteString
    , startXref :: Int64
  } deriving Show
 type Dictionary = Map String DirectObject
 data DirectObject =
    Boolean Bool
  | Number Float
  | String StringObj
  | Name String
  | Array [DirectObject]
  | Dictionary Dictionary
  | Null
  | Reference (Int, Int)
  deriving Show
 data Object =
    Direct DirectObject
  | Stream {
        header :: Dictionary
      , content :: ByteString
    }
  deriving Show
 data Occurrence =
    Comment String
  | Indirect {
        objId :: Int
      , versionNumber :: Int
      , objectContent :: Object
    }
  deriving Show
 data StringObj = Literal String | Hexadecimal String deriving Show
 data EOLStyle = CR | LF | CRLF
 eolCharset :: String
 eolCharset = "\r\n"
 eol :: Parser EOLStyle
 eol =
    try (string "\r\n" >> return CRLF)
  <|> (string "\r" >> return CR)
  <|> (string "\n" >> return LF)
 whiteSpaceCharset :: String
 whiteSpaceCharset = "\0\t\12 "
 whiteSpace :: Parser ()
 whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return ()
 blank :: Parser ()
 blank = skipMany whiteSpace
 delimiterCharset :: String
 delimiterCharset = "()<>[]{}/%"
 delimiter :: Parser Char
 delimiter = oneOf delimiterCharset
 regular :: Parser Char
 regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset
 int :: Parser Int
 int = read <$> many1 digit <* whiteSpace
 directObject :: Parser DirectObject
 directObject =
    Boolean <$> boolean
  <|> Number <$> number
  <|> String <$> stringObj
  <|> Name <$> name
  <|> Array <$> array
  <|> const Null <$> nullObject
  <|> Reference <$> reference
 boolean :: Parser Bool
 boolean = (string "true" *> return True) <|> (string "false" *> return False)
 number :: Parser Float
 number = read <$> (mappend <$> (mappend <$> sign <*> integerPart) <*> floatPart)
  where
    sign = string "-" <|> option "" (char '+' >> return "")
    integerPart = option "0" $ many1 digit
    floatPart = option "" $ (:) <$> char '.' <*> integerPart
 stringObj :: Parser StringObj
 stringObj =
    Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
  <|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
  where
    literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar
    matchingParenthesis =
      (++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
    escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
    octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
 name :: Parser String
 name = char '/' *> many regular
 array :: Parser [DirectObject]
 array = char '[' *> directObject `sepBy` whiteSpace <* char ']'
 dictionary :: Parser Dictionary
 dictionary =
  string "<<" *> blank *> keyValPairs <* blank <* string ">>"
  where
    keyValPairs = Map.fromList <$> many ((,) <$> name <*> directObject)
 nullObject :: Parser ()
 nullObject = string "null" *> return ()
 comment :: Parser String
 comment = char '%' *> many (noneOf eolCharset) <* eol
 reference :: Parser (Int, Int)
 reference = (,) <$> int <*> int <* char 'R'
 object :: Parser Object
 object =
    Direct <$> directObject
  <|> Stream <$> dictionary <*> (BS.pack <$> stream)
  where
    stream = string "stream" *> eol *> many anyChar <* eol <* string "endstream"
 occurrence :: Parser Occurrence
 occurrence = Comment <$> comment <|> indirectObj
  where
    indirectObj =
      Indirect <$> int <*> int <*> (string "obj" *> eol
      *> object
      <* eol <* string "endobj")
 version :: Parser String
 version = string magicNumber *> many (noneOf eolCharset)
 magicNumber :: String
 magicNumber = "%PDF-"
 eofMarker :: ByteString
 eofMarker = "%%EOF"
 check :: Bool -> String -> Either ParseError ()
 check test errorMessage = if test then return () else Left parseError
  where
    parseError = newErrorMessage (Message errorMessage) (newPos "" 0 0)
 readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64
 readStartXref eolStyle input =
  check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
    "Badly formed document : missing EOF marker at the end"
  >> return (read . BS.unpack $ subBS startXrefPosition startXrefLength input)
  where
    (eolOffset, eolLastByte) = case eolStyle of
      CRLF -> (2, '\n')
      CR -> (1, '\r')
      _ -> (1, '\n')
    eofMarkerPosition =
      BS.length input - BS.length eofMarker
      - if BS.last input == BS.last eofMarker then 0 else eolOffset
    startXrefPosition =
      previous eolLastByte (eofMarkerPosition - eolOffset) input + 1
    startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
 parseDocument :: ByteString -> Either ParseError Document
 parseDocument input = do
  (pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
  startXref <- readStartXref eolStyle input
  return . fillObjects input $ Document {
        pdfVersion
      , objectsById = Map.empty
      , flow = []
      , xref = []
      , trailer = ""
      , startXref
    }
 fillObjects :: ByteString -> Document -> Document
 fillObjects input document = document
		`@ -0,0 +1,2 @@`
							`import Distribution.Simple`
							`main = defaultMain`