Prototype successfully parsing (only last) startxref

This commit is contained in:
Tissevert 2019-05-13 08:05:28 +02:00
commit c036334b6f
7 changed files with 286 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
dist*/
.ghc.environment.*

5
ChangeLog.md Normal file
View File

@ -0,0 +1,5 @@
# Revision history for hufflepdf
## 0.1.0.0 -- YYYY-mm-dd
* First version. Released on an unsuspecting world.

30
LICENSE Normal file
View File

@ -0,0 +1,30 @@
Copyright (c) 2019, Tissevert
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Tissevert nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

2
Setup.hs Normal file
View File

@ -0,0 +1,2 @@
import Distribution.Simple
main = defaultMain

28
hufflepdf.cabal Normal file
View File

@ -0,0 +1,28 @@
-- Initial hufflepdf.cabal generated by cabal init. For further
-- documentation, see http://haskell.org/cabal/users-guide/
name: hufflepdf
version: 0.1.0.0
synopsis: A PDF parser
-- description:
license: BSD3
license-file: LICENSE
author: Tissevert
maintainer: tissevert+devel@marvid.fr
-- copyright:
category: Data
build-type: Simple
extra-source-files: ChangeLog.md
cabal-version: >=1.10
library
exposed-modules: PDF
, Data.ByteString.Lazy.Char8.Util
other-modules:
-- other-extensions:
build-depends: base >=4.9 && <4.13
, bytestring
, containers
, parsec
hs-source-dirs: src
default-language: Haskell2010

View File

@ -0,0 +1,17 @@
module Data.ByteString.Lazy.Char8.Util (
previous
, subBS
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, pack, take)
import Data.Int (Int64)
import Prelude hiding (length)
previous :: Char -> Int64 -> ByteString -> Int64
previous char position byteString
| BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString
subBS :: Int64 -> Int64 -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset

202
src/PDF.hs Normal file
View File

@ -0,0 +1,202 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NamedFieldPuns #-}
module PDF (
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, pack, unpack)
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
import Data.Int (Int64)
import Data.Map (Map, lookup)
import qualified Data.Map as Map (empty, fromList)
import Text.Parsec
import Text.Parsec.ByteString.Lazy (Parser)
import Text.Parsec.Pos (newPos)
import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
data Document = Document {
pdfVersion :: String
, objectsById :: Map Int Object
, flow :: [Occurrence]
, xref :: [ByteString]
, trailer :: ByteString
, startXref :: Int64
} deriving Show
type Dictionary = Map String DirectObject
data DirectObject =
Boolean Bool
| Number Float
| String StringObj
| Name String
| Array [DirectObject]
| Dictionary Dictionary
| Null
| Reference (Int, Int)
deriving Show
data Object =
Direct DirectObject
| Stream {
header :: Dictionary
, content :: ByteString
}
deriving Show
data Occurrence =
Comment String
| Indirect {
objId :: Int
, versionNumber :: Int
, objectContent :: Object
}
deriving Show
data StringObj = Literal String | Hexadecimal String deriving Show
data EOLStyle = CR | LF | CRLF
eolCharset :: String
eolCharset = "\r\n"
eol :: Parser EOLStyle
eol =
try (string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR)
<|> (string "\n" >> return LF)
whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 "
whiteSpace :: Parser ()
whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return ()
blank :: Parser ()
blank = skipMany whiteSpace
delimiterCharset :: String
delimiterCharset = "()<>[]{}/%"
delimiter :: Parser Char
delimiter = oneOf delimiterCharset
regular :: Parser Char
regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset
int :: Parser Int
int = read <$> many1 digit <* whiteSpace
directObject :: Parser DirectObject
directObject =
Boolean <$> boolean
<|> Number <$> number
<|> String <$> stringObj
<|> Name <$> name
<|> Array <$> array
<|> const Null <$> nullObject
<|> Reference <$> reference
boolean :: Parser Bool
boolean = (string "true" *> return True) <|> (string "false" *> return False)
number :: Parser Float
number = read <$> (mappend <$> (mappend <$> sign <*> integerPart) <*> floatPart)
where
sign = string "-" <|> option "" (char '+' >> return "")
integerPart = option "0" $ many1 digit
floatPart = option "" $ (:) <$> char '.' <*> integerPart
stringObj :: Parser StringObj
stringObj =
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
where
literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar
matchingParenthesis =
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
name :: Parser String
name = char '/' *> many regular
array :: Parser [DirectObject]
array = char '[' *> directObject `sepBy` whiteSpace <* char ']'
dictionary :: Parser Dictionary
dictionary =
string "<<" *> blank *> keyValPairs <* blank <* string ">>"
where
keyValPairs = Map.fromList <$> many ((,) <$> name <*> directObject)
nullObject :: Parser ()
nullObject = string "null" *> return ()
comment :: Parser String
comment = char '%' *> many (noneOf eolCharset) <* eol
reference :: Parser (Int, Int)
reference = (,) <$> int <*> int <* char 'R'
object :: Parser Object
object =
Direct <$> directObject
<|> Stream <$> dictionary <*> (BS.pack <$> stream)
where
stream = string "stream" *> eol *> many anyChar <* eol <* string "endstream"
occurrence :: Parser Occurrence
occurrence = Comment <$> comment <|> indirectObj
where
indirectObj =
Indirect <$> int <*> int <*> (string "obj" *> eol
*> object
<* eol <* string "endobj")
version :: Parser String
version = string magicNumber *> many (noneOf eolCharset)
magicNumber :: String
magicNumber = "%PDF-"
eofMarker :: ByteString
eofMarker = "%%EOF"
check :: Bool -> String -> Either ParseError ()
check test errorMessage = if test then return () else Left parseError
where
parseError = newErrorMessage (Message errorMessage) (newPos "" 0 0)
readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64
readStartXref eolStyle input =
check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
"Badly formed document : missing EOF marker at the end"
>> return (read . BS.unpack $ subBS startXrefPosition startXrefLength input)
where
(eolOffset, eolLastByte) = case eolStyle of
CRLF -> (2, '\n')
CR -> (1, '\r')
_ -> (1, '\n')
eofMarkerPosition =
BS.length input - BS.length eofMarker
- if BS.last input == BS.last eofMarker then 0 else eolOffset
startXrefPosition =
previous eolLastByte (eofMarkerPosition - eolOffset) input + 1
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
parseDocument :: ByteString -> Either ParseError Document
parseDocument input = do
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
startXref <- readStartXref eolStyle input
return . fillObjects input $ Document {
pdfVersion
, objectsById = Map.empty
, flow = []
, xref = []
, trailer = ""
, startXref
}
fillObjects :: ByteString -> Document -> Document
fillObjects input document = document