Prototype successfully parsing (only last) startxref
This commit is contained in:
commit
c036334b6f
7 changed files with 286 additions and 0 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
dist*/
|
||||||
|
.ghc.environment.*
|
5
ChangeLog.md
Normal file
5
ChangeLog.md
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
# Revision history for hufflepdf
|
||||||
|
|
||||||
|
## 0.1.0.0 -- YYYY-mm-dd
|
||||||
|
|
||||||
|
* First version. Released on an unsuspecting world.
|
30
LICENSE
Normal file
30
LICENSE
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
Copyright (c) 2019, Tissevert
|
||||||
|
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above
|
||||||
|
copyright notice, this list of conditions and the following
|
||||||
|
disclaimer in the documentation and/or other materials provided
|
||||||
|
with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Tissevert nor the names of other
|
||||||
|
contributors may be used to endorse or promote products derived
|
||||||
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
2
Setup.hs
Normal file
2
Setup.hs
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
import Distribution.Simple
|
||||||
|
main = defaultMain
|
28
hufflepdf.cabal
Normal file
28
hufflepdf.cabal
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
-- Initial hufflepdf.cabal generated by cabal init. For further
|
||||||
|
-- documentation, see http://haskell.org/cabal/users-guide/
|
||||||
|
|
||||||
|
name: hufflepdf
|
||||||
|
version: 0.1.0.0
|
||||||
|
synopsis: A PDF parser
|
||||||
|
-- description:
|
||||||
|
license: BSD3
|
||||||
|
license-file: LICENSE
|
||||||
|
author: Tissevert
|
||||||
|
maintainer: tissevert+devel@marvid.fr
|
||||||
|
-- copyright:
|
||||||
|
category: Data
|
||||||
|
build-type: Simple
|
||||||
|
extra-source-files: ChangeLog.md
|
||||||
|
cabal-version: >=1.10
|
||||||
|
|
||||||
|
library
|
||||||
|
exposed-modules: PDF
|
||||||
|
, Data.ByteString.Lazy.Char8.Util
|
||||||
|
other-modules:
|
||||||
|
-- other-extensions:
|
||||||
|
build-depends: base >=4.9 && <4.13
|
||||||
|
, bytestring
|
||||||
|
, containers
|
||||||
|
, parsec
|
||||||
|
hs-source-dirs: src
|
||||||
|
default-language: Haskell2010
|
17
src/Data/ByteString/Lazy/Char8/Util.hs
Normal file
17
src/Data/ByteString/Lazy/Char8/Util.hs
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
module Data.ByteString.Lazy.Char8.Util (
|
||||||
|
previous
|
||||||
|
, subBS
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
|
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, pack, take)
|
||||||
|
import Data.Int (Int64)
|
||||||
|
import Prelude hiding (length)
|
||||||
|
|
||||||
|
previous :: Char -> Int64 -> ByteString -> Int64
|
||||||
|
previous char position byteString
|
||||||
|
| BS.index byteString position == char = position
|
||||||
|
| otherwise = previous char (position - 1) byteString
|
||||||
|
|
||||||
|
subBS :: Int64 -> Int64 -> ByteString -> ByteString
|
||||||
|
subBS offset length = BS.take length . BS.drop offset
|
202
src/PDF.hs
Normal file
202
src/PDF.hs
Normal file
|
@ -0,0 +1,202 @@
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
module PDF (
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
|
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, pack, unpack)
|
||||||
|
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
|
||||||
|
import Data.Int (Int64)
|
||||||
|
import Data.Map (Map, lookup)
|
||||||
|
import qualified Data.Map as Map (empty, fromList)
|
||||||
|
import Text.Parsec
|
||||||
|
import Text.Parsec.ByteString.Lazy (Parser)
|
||||||
|
import Text.Parsec.Pos (newPos)
|
||||||
|
import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
|
||||||
|
|
||||||
|
data Document = Document {
|
||||||
|
pdfVersion :: String
|
||||||
|
, objectsById :: Map Int Object
|
||||||
|
, flow :: [Occurrence]
|
||||||
|
, xref :: [ByteString]
|
||||||
|
, trailer :: ByteString
|
||||||
|
, startXref :: Int64
|
||||||
|
} deriving Show
|
||||||
|
|
||||||
|
type Dictionary = Map String DirectObject
|
||||||
|
|
||||||
|
data DirectObject =
|
||||||
|
Boolean Bool
|
||||||
|
| Number Float
|
||||||
|
| String StringObj
|
||||||
|
| Name String
|
||||||
|
| Array [DirectObject]
|
||||||
|
| Dictionary Dictionary
|
||||||
|
| Null
|
||||||
|
| Reference (Int, Int)
|
||||||
|
deriving Show
|
||||||
|
|
||||||
|
data Object =
|
||||||
|
Direct DirectObject
|
||||||
|
| Stream {
|
||||||
|
header :: Dictionary
|
||||||
|
, content :: ByteString
|
||||||
|
}
|
||||||
|
deriving Show
|
||||||
|
|
||||||
|
data Occurrence =
|
||||||
|
Comment String
|
||||||
|
| Indirect {
|
||||||
|
objId :: Int
|
||||||
|
, versionNumber :: Int
|
||||||
|
, objectContent :: Object
|
||||||
|
}
|
||||||
|
deriving Show
|
||||||
|
|
||||||
|
data StringObj = Literal String | Hexadecimal String deriving Show
|
||||||
|
|
||||||
|
data EOLStyle = CR | LF | CRLF
|
||||||
|
|
||||||
|
eolCharset :: String
|
||||||
|
eolCharset = "\r\n"
|
||||||
|
|
||||||
|
eol :: Parser EOLStyle
|
||||||
|
eol =
|
||||||
|
try (string "\r\n" >> return CRLF)
|
||||||
|
<|> (string "\r" >> return CR)
|
||||||
|
<|> (string "\n" >> return LF)
|
||||||
|
|
||||||
|
whiteSpaceCharset :: String
|
||||||
|
whiteSpaceCharset = "\0\t\12 "
|
||||||
|
|
||||||
|
whiteSpace :: Parser ()
|
||||||
|
whiteSpace = oneOf whiteSpaceCharset *> return () <|> eol *> return ()
|
||||||
|
|
||||||
|
blank :: Parser ()
|
||||||
|
blank = skipMany whiteSpace
|
||||||
|
|
||||||
|
delimiterCharset :: String
|
||||||
|
delimiterCharset = "()<>[]{}/%"
|
||||||
|
|
||||||
|
delimiter :: Parser Char
|
||||||
|
delimiter = oneOf delimiterCharset
|
||||||
|
|
||||||
|
regular :: Parser Char
|
||||||
|
regular = noneOf $ eolCharset ++ whiteSpaceCharset ++ delimiterCharset
|
||||||
|
|
||||||
|
int :: Parser Int
|
||||||
|
int = read <$> many1 digit <* whiteSpace
|
||||||
|
|
||||||
|
directObject :: Parser DirectObject
|
||||||
|
directObject =
|
||||||
|
Boolean <$> boolean
|
||||||
|
<|> Number <$> number
|
||||||
|
<|> String <$> stringObj
|
||||||
|
<|> Name <$> name
|
||||||
|
<|> Array <$> array
|
||||||
|
<|> const Null <$> nullObject
|
||||||
|
<|> Reference <$> reference
|
||||||
|
|
||||||
|
boolean :: Parser Bool
|
||||||
|
boolean = (string "true" *> return True) <|> (string "false" *> return False)
|
||||||
|
|
||||||
|
number :: Parser Float
|
||||||
|
number = read <$> (mappend <$> (mappend <$> sign <*> integerPart) <*> floatPart)
|
||||||
|
where
|
||||||
|
sign = string "-" <|> option "" (char '+' >> return "")
|
||||||
|
integerPart = option "0" $ many1 digit
|
||||||
|
floatPart = option "" $ (:) <$> char '.' <*> integerPart
|
||||||
|
|
||||||
|
stringObj :: Parser StringObj
|
||||||
|
stringObj =
|
||||||
|
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
|
||||||
|
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
|
||||||
|
where
|
||||||
|
literalStringBlock = many (noneOf "\\(") <|> matchingParenthesis <|> escapedChar
|
||||||
|
matchingParenthesis =
|
||||||
|
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
|
||||||
|
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
|
||||||
|
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
|
||||||
|
|
||||||
|
name :: Parser String
|
||||||
|
name = char '/' *> many regular
|
||||||
|
|
||||||
|
array :: Parser [DirectObject]
|
||||||
|
array = char '[' *> directObject `sepBy` whiteSpace <* char ']'
|
||||||
|
|
||||||
|
dictionary :: Parser Dictionary
|
||||||
|
dictionary =
|
||||||
|
string "<<" *> blank *> keyValPairs <* blank <* string ">>"
|
||||||
|
where
|
||||||
|
keyValPairs = Map.fromList <$> many ((,) <$> name <*> directObject)
|
||||||
|
|
||||||
|
nullObject :: Parser ()
|
||||||
|
nullObject = string "null" *> return ()
|
||||||
|
|
||||||
|
comment :: Parser String
|
||||||
|
comment = char '%' *> many (noneOf eolCharset) <* eol
|
||||||
|
|
||||||
|
reference :: Parser (Int, Int)
|
||||||
|
reference = (,) <$> int <*> int <* char 'R'
|
||||||
|
|
||||||
|
object :: Parser Object
|
||||||
|
object =
|
||||||
|
Direct <$> directObject
|
||||||
|
<|> Stream <$> dictionary <*> (BS.pack <$> stream)
|
||||||
|
where
|
||||||
|
stream = string "stream" *> eol *> many anyChar <* eol <* string "endstream"
|
||||||
|
|
||||||
|
occurrence :: Parser Occurrence
|
||||||
|
occurrence = Comment <$> comment <|> indirectObj
|
||||||
|
where
|
||||||
|
indirectObj =
|
||||||
|
Indirect <$> int <*> int <*> (string "obj" *> eol
|
||||||
|
*> object
|
||||||
|
<* eol <* string "endobj")
|
||||||
|
|
||||||
|
version :: Parser String
|
||||||
|
version = string magicNumber *> many (noneOf eolCharset)
|
||||||
|
|
||||||
|
magicNumber :: String
|
||||||
|
magicNumber = "%PDF-"
|
||||||
|
|
||||||
|
eofMarker :: ByteString
|
||||||
|
eofMarker = "%%EOF"
|
||||||
|
|
||||||
|
check :: Bool -> String -> Either ParseError ()
|
||||||
|
check test errorMessage = if test then return () else Left parseError
|
||||||
|
where
|
||||||
|
parseError = newErrorMessage (Message errorMessage) (newPos "" 0 0)
|
||||||
|
|
||||||
|
readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64
|
||||||
|
readStartXref eolStyle input =
|
||||||
|
check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
|
||||||
|
"Badly formed document : missing EOF marker at the end"
|
||||||
|
>> return (read . BS.unpack $ subBS startXrefPosition startXrefLength input)
|
||||||
|
where
|
||||||
|
(eolOffset, eolLastByte) = case eolStyle of
|
||||||
|
CRLF -> (2, '\n')
|
||||||
|
CR -> (1, '\r')
|
||||||
|
_ -> (1, '\n')
|
||||||
|
eofMarkerPosition =
|
||||||
|
BS.length input - BS.length eofMarker
|
||||||
|
- if BS.last input == BS.last eofMarker then 0 else eolOffset
|
||||||
|
startXrefPosition =
|
||||||
|
previous eolLastByte (eofMarkerPosition - eolOffset) input + 1
|
||||||
|
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
|
||||||
|
|
||||||
|
parseDocument :: ByteString -> Either ParseError Document
|
||||||
|
parseDocument input = do
|
||||||
|
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
|
||||||
|
startXref <- readStartXref eolStyle input
|
||||||
|
return . fillObjects input $ Document {
|
||||||
|
pdfVersion
|
||||||
|
, objectsById = Map.empty
|
||||||
|
, flow = []
|
||||||
|
, xref = []
|
||||||
|
, trailer = ""
|
||||||
|
, startXref
|
||||||
|
}
|
||||||
|
|
||||||
|
fillObjects :: ByteString -> Document -> Document
|
||||||
|
fillObjects input document = document
|
Loading…
Reference in a new issue