Compare commits

...

14 Commits

Author SHA1 Message Date
Tissevert 7eca875900 Improve getObj example to catch no-existing ObjectId and default to listing existing ObjectIds when none is provided 2019-11-29 11:53:08 +01:00
Tissevert 380c1e439b Fix a bug preventing Hufflepdf from reading objects with a ' ' after the `obj` keyword 2019-11-27 18:01:19 +01:00
Tissevert d6994f0813 Release 0.2.0.0 2019-10-14 10:16:14 +02:00
Tissevert 68f90d20e2 Implement PDF's multilayer updates and use it in getObj to display only the current version of the object taken into account instead of the concatenation of all its versions 2019-09-22 01:40:39 +02:00
Tissevert 3a39c75e6a Stop requiring an empty line between subsections in a xref section 2019-09-22 01:37:28 +02:00
Tissevert 29c5823f34 Fix precision bug caused by using Floats to represent PDF Number values sometimes used to represent a byte offset within a file 2019-09-22 01:34:17 +02:00
Tissevert 9ab010de61 Add to example programs to show how the lib can be used 2019-09-20 22:42:17 +02:00
Tissevert 699f830a45 Simplify XRef structure, clarify integer types and remove nextLine 2019-09-20 22:39:14 +02:00
Tissevert dd79cb3fc7 Release bugfix v0.1.1.1 2019-05-31 15:16:23 +02:00
Tissevert 264b0dc92b Stop requiring «trailer» keywords to live on a separate line as counter-examples have been found 2019-05-31 15:08:54 +02:00
Tissevert 9dac275f68 Keep comment-opening '%' along with the comment and support empty lines 2019-05-31 15:07:41 +02:00
Tissevert 85e4eb9273 Fix bypassed error message for lines + add one for occurrences 2019-05-31 15:06:20 +02:00
Tissevert 11cb6504d7 Go strict ByteStrings with attoparsec 2019-05-24 10:48:09 +02:00
Tissevert 0daa03d958 Remove commented out dead code 2019-05-21 09:07:37 +02:00
13 changed files with 530 additions and 257 deletions

View File

@ -1,5 +1,24 @@
# Revision history for Hufflepdf
## 0.2.0.1 -- 2019-11-27
* Fix bug discovered while running Hufflepdf on a PDF output from pdftk : magic keywords like `obj`, `stream` or `xref` can have spaces after them before the EOL
## 0.2.0.0 -- 2019-10-14
* Implement PDF's multilayer update mechanism
## 0.1.1.1 -- 2019-05-31
* Fix a bug preventing files with the «trailer» keyword on the same line as the dictionary that follows it to be parsed
* Improve error messages
* Support empty lines as comments
## 0.1.1.0 -- 2019-05-23
* Rewrite the parser using attoparsec instead of parsec (about 1.5x faster)
* Use strict ByteStrings instead of lazy ones for an additional gain in performance, simpler type interface (lazy ByteStrings are now used only for output because that's required by the ByteString Builder) and fewer import lines
## 0.1.0.0 -- 2019-05-18
* First version. Released on an unsuspecting world.

View File

@ -2,7 +2,7 @@
-- documentation, see http://haskell.org/cabal/users-guide/
name: Hufflepdf
version: 0.1.0.0
version: 0.2.0.1
synopsis: A PDF parser
-- description:
license: BSD3
@ -17,17 +17,37 @@ cabal-version: >=1.10
library
exposed-modules: PDF
, PDF.Object
other-modules: PDF.Body
, PDF.EOL
, PDF.Object
, PDF.Output
, Data.ByteString.Lazy.Char8.Util
, PDF.Update
other-modules: Data.ByteString.Char8.Util
, PDF.Body
, PDF.Parser
-- other-extensions:
build-depends: base >=4.9 && <4.13
build-depends: attoparsec
, base >=4.9 && <4.13
, bytestring
, containers
, mtl
, parsec
hs-source-dirs: src
ghc-options: -Wall
default-language: Haskell2010
executable equivalent
main-is: examples/equivalent.hs
build-depends: base
, bytestring
, Hufflepdf
ghc-options: -Wall
default-language: Haskell2010
executable getObj
main-is: examples/getObj.hs
build-depends: base
, bytestring
, containers
, Hufflepdf
, zlib
ghc-options: -Wall
default-language: Haskell2010

13
examples/equivalent.hs Normal file
View File

@ -0,0 +1,13 @@
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (writeFile)
import PDF (parseDocument, render)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
main :: IO ()
main = do
[inputFile, outputFile] <- getArgs
result <- parseDocument <$> BS.readFile inputFile
case result of
Left parseError -> hPutStrLn stderr parseError
Right doc -> Lazy.writeFile outputFile $ render doc

53
examples/getObj.hs Normal file
View File

@ -0,0 +1,53 @@
{-# LANGUAGE NamedFieldPuns #-}
import Codec.Compression.Zlib (decompress)
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
import Data.Map ((!?))
import qualified Data.Map as Map (keys, lookup)
import PDF (Document(..), parseDocument)
import qualified PDF.EOL as EOL (Style)
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
import PDF.Output (ObjectId(..))
import qualified PDF.Output as Output (render)
import PDF.Update (unify)
import System.Environment (getArgs, getProgName)
import System.Exit (die)
import Text.Printf (printf)
display :: EOL.Style -> Object -> ByteString
display eolStyle d@(Direct _) = Output.render eolStyle d
display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) -> Stream {
header
, streamContent = Lazy.toStrict . decompress $ Lazy.fromStrict streamContent
}
_ -> s
extractObject :: ObjectId -> Document -> Either String ByteString
extractObject objectId (Document {eolStyle, updates}) =
case objects content !? objectId of
Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
Just o -> Right $ display eolStyle o
where
content = unify updates
listObjectIds :: Document -> Either String [String]
listObjectIds =
Right . prependTitle . toString . Map.keys . objects . unify . updates
where
toString = fmap (show . getObjectId)
prependTitle = ("ObjectIds defined in this PDF:":)
main :: IO ()
main = do
(inputFile, getData) <- parse =<< getArgs
input <- BS.readFile inputFile
either die id $ (parseDocument input >>= getData)
where
parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
parse [inputFile, objectId] = return
(inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName

View File

@ -0,0 +1,16 @@
module Data.ByteString.Char8.Util (
previous
, subBS
) where
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (drop, index, take)
import Prelude hiding (length)
previous :: Char -> Int -> ByteString -> Int
previous char position byteString
| BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString
subBS :: Int -> Int -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset

View File

@ -1,17 +0,0 @@
module Data.ByteString.Lazy.Char8.Util (
previous
, subBS
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
import Data.Int (Int64)
import Prelude hiding (length)
previous :: Char -> Int64 -> ByteString -> Int64
previous char position byteString
| BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString
subBS :: Int64 -> Int64 -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset

View File

@ -5,12 +5,12 @@ module PDF (
, render
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (
drop, findIndex, head, isPrefixOf, last, length, span, unpack
)
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
import Data.Int (Int64)
import Data.ByteString.Char8.Util (previous, subBS)
import qualified Data.ByteString.Lazy as Lazy (ByteString)
import qualified Data.Map as Map (lookup)
import PDF.Body (populate)
import qualified PDF.EOL as EOL (Style(..), charset, parser)
@ -19,39 +19,33 @@ import PDF.Object (
, Structure(..)
, eofMarker, magicNumber, structure
)
import qualified PDF.Output as Output (render, string)
import PDF.Output (Output(..), nextLine)
import Text.Parsec
import Text.Parsec.ByteString.Lazy (Parser)
import Text.Parsec.Pos (newPos)
import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
import qualified PDF.Output as Output (render, line)
import PDF.Output (Output(..))
import PDF.Parser (Parser, runParser, string, takeAll)
import Text.Printf (printf)
data Document = Document {
pdfVersion :: String
, eolStyle :: EOL.Style
, contents :: [Content]
, updates :: [Content]
} deriving Show
instance Output Document where
output (Document {pdfVersion, contents}) =
Output.string (printf "%%PDF-%s" pdfVersion)
`nextLine` output contents
output (Document {pdfVersion, updates}) =
Output.line (printf "%%PDF-%s" pdfVersion)
`mappend` output updates
render :: Document -> ByteString
render :: Document -> Lazy.ByteString
render document@(Document {eolStyle}) = Output.render eolStyle document
version :: Parser String
version = string magicNumber *> many (noneOf EOL.charset)
version :: Parser () String
version = BS.unpack <$>
(string magicNumber *> takeAll (not . (`elem` EOL.charset)))
parseError :: String -> Either ParseError a
parseError errorMessage =
Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0)
check :: Bool -> String -> Either String ()
check test errorMessage = if test then return () else Left errorMessage
check :: Bool -> String -> Either ParseError ()
check test errorMessage = if test then return () else parseError errorMessage
readStartXref :: EOL.Style -> ByteString -> Either ParseError Int64
readStartXref :: EOL.Style -> ByteString -> Either String Int
readStartXref eolStyle input =
check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
"Badly formed document : missing EOF marker at the end"
@ -68,7 +62,7 @@ readStartXref eolStyle input =
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
findNextLine :: ByteString -> Int64
findNextLine :: ByteString -> Int
findNextLine input =
let (line, eolPrefixed) = BS.span notInEol input in
let nextNotInEol = BS.findIndex notInEol eolPrefixed in
@ -76,7 +70,7 @@ findNextLine input =
where
notInEol = not . (`elem` EOL.charset)
findNextSection :: Int64 -> ByteString -> Int64
findNextSection :: Int -> ByteString -> Int
findNextSection offset input =
case BS.findIndex (== BS.head eofMarker) input of
Nothing -> 0
@ -87,9 +81,9 @@ findNextSection offset input =
then newOffset + findNextLine newInput
else findNextSection (newOffset + 1) (BS.drop 1 newInput)
readStructures :: Int64 -> ByteString -> Either ParseError [InputStructure]
readStructures :: Int -> ByteString -> Either String [InputStructure]
readStructures startXref input =
parse structure ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
runParser structure () (BS.drop startXref input) >>= stopOrFollow
where
stopOrFollow s@(Structure {trailer}) =
case Map.lookup (Name "Prev") trailer of
@ -98,12 +92,12 @@ readStructures startXref input =
let offset = truncate newStartXref in
let startOffset = findNextSection offset (BS.drop offset input) in
(InputStructure startOffset s:) <$> (readStructures offset input)
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
Just v -> Left $ "Bad value for Prev entry in trailer: " ++ show v
parseDocument :: ByteString -> Either ParseError Document
parseDocument :: ByteString -> Either String Document
parseDocument input = do
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> EOL.parser) "" input
(pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
startXref <- readStartXref eolStyle input
structuresRead <- readStructures startXref input
let contents = populate input <$> structuresRead
return $ Document {pdfVersion, eolStyle, contents}
let updates = populate input <$> structuresRead
return $ Document {pdfVersion, eolStyle, updates}

View File

@ -1,21 +1,24 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
module PDF.Body (
populate
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
import Data.Int (Int64)
import Control.Applicative ((<|>))
import Control.Monad.State (get, gets, modify)
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
import Data.Map ((!))
import qualified Data.Map as Map (empty, insert, lookup)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Object (
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
, Parser, Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..)
, Structure(..), XRefEntry(..), XRefSection
, blank, dictionary, directObject, integer, line
)
import Text.Parsec
import PDF.Output (ObjectId(..), Offset(..))
import PDF.Parser (Parser, (<?>), block, char, on, option, runParser, takeAll)
data UserState = UserState {
input :: ByteString
@ -26,9 +29,9 @@ data UserState = UserState {
type SParser = Parser UserState
modifyFlow :: (Flow -> Flow) -> SParser ()
modifyFlow f = modifyState $ \state -> state {flow = f $ flow state}
modifyFlow f = modify $ \state -> state {flow = f $ flow state}
addObject :: Int -> Object -> SParser ()
addObject :: ObjectId -> Object -> SParser ()
addObject objectId newObject = modifyFlow $ \flow -> flow {
tmpObjects = Map.insert objectId newObject $ tmpObjects flow
}
@ -39,39 +42,27 @@ pushOccurrence newOccurrence = modifyFlow $ \flow -> flow {
}
comment :: Parser u String
comment = char '%' *> many (noneOf EOL.charset) <* EOL.parser
comment = BS.unpack <$> (option "" afterPercent <* EOL.parser)
where
afterPercent = BS.cons <$> char '%' <*> takeAll (not . (`elem` EOL.charset))
lookupOffset :: Int -> XRefSection -> Maybe Int64
lookupOffset _ [] = Nothing
lookupOffset objectId (xrefSubSection:others) =
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
let index = objectId - firstObjectId in
if index >= 0 && index < entriesNumber
then
case Map.lookup index entries of
Just (InUse {offset}) -> Just offset
_ -> Nothing
else lookupOffset objectId others
getOffset :: Int -> SParser Int64
getOffset objectId = do
table <- xreferences <$> getState
case lookupOffset objectId table of
lookupOffset :: ObjectId -> SParser Offset
lookupOffset objectId = do
table <- gets xreferences
case Map.lookup objectId table >>= entryOffset of
Nothing -> fail $
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
Just offset -> return offset
where
entryOffset (InUse {offset}) = Just offset
entryOffset _ = Nothing
on :: Monad m => ParsecT s u m a -> s -> ParsecT s u m a
on parser input = do
originalInput <- getInput
setInput input >> parser <* setInput originalInput
loadNumber :: Int -> SParser Float
loadNumber :: ObjectId -> SParser Double
loadNumber objectId = do
offset <- getOffset objectId
objectStart <- BS.drop offset . input <$> getState
indirectObjCoordinates `on` objectStart >> return ()
objectValue <- (!objectId) . tmpObjects . flow <$> getState
offset <- getOffset <$> lookupOffset objectId
objectStart <- BS.drop offset <$> gets input
indirectObjCoordinates `on` (objectStart :: ByteString) >> return ()
objectValue <- (!objectId) . tmpObjects <$> gets flow
case objectValue of
Direct (NumberObject (Number n)) -> return n
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
@ -79,11 +70,11 @@ loadNumber objectId = do
invalidValue :: Object -> String
invalidValue v = "Invalid value " ++ show v
getSize :: Maybe DirectObject -> SParser Float
getSize :: Maybe DirectObject -> SParser Double
getSize Nothing = fail "Missing '/Length' key on stream"
getSize (Just (NumberObject (Number size))) = return size
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
Flow {tmpObjects} <- flow <$> getState
Flow {tmpObjects} <- gets flow
case Map.lookup objectId tmpObjects of
Nothing -> loadNumber objectId
Just (Direct (NumberObject (Number size))) -> return size
@ -92,32 +83,33 @@ getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
streamObject :: SParser Object
streamObject = try $ do
streamObject = do
header <- dictionary <* blank
size <- getSize (Map.lookup (Name "Length") header)
streamContent <- BS.pack <$> stream (truncate size)
streamContent <- stream (truncate size)
return $ Stream {header, streamContent}
where
stream size = line "stream" *> count size anyChar <* blank <* line "endstream"
stream size = line "stream" *> block size <* blank <* line "endstream"
object :: SParser Object
object = streamObject <|> Direct <$> directObject
indirectObjCoordinates :: SParser IndirectObjCoordinates
indirectObjCoordinates = do
objectId <- integer
objectId <- ObjectId <$> integer
coordinates <- IndirectObjCoordinates objectId <$> integer
objectValue <- line "obj" *> object <* blank <* line "endobj"
addObject objectId objectValue
return coordinates
occurrence :: SParser Occurrence
occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates
occurrence =
Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
populate :: ByteString -> InputStructure -> Content
populate input structure =
let bodyInput = BS.drop (startOffset structure) input in
case runParser recurseOnOccurrences initialState "" bodyInput of
case runParser recurseOnOccurrences initialState bodyInput of
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
Right finalState ->
let Flow {occurrencesStack, tmpObjects} = flow finalState in
@ -126,7 +118,7 @@ populate input structure =
}
where
docStructure = inputStructure structure
xreferences = xrefSection docStructure
xreferences = xRef docStructure
initialState = UserState {
input, xreferences, flow = Flow {
occurrencesStack = [], tmpObjects = Map.empty
@ -135,4 +127,4 @@ populate input structure =
recurseOnOccurrences :: SParser UserState
recurseOnOccurrences =
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> get

View File

@ -1,19 +1,20 @@
{-# LANGUAGE OverloadedStrings #-}
module PDF.EOL (
Style(..)
, charset
, parser
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import Text.Parsec ((<|>), Parsec, string, try)
import Control.Applicative ((<|>))
import PDF.Parser (Parser, string)
data Style = CR | LF | CRLF deriving Show
charset :: String
charset = "\r\n"
parser :: Parsec ByteString u Style
parser :: Parser s Style
parser =
try (string "\r\n" >> return CRLF)
(string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR)
<|> (string "\n" >> return LF)

View File

@ -5,17 +5,16 @@ module PDF.Object (
Content(..)
, DirectObject(..)
, Flow(..)
, IndexedObjects
, IndirectObjCoordinates(..)
, InputStructure(..)
, Name(..)
, Number(..)
, Object(..)
, Occurrence(..)
, Parser
, Structure(..)
, XRefEntry(..)
, XRefSection
, XRefSubSection(..)
, blank
, dictionary
, directObject
@ -26,25 +25,33 @@ module PDF.Object (
, structure
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import Data.Int (Int64)
import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (elems, fromList, toList)
import qualified PDF.EOL as EOL (charset, parser)
import qualified PDF.Output as Output (string)
import PDF.Output (
OBuilder, Offset(..), Output(..)
, byteString, getOffsets, join, newLine, nextLine, saveOffset
import Control.Applicative ((<|>))
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (
concat, cons, pack, singleton, unpack
)
import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union
)
import qualified PDF.EOL as EOL (charset, parser)
import qualified PDF.Output as Output (concat, line, string)
import PDF.Output (
OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
, byteString, getObjectId, getOffset, getOffsets, join, newLine
, saveOffset
)
import PDF.Parser (
Parser, (<?>)
, char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
, sepBy, string, takeAll, takeAll1
)
import Text.Parsec
import Text.Printf (printf)
type Parser u = Parsec ByteString u
line :: String -> Parser u ()
line l = string l *> EOL.parser *> return ()
line l = (string (BS.pack l) *> blank *> return ()) <?> printf "line «%s»" l
magicNumber :: String
magicNumber :: ByteString
magicNumber = "%PDF-"
eofMarker :: ByteString
@ -53,35 +60,35 @@ eofMarker = "%%EOF"
whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 "
whiteSpace :: Parser u ()
whiteSpace = oneOf whiteSpaceCharset *> return () <|> EOL.parser *> return ()
blank :: Parser u ()
blank = skipMany whiteSpace
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
delimiterCharset :: String
delimiterCharset = "()<>[]{}/%"
regular :: Parser u Char
regular = noneOf $ EOL.charset ++ whiteSpaceCharset ++ delimiterCharset
regular :: Char -> Bool
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
integer :: (Read a, Num a) => Parser u a
integer = read <$> many1 digit <* whiteSpace
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
-------------------------------------
-- OBJECTS
-------------------------------------
type IndexedObjects = Map ObjectId Object
--
-- Boolean
--
boolean :: Parser u Bool
boolean = (string "true" *> return True) <|> (string "false" *> return False)
boolean =
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
--
-- Number
--
newtype Number = Number Float deriving Show
newtype Number = Number Double deriving Show
instance Output Number where
output (Number f) = Output.string $
@ -90,12 +97,13 @@ instance Output Number where
_ -> printf "%f" f
number :: Parser u Number
number = Number . read <$>
(mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart))
number = Number . read . BS.unpack <$>
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
<?> "number"
where
sign = string "-" <|> option "" (char '+' >> return "")
integerPart = mappend <$> many1 digit <*> option "" floatPart
floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit)
integerPart = mappend <$> decNumber <*> option "" floatPart
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
--
-- StringObject
@ -108,14 +116,18 @@ instance Output StringObject where
stringObject :: Parser u StringObject
stringObject =
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
<?> "string object (literal or hexadecimal)"
where
literalStringBlock = many1 (noneOf "\\()") <|> matchingParenthesis <|> escapedChar
literalString = many literalStringBlock
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
normalChar = not . (`elem` ("\\()" :: String))
matchingParenthesis =
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar =
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
--
-- Name
@ -123,16 +135,17 @@ stringObject =
newtype Name = Name String deriving (Eq, Ord, Show)
instance Output Name where
output (Name n) = "/" `mappend` Output.string n
output (Name n) = Output.string ('/':n)
name :: Parser u Name
name = Name <$> (char '/' *> many regular)
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
--
-- Array
--
array :: Parser u [DirectObject]
array = char '[' *> blank *> directObject `endBy` blank <* char ']'
array =
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
--
-- Dictionary
@ -145,31 +158,32 @@ instance Output Dictionary where
where
keyValues = join " " $ outputKeyVal <$> Map.toList dict
outputKeyVal :: (Name, DirectObject) -> OBuilder
outputKeyVal (key, val) = output key `mappend` " " `mappend` output val
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
dictionary :: Parser u Dictionary
dictionary =
try (string "<<" *> blank *> keyValPairs <* string ">>")
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
where
keyVal = (,) <$> name <* blank <*> directObject
keyValPairs = Map.fromList <$> keyVal `endBy` blank
keyValPairs = Map.fromList <$> keyVal `sepBy` blank <* blank
--
-- Null
--
nullObject :: Parser u ()
nullObject = string "null" *> return ()
nullObject = string "null" *> return () <?> "null object"
--
-- Reference
--
data IndirectObjCoordinates = IndirectObjCoordinates {
objectId :: Int
objectId :: ObjectId
, versionNumber :: Int
} deriving Show
reference :: Parser u IndirectObjCoordinates
reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R'
reference = IndirectObjCoordinates
<$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
--
-- DirectObject
@ -190,22 +204,23 @@ instance Output DirectObject where
output (NumberObject n) = output n
output (StringObject s) = output s
output (NameObject n) = output n
output (Array a) = "[" `mappend` join " " a `mappend` "]"
output (Array a) = Output.concat ["[", join " " a, "]"]
output (Dictionary d) = output d
output (Null) = "null"
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
Output.string (printf "%d %d R" objectId versionNumber)
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
directObject :: Parser u DirectObject
directObject =
Boolean <$> try boolean
<|> Reference <$> try reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> try number
<|> StringObject <$> try stringObject
<|> NameObject <$> try name
<|> Array <$> try array
<|> Dictionary <$> try dictionary
<|> const Null <$> try nullObject
Boolean <$> boolean
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> number
<|> StringObject <$> stringObject
<|> NameObject <$> name
<|> Array <$> array
<|> Dictionary <$> dictionary
<|> const Null <$> nullObject
<?> "direct object"
--
-- Object
@ -220,26 +235,26 @@ data Object =
instance Output Object where
output (Direct d) = output d
output (Stream {header, streamContent}) =
output header
`nextLine` "stream"
`nextLine` byteString streamContent
`mappend` "endstream"
output (Stream {header, streamContent}) = Output.concat [
output header, newLine
, Output.line "stream"
, byteString streamContent
, "endstream"
]
--
-- Occurrence
--
data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
outputOccurrence :: Map Int Object -> Occurrence -> OBuilder
outputOccurrence _ (Comment c) =
Output.string (printf "%%%s" c) `mappend` newLine
outputOccurrence :: IndexedObjects -> Occurrence -> OBuilder
outputOccurrence _ (Comment c) = Output.line c
outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNumber})) =
saveOffset (ObjectId objectId)
>> Output.string (printf "%d %d obj" objectId versionNumber)
`nextLine` output (objects ! objectId)
`nextLine` "endobj" `mappend` newLine
saveOffset (Object objectId) >> Output.concat [
Output.line (printf "%d %d obj" (getObjectId objectId) versionNumber)
, output (objects ! objectId), newLine
, Output.line "endobj"
]
-------------------------------------
-- XREF TABLE
@ -249,89 +264,107 @@ outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNum
-- XRefEntry
--
data XRefEntry = InUse {
offset :: Int64
offset :: Offset
, generation :: Int
} | Free {
nextFree :: Int64
nextFree :: ObjectId
, generation :: Int
} deriving Show
instance Output XRefEntry where
output (InUse {offset, generation}) =
Output.string (printf "%010d %05d n " offset generation) `mappend` newLine
Output.line (printf "%010d %05d n " (getOffset offset) generation)
output (Free {nextFree, generation}) =
Output.string (printf "%010d %05d f " nextFree generation) `mappend` newLine
Output.line (printf "%010d %05d f " (getObjectId nextFree) generation)
entry :: Parser u XRefEntry
entry = do
(big, small) <- (,) <$> integer <*> integer
(inUse big small <|> free big small) <* blank
(inUse big small <|> free big small <?> "XRef entry") <* blank
where
inUse :: Int64 -> Int -> Parser u XRefEntry
inUse offset generation = char 'n' *> return (InUse {offset, generation})
free :: Int64 -> Int -> Parser u XRefEntry
free nextFree generation = char 'f' *> return (Free {nextFree, generation})
inUse :: Int -> Int -> Parser u XRefEntry
inUse big generation =
char 'n' *> return (InUse {offset = Offset big, generation})
free :: Int -> Int -> Parser u XRefEntry
free big generation =
char 'f' *> return (Free {nextFree = ObjectId big, generation})
--
-- XRefSubSection
--
data XRefSubSection = XRefSubSection {
firstObjectId :: Int
, entriesNumber :: Int
, entries :: Map Int XRefEntry
firstObjectId :: ObjectId
, entries :: [XRefEntry]
} deriving Show
instance Output XRefSubSection where
output (XRefSubSection {firstObjectId, entriesNumber, entries}) =
Output.string (printf "%d %d" firstObjectId entriesNumber)
`nextLine` output (Map.elems entries)
output (XRefSubSection {firstObjectId, entries}) =
Output.line (printf "%d %d" (getObjectId firstObjectId) (length entries))
`mappend` output entries
xrefSubSection :: Parser u XRefSubSection
xrefSubSection = do
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
return $ XRefSubSection {firstObjectId, entriesNumber, entries}
xRefSubSection :: Parser u XRefSubSection
xRefSubSection = do
(firstId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
entries <- count entriesNumber entry
return $ XRefSubSection {firstObjectId = ObjectId firstId, entries}
type XRefSection = [XRefSubSection]
type XRefSection = Map ObjectId XRefEntry
instance Output XRefSection where
output = output . sections
where
sections tmp =
case Map.minViewWithKey tmp of
Nothing -> []
Just ((objectId@(ObjectId value), firstEntry), rest) ->
let (subSection, sndRest) = section objectId [firstEntry] (value + 1) rest in
subSection : sections sndRest
section firstObjectId stack nextValue tmp =
let nextId = ObjectId nextValue in
case Map.lookup nextId tmp of
Nothing -> (XRefSubSection {firstObjectId, entries = reverse stack}, tmp)
Just nextEntry ->
section firstObjectId (nextEntry:stack) (nextValue + 1) (Map.delete nextId tmp)
xRefSection :: Parser u XRefSection
xRefSection = foldr addSubsection Map.empty <$>
(line "xref" *> xRefSubSection `sepBy` many EOL.parser)
where
addSubsection (XRefSubSection {firstObjectId, entries}) =
Map.union . Map.fromList $ zip ([firstObjectId..]) entries
--
-- Structure
--
data InputStructure = InputStructure {
startOffset :: Int64
startOffset :: Int
, inputStructure :: Structure
}
data Structure = Structure {
xrefSection :: XRefSection
xRef :: XRefSection
, trailer :: Dictionary
} deriving Show
structure :: Parser u Structure
structure =
Structure
<$> (line "xref" *> xrefSubSection `sepBy` EOL.parser)
<*> (line "trailer" *> dictionary <* EOL.parser)
<$> xRefSection
<*> (string "trailer" *> blank *> dictionary <* EOL.parser)
updateXrefs :: XRefSection -> Map Offset Int64 -> (XRefSection, Int64)
updateXrefs xrefSection offsets = (
updateSubSection <$> xrefSection
, offsets ! StartXRef
)
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
where
updateSubSection subSection@(XRefSubSection {firstObjectId, entries}) =
subSection {entries = mapWithKey (updateEntry firstObjectId) entries}
updateEntry firstObjectId index e@(InUse {}) =
e {offset = offsets ! (ObjectId $ firstObjectId + index)}
updateEntry _ _ e = e
updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
updateEntry _ e = e
--
-- Flow
--
data Flow = Flow {
occurrencesStack :: [Occurrence]
, tmpObjects :: Map Int Object
, tmpObjects :: IndexedObjects
} deriving Show
--
@ -339,25 +372,26 @@ data Flow = Flow {
--
data Content = Content {
occurrences :: [Occurrence]
, objects :: Map Int Object
, objects :: IndexedObjects
, docStructure :: Structure
} deriving Show
outputBody :: ([Occurrence], Map Int Object) -> OBuilder
outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
outputBody (occurrences, objects) =
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
instance Output Content where
output (Content {occurrences, objects, docStructure}) =
fmap (updateXrefs xrefSection) <$> getOffsets (outputBody (occurrences, objects))
>>= \(body, (xref, startXRef)) ->
body
`mappend` "xref"
`nextLine` output xref
`mappend` "trailer"
`nextLine` output trailer
`nextLine` "startxref"
`nextLine` (Output.string (printf "%d" startXRef))
`nextLine` byteString eofMarker
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
>>= \(body, (xref, startXRef)) -> Output.concat [
body
, Output.line "xref"
, output xref
, Output.line "trailer"
, output trailer, newLine
, Output.line "startxref"
, Output.line (printf "%d" (getOffset startXRef))
, byteString eofMarker
]
where
Structure {xrefSection, trailer} = docStructure
Structure {xRef, trailer} = docStructure

View File

@ -2,47 +2,44 @@
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE AllowAmbiguousTypes #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
module PDF.Output (
OBuilder
, ObjectId(..)
, OContext(..)
, Offset(..)
, Output(..)
, Resource(..)
, byteString
, char
, concat
, getOffsets
, join
, lift
, line
, newLine
, nextLine
, saveOffset
, string
, render
) where
--import Data.ByteString.Builder (Builder, char8, lazyByteString, string8)
import Data.ByteString.Builder (Builder, char8, lazyByteString, string8, toLazyByteString)
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (length)
import Data.Int (Int64)
import Data.ByteString.Builder (Builder, char8, string8, toLazyByteString)
import qualified Data.ByteString.Builder as B (byteString)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (length)
import qualified Data.ByteString.Lazy as Lazy (ByteString)
import Data.Map (Map)
import qualified Data.Map as Map (singleton)
-- #if MIN_VERSION_base(4,9,0)
-- import qualified Data.Semigroup as Sem
-- #endif
import Data.String (IsString(..))
--import Control.Monad.Reader (MonadReader(..), Reader, runReader)
--import Control.Monad.Reader (MonadReader(..), Reader, runReader)
--import Control.Monad.Reader (MonadReader(..), Reader, runReader)
import Control.Monad.RWS (RWS, runRWS, ask, get, listen, modify, tell)
import qualified PDF.EOL as EOL (Style(..))
import Prelude hiding (concat)
data Offset = StartXRef | ObjectId Int deriving (Eq, Ord)
newtype ObjectId = ObjectId {getObjectId :: Int} deriving (Enum, Eq, Ord, Show)
newtype Offset = Offset {getOffset :: Int} deriving (Show)
{-
incrOffset :: (Int64 -> Int64) -> OutputState
-}
data Resource = StartXRef | Object ObjectId deriving (Eq, Ord)
newtype OContext a = OContext (RWS EOL.Style (Map Offset Int64) Int64 a)
newtype OContext a = OContext (RWS EOL.Style (Map Resource Offset) Offset a)
type OBuilder = OContext Builder
instance Functor OContext where
@ -55,20 +52,23 @@ instance Applicative OContext where
instance Monad OContext where
(>>=) (OContext a) f = OContext (a >>= (\x -> let OContext y = f x in y))
saveOffset :: Offset -> OContext ()
saveOffset offset = OContext $
get >>= tell . Map.singleton offset
saveOffset :: Resource -> OContext ()
saveOffset resource = OContext $
get >>= tell . Map.singleton resource
lift :: (a -> Builder) -> a -> OBuilder
lift f a = return (f a)
getOffsets :: OBuilder -> OContext (OBuilder, Map Offset Int64)
getOffsets :: OBuilder -> OContext (OBuilder, Map Resource Offset)
getOffsets (OContext builder) =
OContext (listen builder >>= \(a, w) -> return (return a, w))
append :: OBuilder -> OBuilder -> OBuilder
append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)
concat :: [OBuilder] -> OBuilder
concat = foldl mappend mempty
#if MIN_VERSION_base(4,11,0)
instance Semigroup OBuilder where
(<>) = append
@ -94,7 +94,7 @@ instance Output Bool where
output True = string "true"
instance Output a => Output [a] where
output = foldl mappend mempty . fmap output
output = concat . fmap output
join :: Output a => String -> [a] -> OBuilder
join _ [] = mempty
@ -102,27 +102,30 @@ join _ [a] = output a
join separator (a:as) =
output a `mappend` string separator `mappend` (join separator as)
newLine :: OBuilder
newLine = OContext $ buildEOL =<< ask
where
buildEOL EOL.CR = return (char8 '\r') <* modify (+1)
buildEOL EOL.LF = return (char8 '\n') <* modify (+1)
buildEOL EOL.CRLF = return (string8 "\r\n") <* modify (+2)
offset :: (Int -> Int) -> OContext ()
offset f = OContext $ modify $ \(Offset o) -> Offset (f o)
nextLine :: OBuilder -> OBuilder -> OBuilder
nextLine a b = a `mappend` newLine `mappend` b
newLine :: OBuilder
newLine = buildEOL =<< OContext ask
where
buildEOL EOL.CR = return (char8 '\r') <* offset (+1)
buildEOL EOL.LF = return (char8 '\n') <* offset (+1)
buildEOL EOL.CRLF = return (string8 "\r\n") <* offset (+2)
char :: Char -> OBuilder
char c = lift char8 c <* OContext (modify (+1))
char c = lift char8 c <* offset (+1)
string :: String -> OBuilder
string s = lift string8 s <* OContext (modify (+ toEnum (length s)))
string s = lift string8 s <* offset (+ toEnum (length s))
line :: String -> OBuilder
line l = string l `mappend` newLine
byteString :: ByteString -> OBuilder
byteString bs = lift lazyByteString bs <* OContext (modify (+ BS.length bs))
byteString bs = lift B.byteString bs <* offset (+ BS.length bs)
render :: Output a => EOL.Style -> a -> ByteString
render :: Output a => EOL.Style -> a -> Lazy.ByteString
render eolStyle a =
let OContext builder = output a in
let (outputByteString, _, _) = runRWS builder eolStyle 0 in
let (outputByteString, _, _) = runRWS builder eolStyle (Offset 0) in
toLazyByteString outputByteString

98
src/PDF/Parser.hs Normal file
View File

@ -0,0 +1,98 @@
module PDF.Parser (
Parser
, (<?>)
, block
, char
, choice
, count
, decNumber
, hexNumber
, many
, octDigit
, on
, oneOf
, option
, runParser
, sepBy
, string
, takeAll
, takeAll1
) where
import Control.Applicative ((<|>), empty)
import Control.Monad.State (StateT(..), evalStateT)
import Control.Monad.Trans (lift)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
)
import Data.ByteString (ByteString)
import Data.Char (toLower)
import Data.Set (Set)
import qualified Data.Set as Set (fromList, member, unions)
type Parser s = StateT s Atto.Parser
(<?>) :: Parser s a -> String -> Parser s a
(<?>) parser debugMessage = parser <|> fail debugMessage
block :: Int -> Parser s ByteString
block = lift . Atto.take
char :: Char -> Parser s Char
char = lift . Atto.char
choice :: [Parser s a] -> Parser s a
choice = foldr (<|>) empty
count :: Int -> Parser s a -> Parser s [a]
count 0 _ = return []
count n p = (:) <$> p <*> count (n-1) p
decNumber :: Parser s ByteString
decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
digits :: Set Char
digits = Set.fromList ['0'..'9']
hexDigits :: Set Char
hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
where
af = ['A'..'F']
hexNumber :: Parser s ByteString
hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
many :: Parser s a -> Parser s [a]
many parser = (:) <$> parser <*> many parser <|> return []
octDigit :: Parser s Char
octDigit = oneOf ['0'..'7']
on :: Parser s a -> ByteString -> Parser s (Either String a)
on (StateT parserF) input = StateT $ \state ->
case Atto.parseOnly (parserF state) input of
Left errorMsg -> return (Left errorMsg, state)
Right (result, newState) -> return (Right result, newState)
oneOf :: String -> Parser s Char
oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
option :: a -> Parser s a -> Parser s a
option defaultValue p = p <|> pure defaultValue
runParser :: Parser s a -> s -> ByteString -> Either String a
runParser parser initState =
Atto.parseOnly (evalStateT parser initState)
sepBy :: Parser s a -> Parser s b -> Parser s [a]
sepBy parser separator =
option [] $ (:) <$> parser <*> many (separator *> parser)
string :: ByteString -> Parser s ByteString
string = lift . Atto.string
takeAll :: (Char -> Bool) -> Parser s ByteString
takeAll = lift . Atto.takeWhile
takeAll1 :: (Char -> Bool) -> Parser s ByteString
takeAll1 = lift . Atto.takeWhile1

47
src/PDF/Update.hs Normal file
View File

@ -0,0 +1,47 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.Update (
unify
) where
import Data.Map (member)
import qualified Data.Map as Map (empty, union)
import PDF.Object (
Content(..), IndexedObjects, IndirectObjCoordinates(..), Occurrence(..)
, Structure(..)
)
emptyContent :: Content
emptyContent = Content {
docStructure = Structure {xRef = Map.empty, trailer = Map.empty}
, objects = Map.empty
, occurrences = []
}
unify :: [Content] -> Content
unify = foldl complete emptyContent
where
complete tmpContent older =
let mergedObjects = Map.union (objects tmpContent) (objects older) in
Content {
docStructure =
unifyDocStructure (docStructure tmpContent) (docStructure older)
, objects = mergedObjects
, occurrences =
unifyOccurrences mergedObjects (occurrences tmpContent) (occurrences older)
}
unifyDocStructure :: Structure -> Structure -> Structure
unifyDocStructure update original = Structure {
xRef = Map.union (xRef update) (xRef original)
, trailer = Map.union (trailer update) (trailer original)
}
unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence]
unifyOccurrences objects update = foldr addOlder update
where
addOlder occurrence@(Comment _) existing = occurrence : existing
addOlder occurrence@(Indirect indirect) existing =
if objectId indirect `member` objects
then occurrence : existing
else existing