Go strict ByteStrings with attoparsec

This commit is contained in:
Tissevert 2019-05-24 10:48:09 +02:00
parent 0daa03d958
commit 11cb6504d7
10 changed files with 237 additions and 134 deletions

View file

@ -1,5 +1,10 @@
# Revision history for Hufflepdf # Revision history for Hufflepdf
## 0.1.1.0 -- 2019-05-23
* Rewrite the parser using attoparsec instead of parsec (about 1.5x faster)
* Use strict ByteStrings instead of lazy ones for an additional gain in performance, simpler type interface (lazy ByteStrings are now used only for output because that's required by the ByteString Builder) and fewer import lines
## 0.1.0.0 -- 2019-05-18 ## 0.1.0.0 -- 2019-05-18
* First version. Released on an unsuspecting world. * First version. Released on an unsuspecting world.

View file

@ -17,17 +17,18 @@ cabal-version: >=1.10
library library
exposed-modules: PDF exposed-modules: PDF
, PDF.Object
other-modules: PDF.Body
, PDF.EOL , PDF.EOL
, PDF.Object
, PDF.Output , PDF.Output
, Data.ByteString.Lazy.Char8.Util other-modules: Data.ByteString.Char8.Util
, PDF.Body
, PDF.Parser
-- other-extensions: -- other-extensions:
build-depends: base >=4.9 && <4.13 build-depends: attoparsec
, base >=4.9 && <4.13
, bytestring , bytestring
, containers , containers
, mtl , mtl
, parsec
hs-source-dirs: src hs-source-dirs: src
ghc-options: -Wall ghc-options: -Wall
default-language: Haskell2010 default-language: Haskell2010

View file

@ -0,0 +1,16 @@
module Data.ByteString.Char8.Util (
previous
, subBS
) where
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (drop, index, take)
import Prelude hiding (length)
previous :: Char -> Int -> ByteString -> Int
previous char position byteString
| BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString
subBS :: Int -> Int -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset

View file

@ -1,17 +0,0 @@
module Data.ByteString.Lazy.Char8.Util (
previous
, subBS
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
import Data.Int (Int64)
import Prelude hiding (length)
previous :: Char -> Int64 -> ByteString -> Int64
previous char position byteString
| BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString
subBS :: Int64 -> Int64 -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset

View file

@ -5,12 +5,12 @@ module PDF (
, render , render
) where ) where
import Data.ByteString.Lazy.Char8 (ByteString) import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS ( import qualified Data.ByteString.Char8 as BS (
drop, findIndex, head, isPrefixOf, last, length, span, unpack drop, findIndex, head, isPrefixOf, last, length, span, unpack
) )
import Data.ByteString.Lazy.Char8.Util (previous, subBS) import Data.ByteString.Char8.Util (previous, subBS)
import Data.Int (Int64) import qualified Data.ByteString.Lazy as Lazy (ByteString)
import qualified Data.Map as Map (lookup) import qualified Data.Map as Map (lookup)
import PDF.Body (populate) import PDF.Body (populate)
import qualified PDF.EOL as EOL (Style(..), charset, parser) import qualified PDF.EOL as EOL (Style(..), charset, parser)
@ -21,10 +21,7 @@ import PDF.Object (
) )
import qualified PDF.Output as Output (render, string) import qualified PDF.Output as Output (render, string)
import PDF.Output (Output(..), nextLine) import PDF.Output (Output(..), nextLine)
import Text.Parsec import PDF.Parser (Parser, runParser, string, takeAll)
import Text.Parsec.ByteString.Lazy (Parser)
import Text.Parsec.Pos (newPos)
import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
import Text.Printf (printf) import Text.Printf (printf)
data Document = Document { data Document = Document {
@ -38,20 +35,17 @@ instance Output Document where
Output.string (printf "%%PDF-%s" pdfVersion) Output.string (printf "%%PDF-%s" pdfVersion)
`nextLine` output contents `nextLine` output contents
render :: Document -> ByteString render :: Document -> Lazy.ByteString
render document@(Document {eolStyle}) = Output.render eolStyle document render document@(Document {eolStyle}) = Output.render eolStyle document
version :: Parser String version :: Parser () String
version = string magicNumber *> many (noneOf EOL.charset) version = BS.unpack <$>
(string magicNumber *> takeAll (not . (`elem` EOL.charset)))
parseError :: String -> Either ParseError a check :: Bool -> String -> Either String ()
parseError errorMessage = check test errorMessage = if test then return () else Left errorMessage
Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0)
check :: Bool -> String -> Either ParseError () readStartXref :: EOL.Style -> ByteString -> Either String Int
check test errorMessage = if test then return () else parseError errorMessage
readStartXref :: EOL.Style -> ByteString -> Either ParseError Int64
readStartXref eolStyle input = readStartXref eolStyle input =
check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input)) check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
"Badly formed document : missing EOF marker at the end" "Badly formed document : missing EOF marker at the end"
@ -68,7 +62,7 @@ readStartXref eolStyle input =
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1 previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
findNextLine :: ByteString -> Int64 findNextLine :: ByteString -> Int
findNextLine input = findNextLine input =
let (line, eolPrefixed) = BS.span notInEol input in let (line, eolPrefixed) = BS.span notInEol input in
let nextNotInEol = BS.findIndex notInEol eolPrefixed in let nextNotInEol = BS.findIndex notInEol eolPrefixed in
@ -76,7 +70,7 @@ findNextLine input =
where where
notInEol = not . (`elem` EOL.charset) notInEol = not . (`elem` EOL.charset)
findNextSection :: Int64 -> ByteString -> Int64 findNextSection :: Int -> ByteString -> Int
findNextSection offset input = findNextSection offset input =
case BS.findIndex (== BS.head eofMarker) input of case BS.findIndex (== BS.head eofMarker) input of
Nothing -> 0 Nothing -> 0
@ -87,9 +81,9 @@ findNextSection offset input =
then newOffset + findNextLine newInput then newOffset + findNextLine newInput
else findNextSection (newOffset + 1) (BS.drop 1 newInput) else findNextSection (newOffset + 1) (BS.drop 1 newInput)
readStructures :: Int64 -> ByteString -> Either ParseError [InputStructure] readStructures :: Int -> ByteString -> Either String [InputStructure]
readStructures startXref input = readStructures startXref input =
parse structure ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow runParser structure () (BS.drop startXref input) >>= stopOrFollow
where where
stopOrFollow s@(Structure {trailer}) = stopOrFollow s@(Structure {trailer}) =
case Map.lookup (Name "Prev") trailer of case Map.lookup (Name "Prev") trailer of
@ -98,11 +92,11 @@ readStructures startXref input =
let offset = truncate newStartXref in let offset = truncate newStartXref in
let startOffset = findNextSection offset (BS.drop offset input) in let startOffset = findNextSection offset (BS.drop offset input) in
(InputStructure startOffset s:) <$> (readStructures offset input) (InputStructure startOffset s:) <$> (readStructures offset input)
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v Just v -> Left $ "Bad value for Prev entry in trailer: " ++ show v
parseDocument :: ByteString -> Either ParseError Document parseDocument :: ByteString -> Either String Document
parseDocument input = do parseDocument input = do
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> EOL.parser) "" input (pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
startXref <- readStartXref eolStyle input startXref <- readStartXref eolStyle input
structuresRead <- readStructures startXref input structuresRead <- readStructures startXref input
let contents = populate input <$> structuresRead let contents = populate input <$> structuresRead

View file

@ -1,21 +1,23 @@
{-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
module PDF.Body ( module PDF.Body (
populate populate
) where ) where
import Data.ByteString.Lazy.Char8 (ByteString) import Control.Applicative ((<|>))
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack) import Control.Monad.State (get, gets, modify)
import Data.Int (Int64) import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (drop, unpack)
import Data.Map ((!)) import Data.Map ((!))
import qualified Data.Map as Map (empty, insert, lookup) import qualified Data.Map as Map (empty, insert, lookup)
import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.EOL as EOL (charset, parser)
import PDF.Object ( import PDF.Object (
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..) Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..) , InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
, Parser, Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..) , Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..)
, blank, dictionary, directObject, integer, line , blank, dictionary, directObject, integer, line
) )
import Text.Parsec import PDF.Parser (Parser, block, char, on, runParser, takeAll)
data UserState = UserState { data UserState = UserState {
input :: ByteString input :: ByteString
@ -26,7 +28,7 @@ data UserState = UserState {
type SParser = Parser UserState type SParser = Parser UserState
modifyFlow :: (Flow -> Flow) -> SParser () modifyFlow :: (Flow -> Flow) -> SParser ()
modifyFlow f = modifyState $ \state -> state {flow = f $ flow state} modifyFlow f = modify $ \state -> state {flow = f $ flow state}
addObject :: Int -> Object -> SParser () addObject :: Int -> Object -> SParser ()
addObject objectId newObject = modifyFlow $ \flow -> flow { addObject objectId newObject = modifyFlow $ \flow -> flow {
@ -39,9 +41,10 @@ pushOccurrence newOccurrence = modifyFlow $ \flow -> flow {
} }
comment :: Parser u String comment :: Parser u String
comment = char '%' *> many (noneOf EOL.charset) <* EOL.parser comment = BS.unpack <$>
(char '%' *> takeAll (not . (`elem` EOL.charset)) <* EOL.parser)
lookupOffset :: Int -> XRefSection -> Maybe Int64 lookupOffset :: Int -> XRefSection -> Maybe Int
lookupOffset _ [] = Nothing lookupOffset _ [] = Nothing
lookupOffset objectId (xrefSubSection:others) = lookupOffset objectId (xrefSubSection:others) =
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
@ -53,25 +56,20 @@ lookupOffset objectId (xrefSubSection:others) =
_ -> Nothing _ -> Nothing
else lookupOffset objectId others else lookupOffset objectId others
getOffset :: Int -> SParser Int64 getOffset :: Int -> SParser Int
getOffset objectId = do getOffset objectId = do
table <- xreferences <$> getState table <- gets xreferences
case lookupOffset objectId table of case lookupOffset objectId table of
Nothing -> fail $ Nothing -> fail $
"obj " ++ show objectId ++ " is referenced but missing in XRef table" "obj " ++ show objectId ++ " is referenced but missing in XRef table"
Just offset -> return offset Just offset -> return offset
on :: Monad m => ParsecT s u m a -> s -> ParsecT s u m a
on parser input = do
originalInput <- getInput
setInput input >> parser <* setInput originalInput
loadNumber :: Int -> SParser Float loadNumber :: Int -> SParser Float
loadNumber objectId = do loadNumber objectId = do
offset <- getOffset objectId offset <- getOffset objectId
objectStart <- BS.drop offset . input <$> getState objectStart <- BS.drop offset <$> gets input
indirectObjCoordinates `on` objectStart >> return () indirectObjCoordinates `on` (objectStart :: ByteString) >> return ()
objectValue <- (!objectId) . tmpObjects . flow <$> getState objectValue <- (!objectId) . tmpObjects <$> gets flow
case objectValue of case objectValue of
Direct (NumberObject (Number n)) -> return n Direct (NumberObject (Number n)) -> return n
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number" obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
@ -83,7 +81,7 @@ getSize :: Maybe DirectObject -> SParser Float
getSize Nothing = fail "Missing '/Length' key on stream" getSize Nothing = fail "Missing '/Length' key on stream"
getSize (Just (NumberObject (Number size))) = return size getSize (Just (NumberObject (Number size))) = return size
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
Flow {tmpObjects} <- flow <$> getState Flow {tmpObjects} <- gets flow
case Map.lookup objectId tmpObjects of case Map.lookup objectId tmpObjects of
Nothing -> loadNumber objectId Nothing -> loadNumber objectId
Just (Direct (NumberObject (Number size))) -> return size Just (Direct (NumberObject (Number size))) -> return size
@ -92,13 +90,13 @@ getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length" getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
streamObject :: SParser Object streamObject :: SParser Object
streamObject = try $ do streamObject = do
header <- dictionary <* blank header <- dictionary <* blank
size <- getSize (Map.lookup (Name "Length") header) size <- getSize (Map.lookup (Name "Length") header)
streamContent <- BS.pack <$> stream (truncate size) streamContent <- stream (truncate size)
return $ Stream {header, streamContent} return $ Stream {header, streamContent}
where where
stream size = line "stream" *> count size anyChar <* blank <* line "endstream" stream size = line "stream" *> block size <* blank <* line "endstream"
object :: SParser Object object :: SParser Object
object = streamObject <|> Direct <$> directObject object = streamObject <|> Direct <$> directObject
@ -117,7 +115,7 @@ occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates
populate :: ByteString -> InputStructure -> Content populate :: ByteString -> InputStructure -> Content
populate input structure = populate input structure =
let bodyInput = BS.drop (startOffset structure) input in let bodyInput = BS.drop (startOffset structure) input in
case runParser recurseOnOccurrences initialState "" bodyInput of case runParser recurseOnOccurrences initialState bodyInput of
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure} Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
Right finalState -> Right finalState ->
let Flow {occurrencesStack, tmpObjects} = flow finalState in let Flow {occurrencesStack, tmpObjects} = flow finalState in
@ -135,4 +133,4 @@ populate input structure =
recurseOnOccurrences :: SParser UserState recurseOnOccurrences :: SParser UserState
recurseOnOccurrences = recurseOnOccurrences =
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState (occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> get

View file

@ -1,19 +1,20 @@
{-# LANGUAGE OverloadedStrings #-}
module PDF.EOL ( module PDF.EOL (
Style(..) Style(..)
, charset , charset
, parser , parser
) where ) where
import Data.ByteString.Lazy.Char8 (ByteString) import Control.Applicative ((<|>))
import Text.Parsec ((<|>), Parsec, string, try) import PDF.Parser (Parser, string)
data Style = CR | LF | CRLF deriving Show data Style = CR | LF | CRLF deriving Show
charset :: String charset :: String
charset = "\r\n" charset = "\r\n"
parser :: Parsec ByteString u Style parser :: Parser s Style
parser = parser =
try (string "\r\n" >> return CRLF) (string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR) <|> (string "\r" >> return CR)
<|> (string "\n" >> return LF) <|> (string "\n" >> return LF)

View file

@ -11,7 +11,6 @@ module PDF.Object (
, Number(..) , Number(..)
, Object(..) , Object(..)
, Occurrence(..) , Occurrence(..)
, Parser
, Structure(..) , Structure(..)
, XRefEntry(..) , XRefEntry(..)
, XRefSection , XRefSection
@ -26,8 +25,11 @@ module PDF.Object (
, structure , structure
) where ) where
import Data.ByteString.Lazy.Char8 (ByteString) import Control.Applicative ((<|>))
import Data.Int (Int64) import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (
concat, cons, pack, singleton, unpack
)
import Data.Map (Map, (!), mapWithKey) import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (elems, fromList, toList) import qualified Data.Map as Map (elems, fromList, toList)
import qualified PDF.EOL as EOL (charset, parser) import qualified PDF.EOL as EOL (charset, parser)
@ -36,15 +38,17 @@ import PDF.Output (
OBuilder, Offset(..), Output(..) OBuilder, Offset(..), Output(..)
, byteString, getOffsets, join, newLine, nextLine, saveOffset , byteString, getOffsets, join, newLine, nextLine, saveOffset
) )
import Text.Parsec import PDF.Parser (
Parser, (<?>)
, char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
, sepBy, string, takeAll, takeAll1
)
import Text.Printf (printf) import Text.Printf (printf)
type Parser u = Parsec ByteString u
line :: String -> Parser u () line :: String -> Parser u ()
line l = string l *> EOL.parser *> return () line l = string (BS.pack l) *> EOL.parser *> return () <?> printf "line «%s»" l
magicNumber :: String magicNumber :: ByteString
magicNumber = "%PDF-" magicNumber = "%PDF-"
eofMarker :: ByteString eofMarker :: ByteString
@ -53,20 +57,17 @@ eofMarker = "%%EOF"
whiteSpaceCharset :: String whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 " whiteSpaceCharset = "\0\t\12 "
whiteSpace :: Parser u ()
whiteSpace = oneOf whiteSpaceCharset *> return () <|> EOL.parser *> return ()
blank :: Parser u () blank :: Parser u ()
blank = skipMany whiteSpace blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
delimiterCharset :: String delimiterCharset :: String
delimiterCharset = "()<>[]{}/%" delimiterCharset = "()<>[]{}/%"
regular :: Parser u Char regular :: Char -> Bool
regular = noneOf $ EOL.charset ++ whiteSpaceCharset ++ delimiterCharset regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
integer :: (Read a, Num a) => Parser u a integer :: (Read a, Num a) => Parser u a
integer = read <$> many1 digit <* whiteSpace integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
------------------------------------- -------------------------------------
-- OBJECTS -- OBJECTS
@ -76,7 +77,8 @@ integer = read <$> many1 digit <* whiteSpace
-- Boolean -- Boolean
-- --
boolean :: Parser u Bool boolean :: Parser u Bool
boolean = (string "true" *> return True) <|> (string "false" *> return False) boolean =
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
-- --
-- Number -- Number
@ -90,12 +92,13 @@ instance Output Number where
_ -> printf "%f" f _ -> printf "%f" f
number :: Parser u Number number :: Parser u Number
number = Number . read <$> number = Number . read . BS.unpack <$>
(mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart)) (mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
<?> "number"
where where
sign = string "-" <|> option "" (char '+' >> return "") sign = string "-" <|> option "" (char '+' >> return "")
integerPart = mappend <$> many1 digit <*> option "" floatPart integerPart = mappend <$> decNumber <*> option "" floatPart
floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit) floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
-- --
-- StringObject -- StringObject
@ -108,14 +111,18 @@ instance Output StringObject where
stringObject :: Parser u StringObject stringObject :: Parser u StringObject
stringObject = stringObject =
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')') Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>') <|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
<?> "string object (literal or hexadecimal)"
where where
literalStringBlock = many1 (noneOf "\\()") <|> matchingParenthesis <|> escapedChar literalString = many literalStringBlock
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
normalChar = not . (`elem` ("\\()" :: String))
matchingParenthesis = matchingParenthesis =
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")" mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode) escapedChar =
octalCode = choice $ (\n -> count n octDigit) <$> [1..3] BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
-- --
-- Name -- Name
@ -126,13 +133,14 @@ instance Output Name where
output (Name n) = "/" `mappend` Output.string n output (Name n) = "/" `mappend` Output.string n
name :: Parser u Name name :: Parser u Name
name = Name <$> (char '/' *> many regular) name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
-- --
-- Array -- Array
-- --
array :: Parser u [DirectObject] array :: Parser u [DirectObject]
array = char '[' *> blank *> directObject `endBy` blank <* char ']' array =
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
-- --
-- Dictionary -- Dictionary
@ -149,16 +157,16 @@ instance Output Dictionary where
dictionary :: Parser u Dictionary dictionary :: Parser u Dictionary
dictionary = dictionary =
try (string "<<" *> blank *> keyValPairs <* string ">>") string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
where where
keyVal = (,) <$> name <* blank <*> directObject keyVal = (,) <$> name <* blank <*> directObject
keyValPairs = Map.fromList <$> keyVal `endBy` blank keyValPairs = Map.fromList <$> keyVal `sepBy` blank <* blank
-- --
-- Null -- Null
-- --
nullObject :: Parser u () nullObject :: Parser u ()
nullObject = string "null" *> return () nullObject = string "null" *> return () <?> "null object"
-- --
-- Reference -- Reference
@ -169,7 +177,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
} deriving Show } deriving Show
reference :: Parser u IndirectObjCoordinates reference :: Parser u IndirectObjCoordinates
reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R' reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R' <?> "reference to an object"
-- --
-- DirectObject -- DirectObject
@ -198,14 +206,15 @@ instance Output DirectObject where
directObject :: Parser u DirectObject directObject :: Parser u DirectObject
directObject = directObject =
Boolean <$> try boolean Boolean <$> boolean
<|> Reference <$> try reference {- defined before Number because Number is a prefix of it -} <|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> try number <|> NumberObject <$> number
<|> StringObject <$> try stringObject <|> StringObject <$> stringObject
<|> NameObject <$> try name <|> NameObject <$> name
<|> Array <$> try array <|> Array <$> array
<|> Dictionary <$> try dictionary <|> Dictionary <$> dictionary
<|> const Null <$> try nullObject <|> const Null <$> nullObject
<?> "direct object"
-- --
-- Object -- Object
@ -226,7 +235,6 @@ instance Output Object where
`nextLine` byteString streamContent `nextLine` byteString streamContent
`mappend` "endstream" `mappend` "endstream"
-- --
-- Occurrence -- Occurrence
-- --
@ -249,10 +257,10 @@ outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNum
-- XRefEntry -- XRefEntry
-- --
data XRefEntry = InUse { data XRefEntry = InUse {
offset :: Int64 offset :: Int
, generation :: Int , generation :: Int
} | Free { } | Free {
nextFree :: Int64 nextFree :: Int
, generation :: Int , generation :: Int
} deriving Show } deriving Show
@ -265,14 +273,13 @@ instance Output XRefEntry where
entry :: Parser u XRefEntry entry :: Parser u XRefEntry
entry = do entry = do
(big, small) <- (,) <$> integer <*> integer (big, small) <- (,) <$> integer <*> integer
(inUse big small <|> free big small) <* blank (inUse big small <|> free big small <?> "XRef entry") <* blank
where where
inUse :: Int64 -> Int -> Parser u XRefEntry inUse :: Int -> Int -> Parser u XRefEntry
inUse offset generation = char 'n' *> return (InUse {offset, generation}) inUse offset generation = char 'n' *> return (InUse {offset, generation})
free :: Int64 -> Int -> Parser u XRefEntry free :: Int -> Int -> Parser u XRefEntry
free nextFree generation = char 'f' *> return (Free {nextFree, generation}) free nextFree generation = char 'f' *> return (Free {nextFree, generation})
-- --
-- XRefSubSection -- XRefSubSection
-- --
@ -289,7 +296,7 @@ instance Output XRefSubSection where
xrefSubSection :: Parser u XRefSubSection xrefSubSection :: Parser u XRefSubSection
xrefSubSection = do xrefSubSection = do
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer (firstObjectId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
return $ XRefSubSection {firstObjectId, entriesNumber, entries} return $ XRefSubSection {firstObjectId, entriesNumber, entries}
@ -299,7 +306,7 @@ type XRefSection = [XRefSubSection]
-- Structure -- Structure
-- --
data InputStructure = InputStructure { data InputStructure = InputStructure {
startOffset :: Int64 startOffset :: Int
, inputStructure :: Structure , inputStructure :: Structure
} }
@ -314,7 +321,7 @@ structure =
<$> (line "xref" *> xrefSubSection `sepBy` EOL.parser) <$> (line "xref" *> xrefSubSection `sepBy` EOL.parser)
<*> (line "trailer" *> dictionary <* EOL.parser) <*> (line "trailer" *> dictionary <* EOL.parser)
updateXrefs :: XRefSection -> Map Offset Int64 -> (XRefSection, Int64) updateXrefs :: XRefSection -> Map Offset Int -> (XRefSection, Int)
updateXrefs xrefSection offsets = ( updateXrefs xrefSection offsets = (
updateSubSection <$> xrefSection updateSubSection <$> xrefSection
, offsets ! StartXRef , offsets ! StartXRef

View file

@ -11,7 +11,6 @@ module PDF.Output (
, char , char
, getOffsets , getOffsets
, join , join
, lift
, newLine , newLine
, nextLine , nextLine
, saveOffset , saveOffset
@ -19,10 +18,11 @@ module PDF.Output (
, render , render
) where ) where
import Data.ByteString.Builder (Builder, char8, lazyByteString, string8, toLazyByteString) import Data.ByteString.Builder (Builder, char8, string8, toLazyByteString)
import Data.ByteString.Lazy.Char8 (ByteString) import qualified Data.ByteString.Builder as B (byteString)
import qualified Data.ByteString.Lazy.Char8 as BS (length) import Data.ByteString (ByteString)
import Data.Int (Int64) import qualified Data.ByteString as BS (length)
import qualified Data.ByteString.Lazy as Lazy (ByteString)
import Data.Map (Map) import Data.Map (Map)
import qualified Data.Map as Map (singleton) import qualified Data.Map as Map (singleton)
import Data.String (IsString(..)) import Data.String (IsString(..))
@ -31,7 +31,7 @@ import qualified PDF.EOL as EOL (Style(..))
data Offset = StartXRef | ObjectId Int deriving (Eq, Ord) data Offset = StartXRef | ObjectId Int deriving (Eq, Ord)
newtype OContext a = OContext (RWS EOL.Style (Map Offset Int64) Int64 a) newtype OContext a = OContext (RWS EOL.Style (Map Offset Int) Int a)
type OBuilder = OContext Builder type OBuilder = OContext Builder
instance Functor OContext where instance Functor OContext where
@ -51,7 +51,7 @@ saveOffset offset = OContext $
lift :: (a -> Builder) -> a -> OBuilder lift :: (a -> Builder) -> a -> OBuilder
lift f a = return (f a) lift f a = return (f a)
getOffsets :: OBuilder -> OContext (OBuilder, Map Offset Int64) getOffsets :: OBuilder -> OContext (OBuilder, Map Offset Int)
getOffsets (OContext builder) = getOffsets (OContext builder) =
OContext (listen builder >>= \(a, w) -> return (return a, w)) OContext (listen builder >>= \(a, w) -> return (return a, w))
@ -108,9 +108,9 @@ string :: String -> OBuilder
string s = lift string8 s <* OContext (modify (+ toEnum (length s))) string s = lift string8 s <* OContext (modify (+ toEnum (length s)))
byteString :: ByteString -> OBuilder byteString :: ByteString -> OBuilder
byteString bs = lift lazyByteString bs <* OContext (modify (+ BS.length bs)) byteString bs = lift B.byteString bs <* OContext (modify (+ BS.length bs))
render :: Output a => EOL.Style -> a -> ByteString render :: Output a => EOL.Style -> a -> Lazy.ByteString
render eolStyle a = render eolStyle a =
let OContext builder = output a in let OContext builder = output a in
let (outputByteString, _, _) = runRWS builder eolStyle 0 in let (outputByteString, _, _) = runRWS builder eolStyle 0 in

98
src/PDF/Parser.hs Normal file
View file

@ -0,0 +1,98 @@
module PDF.Parser (
Parser
, (<?>)
, block
, char
, choice
, count
, decNumber
, hexNumber
, many
, octDigit
, on
, oneOf
, option
, runParser
, sepBy
, string
, takeAll
, takeAll1
) where
import Control.Applicative ((<|>), empty)
import Control.Monad.State (StateT(..), evalStateT)
import Control.Monad.Trans (lift)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
)
import Data.ByteString (ByteString)
import Data.Char (toLower)
import Data.Set (Set)
import qualified Data.Set as Set (fromList, member, unions)
type Parser s = StateT s Atto.Parser
(<?>) :: Parser s a -> String -> Parser s a
(<?>) parser debugMessage = parser <|> fail debugMessage
block :: Int -> Parser s ByteString
block = lift . Atto.take
char :: Char -> Parser s Char
char = lift . Atto.char
choice :: [Parser s a] -> Parser s a
choice = foldr (<|>) empty
count :: Int -> Parser s a -> Parser s [a]
count 0 _ = return []
count n p = (:) <$> p <*> count (n-1) p
decNumber :: Parser s ByteString
decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
digits :: Set Char
digits = Set.fromList ['0'..'9']
hexDigits :: Set Char
hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
where
af = ['A'..'F']
hexNumber :: Parser s ByteString
hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
many :: Parser s a -> Parser s [a]
many parser = (:) <$> parser <*> many parser <|> return []
octDigit :: Parser s Char
octDigit = oneOf ['0'..'7']
on :: Parser s a -> ByteString -> Parser s (Either String a)
on (StateT parserF) input = StateT $ \state ->
case Atto.parseOnly (parserF state) input of
Left errorMsg -> return (Left errorMsg, state)
Right (result, newState) -> return (Right result, newState)
oneOf :: String -> Parser s Char
oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
option :: a -> Parser s a -> Parser s a
option defaultValue p = p <|> pure defaultValue
runParser :: Parser s a -> s -> ByteString -> Either String a
runParser parser initState =
Atto.parseOnly (evalStateT parser initState)
sepBy :: Parser s a -> Parser s b -> Parser s [a]
sepBy parser separator =
option [] $ (:) <$> parser <*> many (separator *> parser)
string :: ByteString -> Parser s ByteString
string = lift . Atto.string
takeAll :: (Char -> Bool) -> Parser s ByteString
takeAll = lift . Atto.takeWhile
takeAll1 :: (Char -> Bool) -> Parser s ByteString
takeAll1 = lift . Atto.takeWhile1