Go strict ByteStrings with attoparsec

This commit is contained in:
Tissevert 2019-05-24 10:48:09 +02:00
parent 0daa03d958
commit 11cb6504d7
10 changed files with 237 additions and 134 deletions

View file

@ -1,5 +1,10 @@
# Revision history for Hufflepdf
## 0.1.1.0 -- 2019-05-23
* Rewrite the parser using attoparsec instead of parsec (about 1.5x faster)
* Use strict ByteStrings instead of lazy ones for an additional gain in performance, simpler type interface (lazy ByteStrings are now used only for output because that's required by the ByteString Builder) and fewer import lines
## 0.1.0.0 -- 2019-05-18
* First version. Released on an unsuspecting world.

View file

@ -17,17 +17,18 @@ cabal-version: >=1.10
library
exposed-modules: PDF
, PDF.Object
other-modules: PDF.Body
, PDF.EOL
, PDF.Object
, PDF.Output
, Data.ByteString.Lazy.Char8.Util
other-modules: Data.ByteString.Char8.Util
, PDF.Body
, PDF.Parser
-- other-extensions:
build-depends: base >=4.9 && <4.13
build-depends: attoparsec
, base >=4.9 && <4.13
, bytestring
, containers
, mtl
, parsec
hs-source-dirs: src
ghc-options: -Wall
default-language: Haskell2010

View file

@ -0,0 +1,16 @@
module Data.ByteString.Char8.Util (
previous
, subBS
) where
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (drop, index, take)
import Prelude hiding (length)
previous :: Char -> Int -> ByteString -> Int
previous char position byteString
| BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString
subBS :: Int -> Int -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset

View file

@ -1,17 +0,0 @@
module Data.ByteString.Lazy.Char8.Util (
previous
, subBS
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
import Data.Int (Int64)
import Prelude hiding (length)
previous :: Char -> Int64 -> ByteString -> Int64
previous char position byteString
| BS.index byteString position == char = position
| otherwise = previous char (position - 1) byteString
subBS :: Int64 -> Int64 -> ByteString -> ByteString
subBS offset length = BS.take length . BS.drop offset

View file

@ -5,12 +5,12 @@ module PDF (
, render
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (
drop, findIndex, head, isPrefixOf, last, length, span, unpack
)
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
import Data.Int (Int64)
import Data.ByteString.Char8.Util (previous, subBS)
import qualified Data.ByteString.Lazy as Lazy (ByteString)
import qualified Data.Map as Map (lookup)
import PDF.Body (populate)
import qualified PDF.EOL as EOL (Style(..), charset, parser)
@ -21,10 +21,7 @@ import PDF.Object (
)
import qualified PDF.Output as Output (render, string)
import PDF.Output (Output(..), nextLine)
import Text.Parsec
import Text.Parsec.ByteString.Lazy (Parser)
import Text.Parsec.Pos (newPos)
import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
import PDF.Parser (Parser, runParser, string, takeAll)
import Text.Printf (printf)
data Document = Document {
@ -38,20 +35,17 @@ instance Output Document where
Output.string (printf "%%PDF-%s" pdfVersion)
`nextLine` output contents
render :: Document -> ByteString
render :: Document -> Lazy.ByteString
render document@(Document {eolStyle}) = Output.render eolStyle document
version :: Parser String
version = string magicNumber *> many (noneOf EOL.charset)
version :: Parser () String
version = BS.unpack <$>
(string magicNumber *> takeAll (not . (`elem` EOL.charset)))
parseError :: String -> Either ParseError a
parseError errorMessage =
Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0)
check :: Bool -> String -> Either String ()
check test errorMessage = if test then return () else Left errorMessage
check :: Bool -> String -> Either ParseError ()
check test errorMessage = if test then return () else parseError errorMessage
readStartXref :: EOL.Style -> ByteString -> Either ParseError Int64
readStartXref :: EOL.Style -> ByteString -> Either String Int
readStartXref eolStyle input =
check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
"Badly formed document : missing EOF marker at the end"
@ -68,7 +62,7 @@ readStartXref eolStyle input =
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
findNextLine :: ByteString -> Int64
findNextLine :: ByteString -> Int
findNextLine input =
let (line, eolPrefixed) = BS.span notInEol input in
let nextNotInEol = BS.findIndex notInEol eolPrefixed in
@ -76,7 +70,7 @@ findNextLine input =
where
notInEol = not . (`elem` EOL.charset)
findNextSection :: Int64 -> ByteString -> Int64
findNextSection :: Int -> ByteString -> Int
findNextSection offset input =
case BS.findIndex (== BS.head eofMarker) input of
Nothing -> 0
@ -87,9 +81,9 @@ findNextSection offset input =
then newOffset + findNextLine newInput
else findNextSection (newOffset + 1) (BS.drop 1 newInput)
readStructures :: Int64 -> ByteString -> Either ParseError [InputStructure]
readStructures :: Int -> ByteString -> Either String [InputStructure]
readStructures startXref input =
parse structure ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
runParser structure () (BS.drop startXref input) >>= stopOrFollow
where
stopOrFollow s@(Structure {trailer}) =
case Map.lookup (Name "Prev") trailer of
@ -98,11 +92,11 @@ readStructures startXref input =
let offset = truncate newStartXref in
let startOffset = findNextSection offset (BS.drop offset input) in
(InputStructure startOffset s:) <$> (readStructures offset input)
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
Just v -> Left $ "Bad value for Prev entry in trailer: " ++ show v
parseDocument :: ByteString -> Either ParseError Document
parseDocument :: ByteString -> Either String Document
parseDocument input = do
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> EOL.parser) "" input
(pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
startXref <- readStartXref eolStyle input
structuresRead <- readStructures startXref input
let contents = populate input <$> structuresRead

View file

@ -1,21 +1,23 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
module PDF.Body (
populate
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
import Data.Int (Int64)
import Control.Applicative ((<|>))
import Control.Monad.State (get, gets, modify)
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (drop, unpack)
import Data.Map ((!))
import qualified Data.Map as Map (empty, insert, lookup)
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Object (
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
, Parser, Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..)
, Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..)
, blank, dictionary, directObject, integer, line
)
import Text.Parsec
import PDF.Parser (Parser, block, char, on, runParser, takeAll)
data UserState = UserState {
input :: ByteString
@ -26,7 +28,7 @@ data UserState = UserState {
type SParser = Parser UserState
modifyFlow :: (Flow -> Flow) -> SParser ()
modifyFlow f = modifyState $ \state -> state {flow = f $ flow state}
modifyFlow f = modify $ \state -> state {flow = f $ flow state}
addObject :: Int -> Object -> SParser ()
addObject objectId newObject = modifyFlow $ \flow -> flow {
@ -39,9 +41,10 @@ pushOccurrence newOccurrence = modifyFlow $ \flow -> flow {
}
comment :: Parser u String
comment = char '%' *> many (noneOf EOL.charset) <* EOL.parser
comment = BS.unpack <$>
(char '%' *> takeAll (not . (`elem` EOL.charset)) <* EOL.parser)
lookupOffset :: Int -> XRefSection -> Maybe Int64
lookupOffset :: Int -> XRefSection -> Maybe Int
lookupOffset _ [] = Nothing
lookupOffset objectId (xrefSubSection:others) =
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
@ -53,25 +56,20 @@ lookupOffset objectId (xrefSubSection:others) =
_ -> Nothing
else lookupOffset objectId others
getOffset :: Int -> SParser Int64
getOffset :: Int -> SParser Int
getOffset objectId = do
table <- xreferences <$> getState
table <- gets xreferences
case lookupOffset objectId table of
Nothing -> fail $
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
Just offset -> return offset
on :: Monad m => ParsecT s u m a -> s -> ParsecT s u m a
on parser input = do
originalInput <- getInput
setInput input >> parser <* setInput originalInput
loadNumber :: Int -> SParser Float
loadNumber objectId = do
offset <- getOffset objectId
objectStart <- BS.drop offset . input <$> getState
indirectObjCoordinates `on` objectStart >> return ()
objectValue <- (!objectId) . tmpObjects . flow <$> getState
objectStart <- BS.drop offset <$> gets input
indirectObjCoordinates `on` (objectStart :: ByteString) >> return ()
objectValue <- (!objectId) . tmpObjects <$> gets flow
case objectValue of
Direct (NumberObject (Number n)) -> return n
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
@ -83,7 +81,7 @@ getSize :: Maybe DirectObject -> SParser Float
getSize Nothing = fail "Missing '/Length' key on stream"
getSize (Just (NumberObject (Number size))) = return size
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
Flow {tmpObjects} <- flow <$> getState
Flow {tmpObjects} <- gets flow
case Map.lookup objectId tmpObjects of
Nothing -> loadNumber objectId
Just (Direct (NumberObject (Number size))) -> return size
@ -92,13 +90,13 @@ getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
streamObject :: SParser Object
streamObject = try $ do
streamObject = do
header <- dictionary <* blank
size <- getSize (Map.lookup (Name "Length") header)
streamContent <- BS.pack <$> stream (truncate size)
streamContent <- stream (truncate size)
return $ Stream {header, streamContent}
where
stream size = line "stream" *> count size anyChar <* blank <* line "endstream"
stream size = line "stream" *> block size <* blank <* line "endstream"
object :: SParser Object
object = streamObject <|> Direct <$> directObject
@ -117,7 +115,7 @@ occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates
populate :: ByteString -> InputStructure -> Content
populate input structure =
let bodyInput = BS.drop (startOffset structure) input in
case runParser recurseOnOccurrences initialState "" bodyInput of
case runParser recurseOnOccurrences initialState bodyInput of
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
Right finalState ->
let Flow {occurrencesStack, tmpObjects} = flow finalState in
@ -135,4 +133,4 @@ populate input structure =
recurseOnOccurrences :: SParser UserState
recurseOnOccurrences =
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> get

View file

@ -1,19 +1,20 @@
{-# LANGUAGE OverloadedStrings #-}
module PDF.EOL (
Style(..)
, charset
, parser
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import Text.Parsec ((<|>), Parsec, string, try)
import Control.Applicative ((<|>))
import PDF.Parser (Parser, string)
data Style = CR | LF | CRLF deriving Show
charset :: String
charset = "\r\n"
parser :: Parsec ByteString u Style
parser :: Parser s Style
parser =
try (string "\r\n" >> return CRLF)
(string "\r\n" >> return CRLF)
<|> (string "\r" >> return CR)
<|> (string "\n" >> return LF)

View file

@ -11,7 +11,6 @@ module PDF.Object (
, Number(..)
, Object(..)
, Occurrence(..)
, Parser
, Structure(..)
, XRefEntry(..)
, XRefSection
@ -26,8 +25,11 @@ module PDF.Object (
, structure
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import Data.Int (Int64)
import Control.Applicative ((<|>))
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (
concat, cons, pack, singleton, unpack
)
import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (elems, fromList, toList)
import qualified PDF.EOL as EOL (charset, parser)
@ -36,15 +38,17 @@ import PDF.Output (
OBuilder, Offset(..), Output(..)
, byteString, getOffsets, join, newLine, nextLine, saveOffset
)
import Text.Parsec
import PDF.Parser (
Parser, (<?>)
, char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
, sepBy, string, takeAll, takeAll1
)
import Text.Printf (printf)
type Parser u = Parsec ByteString u
line :: String -> Parser u ()
line l = string l *> EOL.parser *> return ()
line l = string (BS.pack l) *> EOL.parser *> return () <?> printf "line «%s»" l
magicNumber :: String
magicNumber :: ByteString
magicNumber = "%PDF-"
eofMarker :: ByteString
@ -53,20 +57,17 @@ eofMarker = "%%EOF"
whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 "
whiteSpace :: Parser u ()
whiteSpace = oneOf whiteSpaceCharset *> return () <|> EOL.parser *> return ()
blank :: Parser u ()
blank = skipMany whiteSpace
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
delimiterCharset :: String
delimiterCharset = "()<>[]{}/%"
regular :: Parser u Char
regular = noneOf $ EOL.charset ++ whiteSpaceCharset ++ delimiterCharset
regular :: Char -> Bool
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
integer :: (Read a, Num a) => Parser u a
integer = read <$> many1 digit <* whiteSpace
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
-------------------------------------
-- OBJECTS
@ -76,7 +77,8 @@ integer = read <$> many1 digit <* whiteSpace
-- Boolean
--
boolean :: Parser u Bool
boolean = (string "true" *> return True) <|> (string "false" *> return False)
boolean =
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
--
-- Number
@ -90,12 +92,13 @@ instance Output Number where
_ -> printf "%f" f
number :: Parser u Number
number = Number . read <$>
(mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart))
number = Number . read . BS.unpack <$>
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
<?> "number"
where
sign = string "-" <|> option "" (char '+' >> return "")
integerPart = mappend <$> many1 digit <*> option "" floatPart
floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit)
integerPart = mappend <$> decNumber <*> option "" floatPart
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
--
-- StringObject
@ -108,14 +111,18 @@ instance Output StringObject where
stringObject :: Parser u StringObject
stringObject =
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
<?> "string object (literal or hexadecimal)"
where
literalStringBlock = many1 (noneOf "\\()") <|> matchingParenthesis <|> escapedChar
literalString = many literalStringBlock
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
normalChar = not . (`elem` ("\\()" :: String))
matchingParenthesis =
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
escapedChar =
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
--
-- Name
@ -126,13 +133,14 @@ instance Output Name where
output (Name n) = "/" `mappend` Output.string n
name :: Parser u Name
name = Name <$> (char '/' *> many regular)
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
--
-- Array
--
array :: Parser u [DirectObject]
array = char '[' *> blank *> directObject `endBy` blank <* char ']'
array =
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
--
-- Dictionary
@ -149,16 +157,16 @@ instance Output Dictionary where
dictionary :: Parser u Dictionary
dictionary =
try (string "<<" *> blank *> keyValPairs <* string ">>")
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
where
keyVal = (,) <$> name <* blank <*> directObject
keyValPairs = Map.fromList <$> keyVal `endBy` blank
keyValPairs = Map.fromList <$> keyVal `sepBy` blank <* blank
--
-- Null
--
nullObject :: Parser u ()
nullObject = string "null" *> return ()
nullObject = string "null" *> return () <?> "null object"
--
-- Reference
@ -169,7 +177,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
} deriving Show
reference :: Parser u IndirectObjCoordinates
reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R'
reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R' <?> "reference to an object"
--
-- DirectObject
@ -198,14 +206,15 @@ instance Output DirectObject where
directObject :: Parser u DirectObject
directObject =
Boolean <$> try boolean
<|> Reference <$> try reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> try number
<|> StringObject <$> try stringObject
<|> NameObject <$> try name
<|> Array <$> try array
<|> Dictionary <$> try dictionary
<|> const Null <$> try nullObject
Boolean <$> boolean
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> number
<|> StringObject <$> stringObject
<|> NameObject <$> name
<|> Array <$> array
<|> Dictionary <$> dictionary
<|> const Null <$> nullObject
<?> "direct object"
--
-- Object
@ -226,7 +235,6 @@ instance Output Object where
`nextLine` byteString streamContent
`mappend` "endstream"
--
-- Occurrence
--
@ -249,10 +257,10 @@ outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNum
-- XRefEntry
--
data XRefEntry = InUse {
offset :: Int64
offset :: Int
, generation :: Int
} | Free {
nextFree :: Int64
nextFree :: Int
, generation :: Int
} deriving Show
@ -265,14 +273,13 @@ instance Output XRefEntry where
entry :: Parser u XRefEntry
entry = do
(big, small) <- (,) <$> integer <*> integer
(inUse big small <|> free big small) <* blank
(inUse big small <|> free big small <?> "XRef entry") <* blank
where
inUse :: Int64 -> Int -> Parser u XRefEntry
inUse :: Int -> Int -> Parser u XRefEntry
inUse offset generation = char 'n' *> return (InUse {offset, generation})
free :: Int64 -> Int -> Parser u XRefEntry
free :: Int -> Int -> Parser u XRefEntry
free nextFree generation = char 'f' *> return (Free {nextFree, generation})
--
-- XRefSubSection
--
@ -289,7 +296,7 @@ instance Output XRefSubSection where
xrefSubSection :: Parser u XRefSubSection
xrefSubSection = do
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
return $ XRefSubSection {firstObjectId, entriesNumber, entries}
@ -299,7 +306,7 @@ type XRefSection = [XRefSubSection]
-- Structure
--
data InputStructure = InputStructure {
startOffset :: Int64
startOffset :: Int
, inputStructure :: Structure
}
@ -314,7 +321,7 @@ structure =
<$> (line "xref" *> xrefSubSection `sepBy` EOL.parser)
<*> (line "trailer" *> dictionary <* EOL.parser)
updateXrefs :: XRefSection -> Map Offset Int64 -> (XRefSection, Int64)
updateXrefs :: XRefSection -> Map Offset Int -> (XRefSection, Int)
updateXrefs xrefSection offsets = (
updateSubSection <$> xrefSection
, offsets ! StartXRef

View file

@ -11,7 +11,6 @@ module PDF.Output (
, char
, getOffsets
, join
, lift
, newLine
, nextLine
, saveOffset
@ -19,10 +18,11 @@ module PDF.Output (
, render
) where
import Data.ByteString.Builder (Builder, char8, lazyByteString, string8, toLazyByteString)
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (length)
import Data.Int (Int64)
import Data.ByteString.Builder (Builder, char8, string8, toLazyByteString)
import qualified Data.ByteString.Builder as B (byteString)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (length)
import qualified Data.ByteString.Lazy as Lazy (ByteString)
import Data.Map (Map)
import qualified Data.Map as Map (singleton)
import Data.String (IsString(..))
@ -31,7 +31,7 @@ import qualified PDF.EOL as EOL (Style(..))
data Offset = StartXRef | ObjectId Int deriving (Eq, Ord)
newtype OContext a = OContext (RWS EOL.Style (Map Offset Int64) Int64 a)
newtype OContext a = OContext (RWS EOL.Style (Map Offset Int) Int a)
type OBuilder = OContext Builder
instance Functor OContext where
@ -51,7 +51,7 @@ saveOffset offset = OContext $
lift :: (a -> Builder) -> a -> OBuilder
lift f a = return (f a)
getOffsets :: OBuilder -> OContext (OBuilder, Map Offset Int64)
getOffsets :: OBuilder -> OContext (OBuilder, Map Offset Int)
getOffsets (OContext builder) =
OContext (listen builder >>= \(a, w) -> return (return a, w))
@ -108,9 +108,9 @@ string :: String -> OBuilder
string s = lift string8 s <* OContext (modify (+ toEnum (length s)))
byteString :: ByteString -> OBuilder
byteString bs = lift lazyByteString bs <* OContext (modify (+ BS.length bs))
byteString bs = lift B.byteString bs <* OContext (modify (+ BS.length bs))
render :: Output a => EOL.Style -> a -> ByteString
render :: Output a => EOL.Style -> a -> Lazy.ByteString
render eolStyle a =
let OContext builder = output a in
let (outputByteString, _, _) = runRWS builder eolStyle 0 in

98
src/PDF/Parser.hs Normal file
View file

@ -0,0 +1,98 @@
module PDF.Parser (
Parser
, (<?>)
, block
, char
, choice
, count
, decNumber
, hexNumber
, many
, octDigit
, on
, oneOf
, option
, runParser
, sepBy
, string
, takeAll
, takeAll1
) where
import Control.Applicative ((<|>), empty)
import Control.Monad.State (StateT(..), evalStateT)
import Control.Monad.Trans (lift)
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
)
import Data.ByteString (ByteString)
import Data.Char (toLower)
import Data.Set (Set)
import qualified Data.Set as Set (fromList, member, unions)
type Parser s = StateT s Atto.Parser
(<?>) :: Parser s a -> String -> Parser s a
(<?>) parser debugMessage = parser <|> fail debugMessage
block :: Int -> Parser s ByteString
block = lift . Atto.take
char :: Char -> Parser s Char
char = lift . Atto.char
choice :: [Parser s a] -> Parser s a
choice = foldr (<|>) empty
count :: Int -> Parser s a -> Parser s [a]
count 0 _ = return []
count n p = (:) <$> p <*> count (n-1) p
decNumber :: Parser s ByteString
decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
digits :: Set Char
digits = Set.fromList ['0'..'9']
hexDigits :: Set Char
hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
where
af = ['A'..'F']
hexNumber :: Parser s ByteString
hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
many :: Parser s a -> Parser s [a]
many parser = (:) <$> parser <*> many parser <|> return []
octDigit :: Parser s Char
octDigit = oneOf ['0'..'7']
on :: Parser s a -> ByteString -> Parser s (Either String a)
on (StateT parserF) input = StateT $ \state ->
case Atto.parseOnly (parserF state) input of
Left errorMsg -> return (Left errorMsg, state)
Right (result, newState) -> return (Right result, newState)
oneOf :: String -> Parser s Char
oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
option :: a -> Parser s a -> Parser s a
option defaultValue p = p <|> pure defaultValue
runParser :: Parser s a -> s -> ByteString -> Either String a
runParser parser initState =
Atto.parseOnly (evalStateT parser initState)
sepBy :: Parser s a -> Parser s b -> Parser s [a]
sepBy parser separator =
option [] $ (:) <$> parser <*> many (separator *> parser)
string :: ByteString -> Parser s ByteString
string = lift . Atto.string
takeAll :: (Char -> Bool) -> Parser s ByteString
takeAll = lift . Atto.takeWhile
takeAll1 :: (Char -> Bool) -> Parser s ByteString
takeAll1 = lift . Atto.takeWhile1