From 11cb6504d741c3859c46d25ac24ca6341108cb01 Mon Sep 17 00:00:00 2001 From: Tissevert Date: Fri, 24 May 2019 10:48:09 +0200 Subject: [PATCH] Go strict ByteStrings with attoparsec --- ChangeLog.md | 5 ++ Hufflepdf.cabal | 11 +-- src/Data/ByteString/Char8/Util.hs | 16 ++++ src/Data/ByteString/Lazy/Char8/Util.hs | 17 ---- src/PDF.hs | 44 +++++----- src/PDF/Body.hs | 46 +++++------ src/PDF/EOL.hs | 9 ++- src/PDF/Object.hs | 107 +++++++++++++------------ src/PDF/Output.hs | 18 ++--- src/PDF/Parser.hs | 98 ++++++++++++++++++++++ 10 files changed, 237 insertions(+), 134 deletions(-) create mode 100644 src/Data/ByteString/Char8/Util.hs delete mode 100644 src/Data/ByteString/Lazy/Char8/Util.hs create mode 100644 src/PDF/Parser.hs diff --git a/ChangeLog.md b/ChangeLog.md index 65f4d13..6bde38f 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,5 +1,10 @@ # Revision history for Hufflepdf +## 0.1.1.0 -- 2019-05-23 + +* Rewrite the parser using attoparsec instead of parsec (about 1.5x faster) +* Use strict ByteStrings instead of lazy ones for an additional gain in performance, simpler type interface (lazy ByteStrings are now used only for output because that's required by the ByteString Builder) and fewer import lines + ## 0.1.0.0 -- 2019-05-18 * First version. Released on an unsuspecting world. diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index 2236845..c695de0 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -17,17 +17,18 @@ cabal-version: >=1.10 library exposed-modules: PDF - , PDF.Object - other-modules: PDF.Body , PDF.EOL + , PDF.Object , PDF.Output - , Data.ByteString.Lazy.Char8.Util + other-modules: Data.ByteString.Char8.Util + , PDF.Body + , PDF.Parser -- other-extensions: - build-depends: base >=4.9 && <4.13 + build-depends: attoparsec + , base >=4.9 && <4.13 , bytestring , containers , mtl - , parsec hs-source-dirs: src ghc-options: -Wall default-language: Haskell2010 diff --git a/src/Data/ByteString/Char8/Util.hs b/src/Data/ByteString/Char8/Util.hs new file mode 100644 index 0000000..c091351 --- /dev/null +++ b/src/Data/ByteString/Char8/Util.hs @@ -0,0 +1,16 @@ +module Data.ByteString.Char8.Util ( + previous + , subBS + ) where + +import Data.ByteString.Char8 (ByteString) +import qualified Data.ByteString.Char8 as BS (drop, index, take) +import Prelude hiding (length) + +previous :: Char -> Int -> ByteString -> Int +previous char position byteString + | BS.index byteString position == char = position + | otherwise = previous char (position - 1) byteString + +subBS :: Int -> Int -> ByteString -> ByteString +subBS offset length = BS.take length . BS.drop offset diff --git a/src/Data/ByteString/Lazy/Char8/Util.hs b/src/Data/ByteString/Lazy/Char8/Util.hs deleted file mode 100644 index 831be25..0000000 --- a/src/Data/ByteString/Lazy/Char8/Util.hs +++ /dev/null @@ -1,17 +0,0 @@ -module Data.ByteString.Lazy.Char8.Util ( - previous - , subBS - ) where - -import Data.ByteString.Lazy.Char8 (ByteString) -import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take) -import Data.Int (Int64) -import Prelude hiding (length) - -previous :: Char -> Int64 -> ByteString -> Int64 -previous char position byteString - | BS.index byteString position == char = position - | otherwise = previous char (position - 1) byteString - -subBS :: Int64 -> Int64 -> ByteString -> ByteString -subBS offset length = BS.take length . BS.drop offset diff --git a/src/PDF.hs b/src/PDF.hs index 8c96efd..6c3dffd 100644 --- a/src/PDF.hs +++ b/src/PDF.hs @@ -5,12 +5,12 @@ module PDF ( , render ) where -import Data.ByteString.Lazy.Char8 (ByteString) -import qualified Data.ByteString.Lazy.Char8 as BS ( +import Data.ByteString.Char8 (ByteString) +import qualified Data.ByteString.Char8 as BS ( drop, findIndex, head, isPrefixOf, last, length, span, unpack ) -import Data.ByteString.Lazy.Char8.Util (previous, subBS) -import Data.Int (Int64) +import Data.ByteString.Char8.Util (previous, subBS) +import qualified Data.ByteString.Lazy as Lazy (ByteString) import qualified Data.Map as Map (lookup) import PDF.Body (populate) import qualified PDF.EOL as EOL (Style(..), charset, parser) @@ -21,10 +21,7 @@ import PDF.Object ( ) import qualified PDF.Output as Output (render, string) import PDF.Output (Output(..), nextLine) -import Text.Parsec -import Text.Parsec.ByteString.Lazy (Parser) -import Text.Parsec.Pos (newPos) -import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage) +import PDF.Parser (Parser, runParser, string, takeAll) import Text.Printf (printf) data Document = Document { @@ -38,20 +35,17 @@ instance Output Document where Output.string (printf "%%PDF-%s" pdfVersion) `nextLine` output contents -render :: Document -> ByteString +render :: Document -> Lazy.ByteString render document@(Document {eolStyle}) = Output.render eolStyle document -version :: Parser String -version = string magicNumber *> many (noneOf EOL.charset) +version :: Parser () String +version = BS.unpack <$> + (string magicNumber *> takeAll (not . (`elem` EOL.charset))) -parseError :: String -> Either ParseError a -parseError errorMessage = - Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0) +check :: Bool -> String -> Either String () +check test errorMessage = if test then return () else Left errorMessage -check :: Bool -> String -> Either ParseError () -check test errorMessage = if test then return () else parseError errorMessage - -readStartXref :: EOL.Style -> ByteString -> Either ParseError Int64 +readStartXref :: EOL.Style -> ByteString -> Either String Int readStartXref eolStyle input = check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input)) "Badly formed document : missing EOF marker at the end" @@ -68,7 +62,7 @@ readStartXref eolStyle input = previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1 startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition -findNextLine :: ByteString -> Int64 +findNextLine :: ByteString -> Int findNextLine input = let (line, eolPrefixed) = BS.span notInEol input in let nextNotInEol = BS.findIndex notInEol eolPrefixed in @@ -76,7 +70,7 @@ findNextLine input = where notInEol = not . (`elem` EOL.charset) -findNextSection :: Int64 -> ByteString -> Int64 +findNextSection :: Int -> ByteString -> Int findNextSection offset input = case BS.findIndex (== BS.head eofMarker) input of Nothing -> 0 @@ -87,9 +81,9 @@ findNextSection offset input = then newOffset + findNextLine newInput else findNextSection (newOffset + 1) (BS.drop 1 newInput) -readStructures :: Int64 -> ByteString -> Either ParseError [InputStructure] +readStructures :: Int -> ByteString -> Either String [InputStructure] readStructures startXref input = - parse structure ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow + runParser structure () (BS.drop startXref input) >>= stopOrFollow where stopOrFollow s@(Structure {trailer}) = case Map.lookup (Name "Prev") trailer of @@ -98,11 +92,11 @@ readStructures startXref input = let offset = truncate newStartXref in let startOffset = findNextSection offset (BS.drop offset input) in (InputStructure startOffset s:) <$> (readStructures offset input) - Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v + Just v -> Left $ "Bad value for Prev entry in trailer: " ++ show v -parseDocument :: ByteString -> Either ParseError Document +parseDocument :: ByteString -> Either String Document parseDocument input = do - (pdfVersion, eolStyle) <- parse ((,) <$> version <*> EOL.parser) "" input + (pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input startXref <- readStartXref eolStyle input structuresRead <- readStructures startXref input let contents = populate input <$> structuresRead diff --git a/src/PDF/Body.hs b/src/PDF/Body.hs index 23d4a9c..53fa6f0 100644 --- a/src/PDF/Body.hs +++ b/src/PDF/Body.hs @@ -1,21 +1,23 @@ {-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE OverloadedStrings #-} module PDF.Body ( populate ) where -import Data.ByteString.Lazy.Char8 (ByteString) -import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack) -import Data.Int (Int64) +import Control.Applicative ((<|>)) +import Control.Monad.State (get, gets, modify) +import Data.ByteString.Char8 (ByteString) +import qualified Data.ByteString.Char8 as BS (drop, unpack) import Data.Map ((!)) import qualified Data.Map as Map (empty, insert, lookup) import qualified PDF.EOL as EOL (charset, parser) import PDF.Object ( Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..) , InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..) - , Parser, Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..) + , Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..) , blank, dictionary, directObject, integer, line ) -import Text.Parsec +import PDF.Parser (Parser, block, char, on, runParser, takeAll) data UserState = UserState { input :: ByteString @@ -26,7 +28,7 @@ data UserState = UserState { type SParser = Parser UserState modifyFlow :: (Flow -> Flow) -> SParser () -modifyFlow f = modifyState $ \state -> state {flow = f $ flow state} +modifyFlow f = modify $ \state -> state {flow = f $ flow state} addObject :: Int -> Object -> SParser () addObject objectId newObject = modifyFlow $ \flow -> flow { @@ -39,9 +41,10 @@ pushOccurrence newOccurrence = modifyFlow $ \flow -> flow { } comment :: Parser u String -comment = char '%' *> many (noneOf EOL.charset) <* EOL.parser +comment = BS.unpack <$> + (char '%' *> takeAll (not . (`elem` EOL.charset)) <* EOL.parser) -lookupOffset :: Int -> XRefSection -> Maybe Int64 +lookupOffset :: Int -> XRefSection -> Maybe Int lookupOffset _ [] = Nothing lookupOffset objectId (xrefSubSection:others) = let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in @@ -53,25 +56,20 @@ lookupOffset objectId (xrefSubSection:others) = _ -> Nothing else lookupOffset objectId others -getOffset :: Int -> SParser Int64 +getOffset :: Int -> SParser Int getOffset objectId = do - table <- xreferences <$> getState + table <- gets xreferences case lookupOffset objectId table of Nothing -> fail $ "obj " ++ show objectId ++ " is referenced but missing in XRef table" Just offset -> return offset -on :: Monad m => ParsecT s u m a -> s -> ParsecT s u m a -on parser input = do - originalInput <- getInput - setInput input >> parser <* setInput originalInput - loadNumber :: Int -> SParser Float loadNumber objectId = do offset <- getOffset objectId - objectStart <- BS.drop offset . input <$> getState - indirectObjCoordinates `on` objectStart >> return () - objectValue <- (!objectId) . tmpObjects . flow <$> getState + objectStart <- BS.drop offset <$> gets input + indirectObjCoordinates `on` (objectStart :: ByteString) >> return () + objectValue <- (!objectId) . tmpObjects <$> gets flow case objectValue of Direct (NumberObject (Number n)) -> return n obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number" @@ -83,7 +81,7 @@ getSize :: Maybe DirectObject -> SParser Float getSize Nothing = fail "Missing '/Length' key on stream" getSize (Just (NumberObject (Number size))) = return size getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do - Flow {tmpObjects} <- flow <$> getState + Flow {tmpObjects} <- gets flow case Map.lookup objectId tmpObjects of Nothing -> loadNumber objectId Just (Direct (NumberObject (Number size))) -> return size @@ -92,13 +90,13 @@ getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length" streamObject :: SParser Object -streamObject = try $ do +streamObject = do header <- dictionary <* blank size <- getSize (Map.lookup (Name "Length") header) - streamContent <- BS.pack <$> stream (truncate size) + streamContent <- stream (truncate size) return $ Stream {header, streamContent} where - stream size = line "stream" *> count size anyChar <* blank <* line "endstream" + stream size = line "stream" *> block size <* blank <* line "endstream" object :: SParser Object object = streamObject <|> Direct <$> directObject @@ -117,7 +115,7 @@ occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates populate :: ByteString -> InputStructure -> Content populate input structure = let bodyInput = BS.drop (startOffset structure) input in - case runParser recurseOnOccurrences initialState "" bodyInput of + case runParser recurseOnOccurrences initialState bodyInput of Left _ -> Content {occurrences = [], objects = Map.empty, docStructure} Right finalState -> let Flow {occurrencesStack, tmpObjects} = flow finalState in @@ -135,4 +133,4 @@ populate input structure = recurseOnOccurrences :: SParser UserState recurseOnOccurrences = - (occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState + (occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> get diff --git a/src/PDF/EOL.hs b/src/PDF/EOL.hs index 4d88751..0493ea0 100644 --- a/src/PDF/EOL.hs +++ b/src/PDF/EOL.hs @@ -1,19 +1,20 @@ +{-# LANGUAGE OverloadedStrings #-} module PDF.EOL ( Style(..) , charset , parser ) where -import Data.ByteString.Lazy.Char8 (ByteString) -import Text.Parsec ((<|>), Parsec, string, try) +import Control.Applicative ((<|>)) +import PDF.Parser (Parser, string) data Style = CR | LF | CRLF deriving Show charset :: String charset = "\r\n" -parser :: Parsec ByteString u Style +parser :: Parser s Style parser = - try (string "\r\n" >> return CRLF) + (string "\r\n" >> return CRLF) <|> (string "\r" >> return CR) <|> (string "\n" >> return LF) diff --git a/src/PDF/Object.hs b/src/PDF/Object.hs index f67d130..17c37e4 100644 --- a/src/PDF/Object.hs +++ b/src/PDF/Object.hs @@ -11,7 +11,6 @@ module PDF.Object ( , Number(..) , Object(..) , Occurrence(..) - , Parser , Structure(..) , XRefEntry(..) , XRefSection @@ -26,8 +25,11 @@ module PDF.Object ( , structure ) where -import Data.ByteString.Lazy.Char8 (ByteString) -import Data.Int (Int64) +import Control.Applicative ((<|>)) +import Data.ByteString.Char8 (ByteString) +import qualified Data.ByteString.Char8 as BS ( + concat, cons, pack, singleton, unpack + ) import Data.Map (Map, (!), mapWithKey) import qualified Data.Map as Map (elems, fromList, toList) import qualified PDF.EOL as EOL (charset, parser) @@ -36,15 +38,17 @@ import PDF.Output ( OBuilder, Offset(..), Output(..) , byteString, getOffsets, join, newLine, nextLine, saveOffset ) -import Text.Parsec +import PDF.Parser ( + Parser, () + , char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option + , sepBy, string, takeAll, takeAll1 + ) import Text.Printf (printf) -type Parser u = Parsec ByteString u - line :: String -> Parser u () -line l = string l *> EOL.parser *> return () +line l = string (BS.pack l) *> EOL.parser *> return () printf "line «%s»" l -magicNumber :: String +magicNumber :: ByteString magicNumber = "%PDF-" eofMarker :: ByteString @@ -53,20 +57,17 @@ eofMarker = "%%EOF" whiteSpaceCharset :: String whiteSpaceCharset = "\0\t\12 " -whiteSpace :: Parser u () -whiteSpace = oneOf whiteSpaceCharset *> return () <|> EOL.parser *> return () - blank :: Parser u () -blank = skipMany whiteSpace +blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return () delimiterCharset :: String delimiterCharset = "()<>[]{}/%" -regular :: Parser u Char -regular = noneOf $ EOL.charset ++ whiteSpaceCharset ++ delimiterCharset +regular :: Char -> Bool +regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset)) integer :: (Read a, Num a) => Parser u a -integer = read <$> many1 digit <* whiteSpace +integer = read . BS.unpack <$> decNumber <* blank "decimal integer" ------------------------------------- -- OBJECTS @@ -76,7 +77,8 @@ integer = read <$> many1 digit <* whiteSpace -- Boolean -- boolean :: Parser u Bool -boolean = (string "true" *> return True) <|> (string "false" *> return False) +boolean = + (string "true" *> return True) <|> (string "false" *> return False) "boolean" -- -- Number @@ -90,12 +92,13 @@ instance Output Number where _ -> printf "%f" f number :: Parser u Number -number = Number . read <$> - (mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart)) +number = Number . read . BS.unpack <$> + (mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart)) + "number" where sign = string "-" <|> option "" (char '+' >> return "") - integerPart = mappend <$> many1 digit <*> option "" floatPart - floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit) + integerPart = mappend <$> decNumber <*> option "" floatPart + floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber) -- -- StringObject @@ -108,14 +111,18 @@ instance Output StringObject where stringObject :: Parser u StringObject stringObject = - Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')') - <|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>') + Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')') + <|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>') + "string object (literal or hexadecimal)" where - literalStringBlock = many1 (noneOf "\\()") <|> matchingParenthesis <|> escapedChar + literalString = many literalStringBlock + literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar + normalChar = not . (`elem` ("\\()" :: String)) matchingParenthesis = - (++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")" - escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode) - octalCode = choice $ (\n -> count n octDigit) <$> [1..3] + mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")" + escapedChar = + BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode) + octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3] -- -- Name @@ -126,13 +133,14 @@ instance Output Name where output (Name n) = "/" `mappend` Output.string n name :: Parser u Name -name = Name <$> (char '/' *> many regular) +name = Name . BS.unpack <$> (char '/' *> takeAll regular) "name" -- -- Array -- array :: Parser u [DirectObject] -array = char '[' *> blank *> directObject `endBy` blank <* char ']' +array = + char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' "array" -- -- Dictionary @@ -149,16 +157,16 @@ instance Output Dictionary where dictionary :: Parser u Dictionary dictionary = - try (string "<<" *> blank *> keyValPairs <* string ">>") + string "<<" *> blank *> keyValPairs <* string ">>" "dictionary" where keyVal = (,) <$> name <* blank <*> directObject - keyValPairs = Map.fromList <$> keyVal `endBy` blank + keyValPairs = Map.fromList <$> keyVal `sepBy` blank <* blank -- -- Null -- nullObject :: Parser u () -nullObject = string "null" *> return () +nullObject = string "null" *> return () "null object" -- -- Reference @@ -169,7 +177,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates { } deriving Show reference :: Parser u IndirectObjCoordinates -reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R' +reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R' "reference to an object" -- -- DirectObject @@ -198,14 +206,15 @@ instance Output DirectObject where directObject :: Parser u DirectObject directObject = - Boolean <$> try boolean - <|> Reference <$> try reference {- defined before Number because Number is a prefix of it -} - <|> NumberObject <$> try number - <|> StringObject <$> try stringObject - <|> NameObject <$> try name - <|> Array <$> try array - <|> Dictionary <$> try dictionary - <|> const Null <$> try nullObject + Boolean <$> boolean + <|> Reference <$> reference {- defined before Number because Number is a prefix of it -} + <|> NumberObject <$> number + <|> StringObject <$> stringObject + <|> NameObject <$> name + <|> Array <$> array + <|> Dictionary <$> dictionary + <|> const Null <$> nullObject + "direct object" -- -- Object @@ -226,7 +235,6 @@ instance Output Object where `nextLine` byteString streamContent `mappend` "endstream" - -- -- Occurrence -- @@ -249,10 +257,10 @@ outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNum -- XRefEntry -- data XRefEntry = InUse { - offset :: Int64 + offset :: Int , generation :: Int } | Free { - nextFree :: Int64 + nextFree :: Int , generation :: Int } deriving Show @@ -265,14 +273,13 @@ instance Output XRefEntry where entry :: Parser u XRefEntry entry = do (big, small) <- (,) <$> integer <*> integer - (inUse big small <|> free big small) <* blank + (inUse big small <|> free big small "XRef entry") <* blank where - inUse :: Int64 -> Int -> Parser u XRefEntry + inUse :: Int -> Int -> Parser u XRefEntry inUse offset generation = char 'n' *> return (InUse {offset, generation}) - free :: Int64 -> Int -> Parser u XRefEntry + free :: Int -> Int -> Parser u XRefEntry free nextFree generation = char 'f' *> return (Free {nextFree, generation}) - -- -- XRefSubSection -- @@ -289,7 +296,7 @@ instance Output XRefSubSection where xrefSubSection :: Parser u XRefSubSection xrefSubSection = do - (firstObjectId, entriesNumber) <- (,) <$> integer <*> integer + (firstObjectId, entriesNumber) <- (,) <$> integer <*> integer "XRef subsection" entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry return $ XRefSubSection {firstObjectId, entriesNumber, entries} @@ -299,7 +306,7 @@ type XRefSection = [XRefSubSection] -- Structure -- data InputStructure = InputStructure { - startOffset :: Int64 + startOffset :: Int , inputStructure :: Structure } @@ -314,7 +321,7 @@ structure = <$> (line "xref" *> xrefSubSection `sepBy` EOL.parser) <*> (line "trailer" *> dictionary <* EOL.parser) -updateXrefs :: XRefSection -> Map Offset Int64 -> (XRefSection, Int64) +updateXrefs :: XRefSection -> Map Offset Int -> (XRefSection, Int) updateXrefs xrefSection offsets = ( updateSubSection <$> xrefSection , offsets ! StartXRef diff --git a/src/PDF/Output.hs b/src/PDF/Output.hs index bcbb01a..27a77f1 100644 --- a/src/PDF/Output.hs +++ b/src/PDF/Output.hs @@ -11,7 +11,6 @@ module PDF.Output ( , char , getOffsets , join - , lift , newLine , nextLine , saveOffset @@ -19,10 +18,11 @@ module PDF.Output ( , render ) where -import Data.ByteString.Builder (Builder, char8, lazyByteString, string8, toLazyByteString) -import Data.ByteString.Lazy.Char8 (ByteString) -import qualified Data.ByteString.Lazy.Char8 as BS (length) -import Data.Int (Int64) +import Data.ByteString.Builder (Builder, char8, string8, toLazyByteString) +import qualified Data.ByteString.Builder as B (byteString) +import Data.ByteString (ByteString) +import qualified Data.ByteString as BS (length) +import qualified Data.ByteString.Lazy as Lazy (ByteString) import Data.Map (Map) import qualified Data.Map as Map (singleton) import Data.String (IsString(..)) @@ -31,7 +31,7 @@ import qualified PDF.EOL as EOL (Style(..)) data Offset = StartXRef | ObjectId Int deriving (Eq, Ord) -newtype OContext a = OContext (RWS EOL.Style (Map Offset Int64) Int64 a) +newtype OContext a = OContext (RWS EOL.Style (Map Offset Int) Int a) type OBuilder = OContext Builder instance Functor OContext where @@ -51,7 +51,7 @@ saveOffset offset = OContext $ lift :: (a -> Builder) -> a -> OBuilder lift f a = return (f a) -getOffsets :: OBuilder -> OContext (OBuilder, Map Offset Int64) +getOffsets :: OBuilder -> OContext (OBuilder, Map Offset Int) getOffsets (OContext builder) = OContext (listen builder >>= \(a, w) -> return (return a, w)) @@ -108,9 +108,9 @@ string :: String -> OBuilder string s = lift string8 s <* OContext (modify (+ toEnum (length s))) byteString :: ByteString -> OBuilder -byteString bs = lift lazyByteString bs <* OContext (modify (+ BS.length bs)) +byteString bs = lift B.byteString bs <* OContext (modify (+ BS.length bs)) -render :: Output a => EOL.Style -> a -> ByteString +render :: Output a => EOL.Style -> a -> Lazy.ByteString render eolStyle a = let OContext builder = output a in let (outputByteString, _, _) = runRWS builder eolStyle 0 in diff --git a/src/PDF/Parser.hs b/src/PDF/Parser.hs new file mode 100644 index 0000000..3f170e7 --- /dev/null +++ b/src/PDF/Parser.hs @@ -0,0 +1,98 @@ +module PDF.Parser ( + Parser + , () + , block + , char + , choice + , count + , decNumber + , hexNumber + , many + , octDigit + , on + , oneOf + , option + , runParser + , sepBy + , string + , takeAll + , takeAll1 + ) where + +import Control.Applicative ((<|>), empty) +import Control.Monad.State (StateT(..), evalStateT) +import Control.Monad.Trans (lift) +import qualified Data.Attoparsec.ByteString.Char8 as Atto ( + Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1 + ) +import Data.ByteString (ByteString) +import Data.Char (toLower) +import Data.Set (Set) +import qualified Data.Set as Set (fromList, member, unions) + +type Parser s = StateT s Atto.Parser + +() :: Parser s a -> String -> Parser s a +() parser debugMessage = parser <|> fail debugMessage + +block :: Int -> Parser s ByteString +block = lift . Atto.take + +char :: Char -> Parser s Char +char = lift . Atto.char + +choice :: [Parser s a] -> Parser s a +choice = foldr (<|>) empty + +count :: Int -> Parser s a -> Parser s [a] +count 0 _ = return [] +count n p = (:) <$> p <*> count (n-1) p + +decNumber :: Parser s ByteString +decNumber = lift $ Atto.takeWhile1 (`Set.member` digits) + +digits :: Set Char +digits = Set.fromList ['0'..'9'] + +hexDigits :: Set Char +hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af] + where + af = ['A'..'F'] + +hexNumber :: Parser s ByteString +hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits) + +many :: Parser s a -> Parser s [a] +many parser = (:) <$> parser <*> many parser <|> return [] + +octDigit :: Parser s Char +octDigit = oneOf ['0'..'7'] + +on :: Parser s a -> ByteString -> Parser s (Either String a) +on (StateT parserF) input = StateT $ \state -> + case Atto.parseOnly (parserF state) input of + Left errorMsg -> return (Left errorMsg, state) + Right (result, newState) -> return (Right result, newState) + +oneOf :: String -> Parser s Char +oneOf charSet = lift $ Atto.satisfy (`elem` charSet) + +option :: a -> Parser s a -> Parser s a +option defaultValue p = p <|> pure defaultValue + +runParser :: Parser s a -> s -> ByteString -> Either String a +runParser parser initState = + Atto.parseOnly (evalStateT parser initState) + +sepBy :: Parser s a -> Parser s b -> Parser s [a] +sepBy parser separator = + option [] $ (:) <$> parser <*> many (separator *> parser) + +string :: ByteString -> Parser s ByteString +string = lift . Atto.string + +takeAll :: (Char -> Bool) -> Parser s ByteString +takeAll = lift . Atto.takeWhile + +takeAll1 :: (Char -> Bool) -> Parser s ByteString +takeAll1 = lift . Atto.takeWhile1