Go strict ByteStrings with attoparsec
This commit is contained in:
parent
0daa03d958
commit
11cb6504d7
10 changed files with 237 additions and 134 deletions
|
@ -1,5 +1,10 @@
|
||||||
# Revision history for Hufflepdf
|
# Revision history for Hufflepdf
|
||||||
|
|
||||||
|
## 0.1.1.0 -- 2019-05-23
|
||||||
|
|
||||||
|
* Rewrite the parser using attoparsec instead of parsec (about 1.5x faster)
|
||||||
|
* Use strict ByteStrings instead of lazy ones for an additional gain in performance, simpler type interface (lazy ByteStrings are now used only for output because that's required by the ByteString Builder) and fewer import lines
|
||||||
|
|
||||||
## 0.1.0.0 -- 2019-05-18
|
## 0.1.0.0 -- 2019-05-18
|
||||||
|
|
||||||
* First version. Released on an unsuspecting world.
|
* First version. Released on an unsuspecting world.
|
||||||
|
|
|
@ -17,17 +17,18 @@ cabal-version: >=1.10
|
||||||
|
|
||||||
library
|
library
|
||||||
exposed-modules: PDF
|
exposed-modules: PDF
|
||||||
, PDF.Object
|
|
||||||
other-modules: PDF.Body
|
|
||||||
, PDF.EOL
|
, PDF.EOL
|
||||||
|
, PDF.Object
|
||||||
, PDF.Output
|
, PDF.Output
|
||||||
, Data.ByteString.Lazy.Char8.Util
|
other-modules: Data.ByteString.Char8.Util
|
||||||
|
, PDF.Body
|
||||||
|
, PDF.Parser
|
||||||
-- other-extensions:
|
-- other-extensions:
|
||||||
build-depends: base >=4.9 && <4.13
|
build-depends: attoparsec
|
||||||
|
, base >=4.9 && <4.13
|
||||||
, bytestring
|
, bytestring
|
||||||
, containers
|
, containers
|
||||||
, mtl
|
, mtl
|
||||||
, parsec
|
|
||||||
hs-source-dirs: src
|
hs-source-dirs: src
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
16
src/Data/ByteString/Char8/Util.hs
Normal file
16
src/Data/ByteString/Char8/Util.hs
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
module Data.ByteString.Char8.Util (
|
||||||
|
previous
|
||||||
|
, subBS
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Data.ByteString.Char8 (ByteString)
|
||||||
|
import qualified Data.ByteString.Char8 as BS (drop, index, take)
|
||||||
|
import Prelude hiding (length)
|
||||||
|
|
||||||
|
previous :: Char -> Int -> ByteString -> Int
|
||||||
|
previous char position byteString
|
||||||
|
| BS.index byteString position == char = position
|
||||||
|
| otherwise = previous char (position - 1) byteString
|
||||||
|
|
||||||
|
subBS :: Int -> Int -> ByteString -> ByteString
|
||||||
|
subBS offset length = BS.take length . BS.drop offset
|
|
@ -1,17 +0,0 @@
|
||||||
module Data.ByteString.Lazy.Char8.Util (
|
|
||||||
previous
|
|
||||||
, subBS
|
|
||||||
) where
|
|
||||||
|
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
|
||||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
|
|
||||||
import Data.Int (Int64)
|
|
||||||
import Prelude hiding (length)
|
|
||||||
|
|
||||||
previous :: Char -> Int64 -> ByteString -> Int64
|
|
||||||
previous char position byteString
|
|
||||||
| BS.index byteString position == char = position
|
|
||||||
| otherwise = previous char (position - 1) byteString
|
|
||||||
|
|
||||||
subBS :: Int64 -> Int64 -> ByteString -> ByteString
|
|
||||||
subBS offset length = BS.take length . BS.drop offset
|
|
44
src/PDF.hs
44
src/PDF.hs
|
@ -5,12 +5,12 @@ module PDF (
|
||||||
, render
|
, render
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Data.ByteString.Char8 (ByteString)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as BS (
|
import qualified Data.ByteString.Char8 as BS (
|
||||||
drop, findIndex, head, isPrefixOf, last, length, span, unpack
|
drop, findIndex, head, isPrefixOf, last, length, span, unpack
|
||||||
)
|
)
|
||||||
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
|
import Data.ByteString.Char8.Util (previous, subBS)
|
||||||
import Data.Int (Int64)
|
import qualified Data.ByteString.Lazy as Lazy (ByteString)
|
||||||
import qualified Data.Map as Map (lookup)
|
import qualified Data.Map as Map (lookup)
|
||||||
import PDF.Body (populate)
|
import PDF.Body (populate)
|
||||||
import qualified PDF.EOL as EOL (Style(..), charset, parser)
|
import qualified PDF.EOL as EOL (Style(..), charset, parser)
|
||||||
|
@ -21,10 +21,7 @@ import PDF.Object (
|
||||||
)
|
)
|
||||||
import qualified PDF.Output as Output (render, string)
|
import qualified PDF.Output as Output (render, string)
|
||||||
import PDF.Output (Output(..), nextLine)
|
import PDF.Output (Output(..), nextLine)
|
||||||
import Text.Parsec
|
import PDF.Parser (Parser, runParser, string, takeAll)
|
||||||
import Text.Parsec.ByteString.Lazy (Parser)
|
|
||||||
import Text.Parsec.Pos (newPos)
|
|
||||||
import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
|
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
data Document = Document {
|
data Document = Document {
|
||||||
|
@ -38,20 +35,17 @@ instance Output Document where
|
||||||
Output.string (printf "%%PDF-%s" pdfVersion)
|
Output.string (printf "%%PDF-%s" pdfVersion)
|
||||||
`nextLine` output contents
|
`nextLine` output contents
|
||||||
|
|
||||||
render :: Document -> ByteString
|
render :: Document -> Lazy.ByteString
|
||||||
render document@(Document {eolStyle}) = Output.render eolStyle document
|
render document@(Document {eolStyle}) = Output.render eolStyle document
|
||||||
|
|
||||||
version :: Parser String
|
version :: Parser () String
|
||||||
version = string magicNumber *> many (noneOf EOL.charset)
|
version = BS.unpack <$>
|
||||||
|
(string magicNumber *> takeAll (not . (`elem` EOL.charset)))
|
||||||
|
|
||||||
parseError :: String -> Either ParseError a
|
check :: Bool -> String -> Either String ()
|
||||||
parseError errorMessage =
|
check test errorMessage = if test then return () else Left errorMessage
|
||||||
Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0)
|
|
||||||
|
|
||||||
check :: Bool -> String -> Either ParseError ()
|
readStartXref :: EOL.Style -> ByteString -> Either String Int
|
||||||
check test errorMessage = if test then return () else parseError errorMessage
|
|
||||||
|
|
||||||
readStartXref :: EOL.Style -> ByteString -> Either ParseError Int64
|
|
||||||
readStartXref eolStyle input =
|
readStartXref eolStyle input =
|
||||||
check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
|
check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
|
||||||
"Badly formed document : missing EOF marker at the end"
|
"Badly formed document : missing EOF marker at the end"
|
||||||
|
@ -68,7 +62,7 @@ readStartXref eolStyle input =
|
||||||
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
|
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
|
||||||
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
|
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
|
||||||
|
|
||||||
findNextLine :: ByteString -> Int64
|
findNextLine :: ByteString -> Int
|
||||||
findNextLine input =
|
findNextLine input =
|
||||||
let (line, eolPrefixed) = BS.span notInEol input in
|
let (line, eolPrefixed) = BS.span notInEol input in
|
||||||
let nextNotInEol = BS.findIndex notInEol eolPrefixed in
|
let nextNotInEol = BS.findIndex notInEol eolPrefixed in
|
||||||
|
@ -76,7 +70,7 @@ findNextLine input =
|
||||||
where
|
where
|
||||||
notInEol = not . (`elem` EOL.charset)
|
notInEol = not . (`elem` EOL.charset)
|
||||||
|
|
||||||
findNextSection :: Int64 -> ByteString -> Int64
|
findNextSection :: Int -> ByteString -> Int
|
||||||
findNextSection offset input =
|
findNextSection offset input =
|
||||||
case BS.findIndex (== BS.head eofMarker) input of
|
case BS.findIndex (== BS.head eofMarker) input of
|
||||||
Nothing -> 0
|
Nothing -> 0
|
||||||
|
@ -87,9 +81,9 @@ findNextSection offset input =
|
||||||
then newOffset + findNextLine newInput
|
then newOffset + findNextLine newInput
|
||||||
else findNextSection (newOffset + 1) (BS.drop 1 newInput)
|
else findNextSection (newOffset + 1) (BS.drop 1 newInput)
|
||||||
|
|
||||||
readStructures :: Int64 -> ByteString -> Either ParseError [InputStructure]
|
readStructures :: Int -> ByteString -> Either String [InputStructure]
|
||||||
readStructures startXref input =
|
readStructures startXref input =
|
||||||
parse structure ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
|
runParser structure () (BS.drop startXref input) >>= stopOrFollow
|
||||||
where
|
where
|
||||||
stopOrFollow s@(Structure {trailer}) =
|
stopOrFollow s@(Structure {trailer}) =
|
||||||
case Map.lookup (Name "Prev") trailer of
|
case Map.lookup (Name "Prev") trailer of
|
||||||
|
@ -98,11 +92,11 @@ readStructures startXref input =
|
||||||
let offset = truncate newStartXref in
|
let offset = truncate newStartXref in
|
||||||
let startOffset = findNextSection offset (BS.drop offset input) in
|
let startOffset = findNextSection offset (BS.drop offset input) in
|
||||||
(InputStructure startOffset s:) <$> (readStructures offset input)
|
(InputStructure startOffset s:) <$> (readStructures offset input)
|
||||||
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
|
Just v -> Left $ "Bad value for Prev entry in trailer: " ++ show v
|
||||||
|
|
||||||
parseDocument :: ByteString -> Either ParseError Document
|
parseDocument :: ByteString -> Either String Document
|
||||||
parseDocument input = do
|
parseDocument input = do
|
||||||
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> EOL.parser) "" input
|
(pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
|
||||||
startXref <- readStartXref eolStyle input
|
startXref <- readStartXref eolStyle input
|
||||||
structuresRead <- readStructures startXref input
|
structuresRead <- readStructures startXref input
|
||||||
let contents = populate input <$> structuresRead
|
let contents = populate input <$> structuresRead
|
||||||
|
|
|
@ -1,21 +1,23 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
module PDF.Body (
|
module PDF.Body (
|
||||||
populate
|
populate
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Control.Applicative ((<|>))
|
||||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
|
import Control.Monad.State (get, gets, modify)
|
||||||
import Data.Int (Int64)
|
import Data.ByteString.Char8 (ByteString)
|
||||||
|
import qualified Data.ByteString.Char8 as BS (drop, unpack)
|
||||||
import Data.Map ((!))
|
import Data.Map ((!))
|
||||||
import qualified Data.Map as Map (empty, insert, lookup)
|
import qualified Data.Map as Map (empty, insert, lookup)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
import PDF.Object (
|
import PDF.Object (
|
||||||
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
|
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
|
||||||
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
|
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
|
||||||
, Parser, Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..)
|
, Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..)
|
||||||
, blank, dictionary, directObject, integer, line
|
, blank, dictionary, directObject, integer, line
|
||||||
)
|
)
|
||||||
import Text.Parsec
|
import PDF.Parser (Parser, block, char, on, runParser, takeAll)
|
||||||
|
|
||||||
data UserState = UserState {
|
data UserState = UserState {
|
||||||
input :: ByteString
|
input :: ByteString
|
||||||
|
@ -26,7 +28,7 @@ data UserState = UserState {
|
||||||
type SParser = Parser UserState
|
type SParser = Parser UserState
|
||||||
|
|
||||||
modifyFlow :: (Flow -> Flow) -> SParser ()
|
modifyFlow :: (Flow -> Flow) -> SParser ()
|
||||||
modifyFlow f = modifyState $ \state -> state {flow = f $ flow state}
|
modifyFlow f = modify $ \state -> state {flow = f $ flow state}
|
||||||
|
|
||||||
addObject :: Int -> Object -> SParser ()
|
addObject :: Int -> Object -> SParser ()
|
||||||
addObject objectId newObject = modifyFlow $ \flow -> flow {
|
addObject objectId newObject = modifyFlow $ \flow -> flow {
|
||||||
|
@ -39,9 +41,10 @@ pushOccurrence newOccurrence = modifyFlow $ \flow -> flow {
|
||||||
}
|
}
|
||||||
|
|
||||||
comment :: Parser u String
|
comment :: Parser u String
|
||||||
comment = char '%' *> many (noneOf EOL.charset) <* EOL.parser
|
comment = BS.unpack <$>
|
||||||
|
(char '%' *> takeAll (not . (`elem` EOL.charset)) <* EOL.parser)
|
||||||
|
|
||||||
lookupOffset :: Int -> XRefSection -> Maybe Int64
|
lookupOffset :: Int -> XRefSection -> Maybe Int
|
||||||
lookupOffset _ [] = Nothing
|
lookupOffset _ [] = Nothing
|
||||||
lookupOffset objectId (xrefSubSection:others) =
|
lookupOffset objectId (xrefSubSection:others) =
|
||||||
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
|
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
|
||||||
|
@ -53,25 +56,20 @@ lookupOffset objectId (xrefSubSection:others) =
|
||||||
_ -> Nothing
|
_ -> Nothing
|
||||||
else lookupOffset objectId others
|
else lookupOffset objectId others
|
||||||
|
|
||||||
getOffset :: Int -> SParser Int64
|
getOffset :: Int -> SParser Int
|
||||||
getOffset objectId = do
|
getOffset objectId = do
|
||||||
table <- xreferences <$> getState
|
table <- gets xreferences
|
||||||
case lookupOffset objectId table of
|
case lookupOffset objectId table of
|
||||||
Nothing -> fail $
|
Nothing -> fail $
|
||||||
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
|
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
|
||||||
Just offset -> return offset
|
Just offset -> return offset
|
||||||
|
|
||||||
on :: Monad m => ParsecT s u m a -> s -> ParsecT s u m a
|
|
||||||
on parser input = do
|
|
||||||
originalInput <- getInput
|
|
||||||
setInput input >> parser <* setInput originalInput
|
|
||||||
|
|
||||||
loadNumber :: Int -> SParser Float
|
loadNumber :: Int -> SParser Float
|
||||||
loadNumber objectId = do
|
loadNumber objectId = do
|
||||||
offset <- getOffset objectId
|
offset <- getOffset objectId
|
||||||
objectStart <- BS.drop offset . input <$> getState
|
objectStart <- BS.drop offset <$> gets input
|
||||||
indirectObjCoordinates `on` objectStart >> return ()
|
indirectObjCoordinates `on` (objectStart :: ByteString) >> return ()
|
||||||
objectValue <- (!objectId) . tmpObjects . flow <$> getState
|
objectValue <- (!objectId) . tmpObjects <$> gets flow
|
||||||
case objectValue of
|
case objectValue of
|
||||||
Direct (NumberObject (Number n)) -> return n
|
Direct (NumberObject (Number n)) -> return n
|
||||||
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
|
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
|
||||||
|
@ -83,7 +81,7 @@ getSize :: Maybe DirectObject -> SParser Float
|
||||||
getSize Nothing = fail "Missing '/Length' key on stream"
|
getSize Nothing = fail "Missing '/Length' key on stream"
|
||||||
getSize (Just (NumberObject (Number size))) = return size
|
getSize (Just (NumberObject (Number size))) = return size
|
||||||
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
|
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
|
||||||
Flow {tmpObjects} <- flow <$> getState
|
Flow {tmpObjects} <- gets flow
|
||||||
case Map.lookup objectId tmpObjects of
|
case Map.lookup objectId tmpObjects of
|
||||||
Nothing -> loadNumber objectId
|
Nothing -> loadNumber objectId
|
||||||
Just (Direct (NumberObject (Number size))) -> return size
|
Just (Direct (NumberObject (Number size))) -> return size
|
||||||
|
@ -92,13 +90,13 @@ getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
|
||||||
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
|
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
|
||||||
|
|
||||||
streamObject :: SParser Object
|
streamObject :: SParser Object
|
||||||
streamObject = try $ do
|
streamObject = do
|
||||||
header <- dictionary <* blank
|
header <- dictionary <* blank
|
||||||
size <- getSize (Map.lookup (Name "Length") header)
|
size <- getSize (Map.lookup (Name "Length") header)
|
||||||
streamContent <- BS.pack <$> stream (truncate size)
|
streamContent <- stream (truncate size)
|
||||||
return $ Stream {header, streamContent}
|
return $ Stream {header, streamContent}
|
||||||
where
|
where
|
||||||
stream size = line "stream" *> count size anyChar <* blank <* line "endstream"
|
stream size = line "stream" *> block size <* blank <* line "endstream"
|
||||||
|
|
||||||
object :: SParser Object
|
object :: SParser Object
|
||||||
object = streamObject <|> Direct <$> directObject
|
object = streamObject <|> Direct <$> directObject
|
||||||
|
@ -117,7 +115,7 @@ occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates
|
||||||
populate :: ByteString -> InputStructure -> Content
|
populate :: ByteString -> InputStructure -> Content
|
||||||
populate input structure =
|
populate input structure =
|
||||||
let bodyInput = BS.drop (startOffset structure) input in
|
let bodyInput = BS.drop (startOffset structure) input in
|
||||||
case runParser recurseOnOccurrences initialState "" bodyInput of
|
case runParser recurseOnOccurrences initialState bodyInput of
|
||||||
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
|
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
|
||||||
Right finalState ->
|
Right finalState ->
|
||||||
let Flow {occurrencesStack, tmpObjects} = flow finalState in
|
let Flow {occurrencesStack, tmpObjects} = flow finalState in
|
||||||
|
@ -135,4 +133,4 @@ populate input structure =
|
||||||
|
|
||||||
recurseOnOccurrences :: SParser UserState
|
recurseOnOccurrences :: SParser UserState
|
||||||
recurseOnOccurrences =
|
recurseOnOccurrences =
|
||||||
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState
|
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> get
|
||||||
|
|
|
@ -1,19 +1,20 @@
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
module PDF.EOL (
|
module PDF.EOL (
|
||||||
Style(..)
|
Style(..)
|
||||||
, charset
|
, charset
|
||||||
, parser
|
, parser
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Control.Applicative ((<|>))
|
||||||
import Text.Parsec ((<|>), Parsec, string, try)
|
import PDF.Parser (Parser, string)
|
||||||
|
|
||||||
data Style = CR | LF | CRLF deriving Show
|
data Style = CR | LF | CRLF deriving Show
|
||||||
|
|
||||||
charset :: String
|
charset :: String
|
||||||
charset = "\r\n"
|
charset = "\r\n"
|
||||||
|
|
||||||
parser :: Parsec ByteString u Style
|
parser :: Parser s Style
|
||||||
parser =
|
parser =
|
||||||
try (string "\r\n" >> return CRLF)
|
(string "\r\n" >> return CRLF)
|
||||||
<|> (string "\r" >> return CR)
|
<|> (string "\r" >> return CR)
|
||||||
<|> (string "\n" >> return LF)
|
<|> (string "\n" >> return LF)
|
||||||
|
|
|
@ -11,7 +11,6 @@ module PDF.Object (
|
||||||
, Number(..)
|
, Number(..)
|
||||||
, Object(..)
|
, Object(..)
|
||||||
, Occurrence(..)
|
, Occurrence(..)
|
||||||
, Parser
|
|
||||||
, Structure(..)
|
, Structure(..)
|
||||||
, XRefEntry(..)
|
, XRefEntry(..)
|
||||||
, XRefSection
|
, XRefSection
|
||||||
|
@ -26,8 +25,11 @@ module PDF.Object (
|
||||||
, structure
|
, structure
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import Control.Applicative ((<|>))
|
||||||
import Data.Int (Int64)
|
import Data.ByteString.Char8 (ByteString)
|
||||||
|
import qualified Data.ByteString.Char8 as BS (
|
||||||
|
concat, cons, pack, singleton, unpack
|
||||||
|
)
|
||||||
import Data.Map (Map, (!), mapWithKey)
|
import Data.Map (Map, (!), mapWithKey)
|
||||||
import qualified Data.Map as Map (elems, fromList, toList)
|
import qualified Data.Map as Map (elems, fromList, toList)
|
||||||
import qualified PDF.EOL as EOL (charset, parser)
|
import qualified PDF.EOL as EOL (charset, parser)
|
||||||
|
@ -36,15 +38,17 @@ import PDF.Output (
|
||||||
OBuilder, Offset(..), Output(..)
|
OBuilder, Offset(..), Output(..)
|
||||||
, byteString, getOffsets, join, newLine, nextLine, saveOffset
|
, byteString, getOffsets, join, newLine, nextLine, saveOffset
|
||||||
)
|
)
|
||||||
import Text.Parsec
|
import PDF.Parser (
|
||||||
|
Parser, (<?>)
|
||||||
|
, char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
|
||||||
|
, sepBy, string, takeAll, takeAll1
|
||||||
|
)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
type Parser u = Parsec ByteString u
|
|
||||||
|
|
||||||
line :: String -> Parser u ()
|
line :: String -> Parser u ()
|
||||||
line l = string l *> EOL.parser *> return ()
|
line l = string (BS.pack l) *> EOL.parser *> return () <?> printf "line «%s»" l
|
||||||
|
|
||||||
magicNumber :: String
|
magicNumber :: ByteString
|
||||||
magicNumber = "%PDF-"
|
magicNumber = "%PDF-"
|
||||||
|
|
||||||
eofMarker :: ByteString
|
eofMarker :: ByteString
|
||||||
|
@ -53,20 +57,17 @@ eofMarker = "%%EOF"
|
||||||
whiteSpaceCharset :: String
|
whiteSpaceCharset :: String
|
||||||
whiteSpaceCharset = "\0\t\12 "
|
whiteSpaceCharset = "\0\t\12 "
|
||||||
|
|
||||||
whiteSpace :: Parser u ()
|
|
||||||
whiteSpace = oneOf whiteSpaceCharset *> return () <|> EOL.parser *> return ()
|
|
||||||
|
|
||||||
blank :: Parser u ()
|
blank :: Parser u ()
|
||||||
blank = skipMany whiteSpace
|
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
|
||||||
|
|
||||||
delimiterCharset :: String
|
delimiterCharset :: String
|
||||||
delimiterCharset = "()<>[]{}/%"
|
delimiterCharset = "()<>[]{}/%"
|
||||||
|
|
||||||
regular :: Parser u Char
|
regular :: Char -> Bool
|
||||||
regular = noneOf $ EOL.charset ++ whiteSpaceCharset ++ delimiterCharset
|
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
|
||||||
|
|
||||||
integer :: (Read a, Num a) => Parser u a
|
integer :: (Read a, Num a) => Parser u a
|
||||||
integer = read <$> many1 digit <* whiteSpace
|
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
|
||||||
|
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
-- OBJECTS
|
-- OBJECTS
|
||||||
|
@ -76,7 +77,8 @@ integer = read <$> many1 digit <* whiteSpace
|
||||||
-- Boolean
|
-- Boolean
|
||||||
--
|
--
|
||||||
boolean :: Parser u Bool
|
boolean :: Parser u Bool
|
||||||
boolean = (string "true" *> return True) <|> (string "false" *> return False)
|
boolean =
|
||||||
|
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Number
|
-- Number
|
||||||
|
@ -90,12 +92,13 @@ instance Output Number where
|
||||||
_ -> printf "%f" f
|
_ -> printf "%f" f
|
||||||
|
|
||||||
number :: Parser u Number
|
number :: Parser u Number
|
||||||
number = Number . read <$>
|
number = Number . read . BS.unpack <$>
|
||||||
(mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart))
|
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
|
||||||
|
<?> "number"
|
||||||
where
|
where
|
||||||
sign = string "-" <|> option "" (char '+' >> return "")
|
sign = string "-" <|> option "" (char '+' >> return "")
|
||||||
integerPart = mappend <$> many1 digit <*> option "" floatPart
|
integerPart = mappend <$> decNumber <*> option "" floatPart
|
||||||
floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit)
|
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
|
||||||
|
|
||||||
--
|
--
|
||||||
-- StringObject
|
-- StringObject
|
||||||
|
@ -108,14 +111,18 @@ instance Output StringObject where
|
||||||
|
|
||||||
stringObject :: Parser u StringObject
|
stringObject :: Parser u StringObject
|
||||||
stringObject =
|
stringObject =
|
||||||
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
|
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
||||||
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
|
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
|
||||||
|
<?> "string object (literal or hexadecimal)"
|
||||||
where
|
where
|
||||||
literalStringBlock = many1 (noneOf "\\()") <|> matchingParenthesis <|> escapedChar
|
literalString = many literalStringBlock
|
||||||
|
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
|
||||||
|
normalChar = not . (`elem` ("\\()" :: String))
|
||||||
matchingParenthesis =
|
matchingParenthesis =
|
||||||
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
|
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
|
||||||
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
|
escapedChar =
|
||||||
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
|
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
|
||||||
|
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Name
|
-- Name
|
||||||
|
@ -126,13 +133,14 @@ instance Output Name where
|
||||||
output (Name n) = "/" `mappend` Output.string n
|
output (Name n) = "/" `mappend` Output.string n
|
||||||
|
|
||||||
name :: Parser u Name
|
name :: Parser u Name
|
||||||
name = Name <$> (char '/' *> many regular)
|
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Array
|
-- Array
|
||||||
--
|
--
|
||||||
array :: Parser u [DirectObject]
|
array :: Parser u [DirectObject]
|
||||||
array = char '[' *> blank *> directObject `endBy` blank <* char ']'
|
array =
|
||||||
|
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Dictionary
|
-- Dictionary
|
||||||
|
@ -149,16 +157,16 @@ instance Output Dictionary where
|
||||||
|
|
||||||
dictionary :: Parser u Dictionary
|
dictionary :: Parser u Dictionary
|
||||||
dictionary =
|
dictionary =
|
||||||
try (string "<<" *> blank *> keyValPairs <* string ">>")
|
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
|
||||||
where
|
where
|
||||||
keyVal = (,) <$> name <* blank <*> directObject
|
keyVal = (,) <$> name <* blank <*> directObject
|
||||||
keyValPairs = Map.fromList <$> keyVal `endBy` blank
|
keyValPairs = Map.fromList <$> keyVal `sepBy` blank <* blank
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Null
|
-- Null
|
||||||
--
|
--
|
||||||
nullObject :: Parser u ()
|
nullObject :: Parser u ()
|
||||||
nullObject = string "null" *> return ()
|
nullObject = string "null" *> return () <?> "null object"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Reference
|
-- Reference
|
||||||
|
@ -169,7 +177,7 @@ data IndirectObjCoordinates = IndirectObjCoordinates {
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
reference :: Parser u IndirectObjCoordinates
|
reference :: Parser u IndirectObjCoordinates
|
||||||
reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R'
|
reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R' <?> "reference to an object"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- DirectObject
|
-- DirectObject
|
||||||
|
@ -198,14 +206,15 @@ instance Output DirectObject where
|
||||||
|
|
||||||
directObject :: Parser u DirectObject
|
directObject :: Parser u DirectObject
|
||||||
directObject =
|
directObject =
|
||||||
Boolean <$> try boolean
|
Boolean <$> boolean
|
||||||
<|> Reference <$> try reference {- defined before Number because Number is a prefix of it -}
|
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
|
||||||
<|> NumberObject <$> try number
|
<|> NumberObject <$> number
|
||||||
<|> StringObject <$> try stringObject
|
<|> StringObject <$> stringObject
|
||||||
<|> NameObject <$> try name
|
<|> NameObject <$> name
|
||||||
<|> Array <$> try array
|
<|> Array <$> array
|
||||||
<|> Dictionary <$> try dictionary
|
<|> Dictionary <$> dictionary
|
||||||
<|> const Null <$> try nullObject
|
<|> const Null <$> nullObject
|
||||||
|
<?> "direct object"
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Object
|
-- Object
|
||||||
|
@ -226,7 +235,6 @@ instance Output Object where
|
||||||
`nextLine` byteString streamContent
|
`nextLine` byteString streamContent
|
||||||
`mappend` "endstream"
|
`mappend` "endstream"
|
||||||
|
|
||||||
|
|
||||||
--
|
--
|
||||||
-- Occurrence
|
-- Occurrence
|
||||||
--
|
--
|
||||||
|
@ -249,10 +257,10 @@ outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNum
|
||||||
-- XRefEntry
|
-- XRefEntry
|
||||||
--
|
--
|
||||||
data XRefEntry = InUse {
|
data XRefEntry = InUse {
|
||||||
offset :: Int64
|
offset :: Int
|
||||||
, generation :: Int
|
, generation :: Int
|
||||||
} | Free {
|
} | Free {
|
||||||
nextFree :: Int64
|
nextFree :: Int
|
||||||
, generation :: Int
|
, generation :: Int
|
||||||
} deriving Show
|
} deriving Show
|
||||||
|
|
||||||
|
@ -265,14 +273,13 @@ instance Output XRefEntry where
|
||||||
entry :: Parser u XRefEntry
|
entry :: Parser u XRefEntry
|
||||||
entry = do
|
entry = do
|
||||||
(big, small) <- (,) <$> integer <*> integer
|
(big, small) <- (,) <$> integer <*> integer
|
||||||
(inUse big small <|> free big small) <* blank
|
(inUse big small <|> free big small <?> "XRef entry") <* blank
|
||||||
where
|
where
|
||||||
inUse :: Int64 -> Int -> Parser u XRefEntry
|
inUse :: Int -> Int -> Parser u XRefEntry
|
||||||
inUse offset generation = char 'n' *> return (InUse {offset, generation})
|
inUse offset generation = char 'n' *> return (InUse {offset, generation})
|
||||||
free :: Int64 -> Int -> Parser u XRefEntry
|
free :: Int -> Int -> Parser u XRefEntry
|
||||||
free nextFree generation = char 'f' *> return (Free {nextFree, generation})
|
free nextFree generation = char 'f' *> return (Free {nextFree, generation})
|
||||||
|
|
||||||
|
|
||||||
--
|
--
|
||||||
-- XRefSubSection
|
-- XRefSubSection
|
||||||
--
|
--
|
||||||
|
@ -289,7 +296,7 @@ instance Output XRefSubSection where
|
||||||
|
|
||||||
xrefSubSection :: Parser u XRefSubSection
|
xrefSubSection :: Parser u XRefSubSection
|
||||||
xrefSubSection = do
|
xrefSubSection = do
|
||||||
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer
|
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
|
||||||
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
|
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
|
||||||
return $ XRefSubSection {firstObjectId, entriesNumber, entries}
|
return $ XRefSubSection {firstObjectId, entriesNumber, entries}
|
||||||
|
|
||||||
|
@ -299,7 +306,7 @@ type XRefSection = [XRefSubSection]
|
||||||
-- Structure
|
-- Structure
|
||||||
--
|
--
|
||||||
data InputStructure = InputStructure {
|
data InputStructure = InputStructure {
|
||||||
startOffset :: Int64
|
startOffset :: Int
|
||||||
, inputStructure :: Structure
|
, inputStructure :: Structure
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -314,7 +321,7 @@ structure =
|
||||||
<$> (line "xref" *> xrefSubSection `sepBy` EOL.parser)
|
<$> (line "xref" *> xrefSubSection `sepBy` EOL.parser)
|
||||||
<*> (line "trailer" *> dictionary <* EOL.parser)
|
<*> (line "trailer" *> dictionary <* EOL.parser)
|
||||||
|
|
||||||
updateXrefs :: XRefSection -> Map Offset Int64 -> (XRefSection, Int64)
|
updateXrefs :: XRefSection -> Map Offset Int -> (XRefSection, Int)
|
||||||
updateXrefs xrefSection offsets = (
|
updateXrefs xrefSection offsets = (
|
||||||
updateSubSection <$> xrefSection
|
updateSubSection <$> xrefSection
|
||||||
, offsets ! StartXRef
|
, offsets ! StartXRef
|
||||||
|
|
|
@ -11,7 +11,6 @@ module PDF.Output (
|
||||||
, char
|
, char
|
||||||
, getOffsets
|
, getOffsets
|
||||||
, join
|
, join
|
||||||
, lift
|
|
||||||
, newLine
|
, newLine
|
||||||
, nextLine
|
, nextLine
|
||||||
, saveOffset
|
, saveOffset
|
||||||
|
@ -19,10 +18,11 @@ module PDF.Output (
|
||||||
, render
|
, render
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.ByteString.Builder (Builder, char8, lazyByteString, string8, toLazyByteString)
|
import Data.ByteString.Builder (Builder, char8, string8, toLazyByteString)
|
||||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
import qualified Data.ByteString.Builder as B (byteString)
|
||||||
import qualified Data.ByteString.Lazy.Char8 as BS (length)
|
import Data.ByteString (ByteString)
|
||||||
import Data.Int (Int64)
|
import qualified Data.ByteString as BS (length)
|
||||||
|
import qualified Data.ByteString.Lazy as Lazy (ByteString)
|
||||||
import Data.Map (Map)
|
import Data.Map (Map)
|
||||||
import qualified Data.Map as Map (singleton)
|
import qualified Data.Map as Map (singleton)
|
||||||
import Data.String (IsString(..))
|
import Data.String (IsString(..))
|
||||||
|
@ -31,7 +31,7 @@ import qualified PDF.EOL as EOL (Style(..))
|
||||||
|
|
||||||
data Offset = StartXRef | ObjectId Int deriving (Eq, Ord)
|
data Offset = StartXRef | ObjectId Int deriving (Eq, Ord)
|
||||||
|
|
||||||
newtype OContext a = OContext (RWS EOL.Style (Map Offset Int64) Int64 a)
|
newtype OContext a = OContext (RWS EOL.Style (Map Offset Int) Int a)
|
||||||
type OBuilder = OContext Builder
|
type OBuilder = OContext Builder
|
||||||
|
|
||||||
instance Functor OContext where
|
instance Functor OContext where
|
||||||
|
@ -51,7 +51,7 @@ saveOffset offset = OContext $
|
||||||
lift :: (a -> Builder) -> a -> OBuilder
|
lift :: (a -> Builder) -> a -> OBuilder
|
||||||
lift f a = return (f a)
|
lift f a = return (f a)
|
||||||
|
|
||||||
getOffsets :: OBuilder -> OContext (OBuilder, Map Offset Int64)
|
getOffsets :: OBuilder -> OContext (OBuilder, Map Offset Int)
|
||||||
getOffsets (OContext builder) =
|
getOffsets (OContext builder) =
|
||||||
OContext (listen builder >>= \(a, w) -> return (return a, w))
|
OContext (listen builder >>= \(a, w) -> return (return a, w))
|
||||||
|
|
||||||
|
@ -108,9 +108,9 @@ string :: String -> OBuilder
|
||||||
string s = lift string8 s <* OContext (modify (+ toEnum (length s)))
|
string s = lift string8 s <* OContext (modify (+ toEnum (length s)))
|
||||||
|
|
||||||
byteString :: ByteString -> OBuilder
|
byteString :: ByteString -> OBuilder
|
||||||
byteString bs = lift lazyByteString bs <* OContext (modify (+ BS.length bs))
|
byteString bs = lift B.byteString bs <* OContext (modify (+ BS.length bs))
|
||||||
|
|
||||||
render :: Output a => EOL.Style -> a -> ByteString
|
render :: Output a => EOL.Style -> a -> Lazy.ByteString
|
||||||
render eolStyle a =
|
render eolStyle a =
|
||||||
let OContext builder = output a in
|
let OContext builder = output a in
|
||||||
let (outputByteString, _, _) = runRWS builder eolStyle 0 in
|
let (outputByteString, _, _) = runRWS builder eolStyle 0 in
|
||||||
|
|
98
src/PDF/Parser.hs
Normal file
98
src/PDF/Parser.hs
Normal file
|
@ -0,0 +1,98 @@
|
||||||
|
module PDF.Parser (
|
||||||
|
Parser
|
||||||
|
, (<?>)
|
||||||
|
, block
|
||||||
|
, char
|
||||||
|
, choice
|
||||||
|
, count
|
||||||
|
, decNumber
|
||||||
|
, hexNumber
|
||||||
|
, many
|
||||||
|
, octDigit
|
||||||
|
, on
|
||||||
|
, oneOf
|
||||||
|
, option
|
||||||
|
, runParser
|
||||||
|
, sepBy
|
||||||
|
, string
|
||||||
|
, takeAll
|
||||||
|
, takeAll1
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Applicative ((<|>), empty)
|
||||||
|
import Control.Monad.State (StateT(..), evalStateT)
|
||||||
|
import Control.Monad.Trans (lift)
|
||||||
|
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
|
||||||
|
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
|
||||||
|
)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import Data.Char (toLower)
|
||||||
|
import Data.Set (Set)
|
||||||
|
import qualified Data.Set as Set (fromList, member, unions)
|
||||||
|
|
||||||
|
type Parser s = StateT s Atto.Parser
|
||||||
|
|
||||||
|
(<?>) :: Parser s a -> String -> Parser s a
|
||||||
|
(<?>) parser debugMessage = parser <|> fail debugMessage
|
||||||
|
|
||||||
|
block :: Int -> Parser s ByteString
|
||||||
|
block = lift . Atto.take
|
||||||
|
|
||||||
|
char :: Char -> Parser s Char
|
||||||
|
char = lift . Atto.char
|
||||||
|
|
||||||
|
choice :: [Parser s a] -> Parser s a
|
||||||
|
choice = foldr (<|>) empty
|
||||||
|
|
||||||
|
count :: Int -> Parser s a -> Parser s [a]
|
||||||
|
count 0 _ = return []
|
||||||
|
count n p = (:) <$> p <*> count (n-1) p
|
||||||
|
|
||||||
|
decNumber :: Parser s ByteString
|
||||||
|
decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
|
||||||
|
|
||||||
|
digits :: Set Char
|
||||||
|
digits = Set.fromList ['0'..'9']
|
||||||
|
|
||||||
|
hexDigits :: Set Char
|
||||||
|
hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
|
||||||
|
where
|
||||||
|
af = ['A'..'F']
|
||||||
|
|
||||||
|
hexNumber :: Parser s ByteString
|
||||||
|
hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
|
||||||
|
|
||||||
|
many :: Parser s a -> Parser s [a]
|
||||||
|
many parser = (:) <$> parser <*> many parser <|> return []
|
||||||
|
|
||||||
|
octDigit :: Parser s Char
|
||||||
|
octDigit = oneOf ['0'..'7']
|
||||||
|
|
||||||
|
on :: Parser s a -> ByteString -> Parser s (Either String a)
|
||||||
|
on (StateT parserF) input = StateT $ \state ->
|
||||||
|
case Atto.parseOnly (parserF state) input of
|
||||||
|
Left errorMsg -> return (Left errorMsg, state)
|
||||||
|
Right (result, newState) -> return (Right result, newState)
|
||||||
|
|
||||||
|
oneOf :: String -> Parser s Char
|
||||||
|
oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
|
||||||
|
|
||||||
|
option :: a -> Parser s a -> Parser s a
|
||||||
|
option defaultValue p = p <|> pure defaultValue
|
||||||
|
|
||||||
|
runParser :: Parser s a -> s -> ByteString -> Either String a
|
||||||
|
runParser parser initState =
|
||||||
|
Atto.parseOnly (evalStateT parser initState)
|
||||||
|
|
||||||
|
sepBy :: Parser s a -> Parser s b -> Parser s [a]
|
||||||
|
sepBy parser separator =
|
||||||
|
option [] $ (:) <$> parser <*> many (separator *> parser)
|
||||||
|
|
||||||
|
string :: ByteString -> Parser s ByteString
|
||||||
|
string = lift . Atto.string
|
||||||
|
|
||||||
|
takeAll :: (Char -> Bool) -> Parser s ByteString
|
||||||
|
takeAll = lift . Atto.takeWhile
|
||||||
|
|
||||||
|
takeAll1 :: (Char -> Bool) -> Parser s ByteString
|
||||||
|
takeAll1 = lift . Atto.takeWhile1
|
Loading…
Reference in a new issue