Compare commits
14 Commits
Author | SHA1 | Date | |
---|---|---|---|
7eca875900 | |||
380c1e439b | |||
d6994f0813 | |||
68f90d20e2 | |||
3a39c75e6a | |||
29c5823f34 | |||
9ab010de61 | |||
699f830a45 | |||
dd79cb3fc7 | |||
264b0dc92b | |||
9dac275f68 | |||
85e4eb9273 | |||
11cb6504d7 | |||
0daa03d958 |
19
ChangeLog.md
19
ChangeLog.md
|
@ -1,5 +1,24 @@
|
|||
# Revision history for Hufflepdf
|
||||
|
||||
## 0.2.0.1 -- 2019-11-27
|
||||
|
||||
* Fix bug discovered while running Hufflepdf on a PDF output from pdftk : magic keywords like `obj`, `stream` or `xref` can have spaces after them before the EOL
|
||||
|
||||
## 0.2.0.0 -- 2019-10-14
|
||||
|
||||
* Implement PDF's multilayer update mechanism
|
||||
|
||||
## 0.1.1.1 -- 2019-05-31
|
||||
|
||||
* Fix a bug preventing files with the «trailer» keyword on the same line as the dictionary that follows it to be parsed
|
||||
* Improve error messages
|
||||
* Support empty lines as comments
|
||||
|
||||
## 0.1.1.0 -- 2019-05-23
|
||||
|
||||
* Rewrite the parser using attoparsec instead of parsec (about 1.5x faster)
|
||||
* Use strict ByteStrings instead of lazy ones for an additional gain in performance, simpler type interface (lazy ByteStrings are now used only for output because that's required by the ByteString Builder) and fewer import lines
|
||||
|
||||
## 0.1.0.0 -- 2019-05-18
|
||||
|
||||
* First version. Released on an unsuspecting world.
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
-- documentation, see http://haskell.org/cabal/users-guide/
|
||||
|
||||
name: Hufflepdf
|
||||
version: 0.1.0.0
|
||||
version: 0.2.0.1
|
||||
synopsis: A PDF parser
|
||||
-- description:
|
||||
license: BSD3
|
||||
|
@ -17,17 +17,37 @@ cabal-version: >=1.10
|
|||
|
||||
library
|
||||
exposed-modules: PDF
|
||||
, PDF.Object
|
||||
other-modules: PDF.Body
|
||||
, PDF.EOL
|
||||
, PDF.Object
|
||||
, PDF.Output
|
||||
, Data.ByteString.Lazy.Char8.Util
|
||||
, PDF.Update
|
||||
other-modules: Data.ByteString.Char8.Util
|
||||
, PDF.Body
|
||||
, PDF.Parser
|
||||
-- other-extensions:
|
||||
build-depends: base >=4.9 && <4.13
|
||||
build-depends: attoparsec
|
||||
, base >=4.9 && <4.13
|
||||
, bytestring
|
||||
, containers
|
||||
, mtl
|
||||
, parsec
|
||||
hs-source-dirs: src
|
||||
ghc-options: -Wall
|
||||
default-language: Haskell2010
|
||||
|
||||
executable equivalent
|
||||
main-is: examples/equivalent.hs
|
||||
build-depends: base
|
||||
, bytestring
|
||||
, Hufflepdf
|
||||
ghc-options: -Wall
|
||||
default-language: Haskell2010
|
||||
|
||||
executable getObj
|
||||
main-is: examples/getObj.hs
|
||||
build-depends: base
|
||||
, bytestring
|
||||
, containers
|
||||
, Hufflepdf
|
||||
, zlib
|
||||
ghc-options: -Wall
|
||||
default-language: Haskell2010
|
||||
|
|
13
examples/equivalent.hs
Normal file
13
examples/equivalent.hs
Normal file
|
@ -0,0 +1,13 @@
|
|||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (writeFile)
|
||||
import PDF (parseDocument, render)
|
||||
import System.Environment (getArgs)
|
||||
import System.IO (hPutStrLn, stderr)
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
[inputFile, outputFile] <- getArgs
|
||||
result <- parseDocument <$> BS.readFile inputFile
|
||||
case result of
|
||||
Left parseError -> hPutStrLn stderr parseError
|
||||
Right doc -> Lazy.writeFile outputFile $ render doc
|
53
examples/getObj.hs
Normal file
53
examples/getObj.hs
Normal file
|
@ -0,0 +1,53 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
|
||||
import Codec.Compression.Zlib (decompress)
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, putStr, toStrict)
|
||||
import Data.Map ((!?))
|
||||
import qualified Data.Map as Map (keys, lookup)
|
||||
import PDF (Document(..), parseDocument)
|
||||
import qualified PDF.EOL as EOL (Style)
|
||||
import PDF.Object (Content(..), DirectObject(..), Object(..), Name(..))
|
||||
import PDF.Output (ObjectId(..))
|
||||
import qualified PDF.Output as Output (render)
|
||||
import PDF.Update (unify)
|
||||
import System.Environment (getArgs, getProgName)
|
||||
import System.Exit (die)
|
||||
import Text.Printf (printf)
|
||||
|
||||
display :: EOL.Style -> Object -> ByteString
|
||||
display eolStyle d@(Direct _) = Output.render eolStyle d
|
||||
display eolStyle s@(Stream {header, streamContent}) = Output.render eolStyle $
|
||||
case Map.lookup (Name "Filter") header of
|
||||
Just (NameObject (Name "FlateDecode")) -> Stream {
|
||||
header
|
||||
, streamContent = Lazy.toStrict . decompress $ Lazy.fromStrict streamContent
|
||||
}
|
||||
_ -> s
|
||||
|
||||
extractObject :: ObjectId -> Document -> Either String ByteString
|
||||
extractObject objectId (Document {eolStyle, updates}) =
|
||||
case objects content !? objectId of
|
||||
Nothing -> Left $ "No object has ID " ++ show (getObjectId objectId)
|
||||
Just o -> Right $ display eolStyle o
|
||||
where
|
||||
content = unify updates
|
||||
|
||||
listObjectIds :: Document -> Either String [String]
|
||||
listObjectIds =
|
||||
Right . prependTitle . toString . Map.keys . objects . unify . updates
|
||||
where
|
||||
toString = fmap (show . getObjectId)
|
||||
prependTitle = ("ObjectIds defined in this PDF:":)
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
(inputFile, getData) <- parse =<< getArgs
|
||||
input <- BS.readFile inputFile
|
||||
either die id $ (parseDocument input >>= getData)
|
||||
where
|
||||
parse [inputFile] = return (inputFile, fmap (mapM_ putStrLn) . listObjectIds)
|
||||
parse [inputFile, objectId] = return
|
||||
(inputFile, fmap Lazy.putStr . extractObject (ObjectId (read objectId)))
|
||||
parse _ = die . printf "Syntax: %s inputFile [OBJECT_ID]\n" =<< getProgName
|
16
src/Data/ByteString/Char8/Util.hs
Normal file
16
src/Data/ByteString/Char8/Util.hs
Normal file
|
@ -0,0 +1,16 @@
|
|||
module Data.ByteString.Char8.Util (
|
||||
previous
|
||||
, subBS
|
||||
) where
|
||||
|
||||
import Data.ByteString.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (drop, index, take)
|
||||
import Prelude hiding (length)
|
||||
|
||||
previous :: Char -> Int -> ByteString -> Int
|
||||
previous char position byteString
|
||||
| BS.index byteString position == char = position
|
||||
| otherwise = previous char (position - 1) byteString
|
||||
|
||||
subBS :: Int -> Int -> ByteString -> ByteString
|
||||
subBS offset length = BS.take length . BS.drop offset
|
|
@ -1,17 +0,0 @@
|
|||
module Data.ByteString.Lazy.Char8.Util (
|
||||
previous
|
||||
, subBS
|
||||
) where
|
||||
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, index, take)
|
||||
import Data.Int (Int64)
|
||||
import Prelude hiding (length)
|
||||
|
||||
previous :: Char -> Int64 -> ByteString -> Int64
|
||||
previous char position byteString
|
||||
| BS.index byteString position == char = position
|
||||
| otherwise = previous char (position - 1) byteString
|
||||
|
||||
subBS :: Int64 -> Int64 -> ByteString -> ByteString
|
||||
subBS offset length = BS.take length . BS.drop offset
|
60
src/PDF.hs
60
src/PDF.hs
|
@ -5,12 +5,12 @@ module PDF (
|
|||
, render
|
||||
) where
|
||||
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (
|
||||
import Data.ByteString.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (
|
||||
drop, findIndex, head, isPrefixOf, last, length, span, unpack
|
||||
)
|
||||
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
|
||||
import Data.Int (Int64)
|
||||
import Data.ByteString.Char8.Util (previous, subBS)
|
||||
import qualified Data.ByteString.Lazy as Lazy (ByteString)
|
||||
import qualified Data.Map as Map (lookup)
|
||||
import PDF.Body (populate)
|
||||
import qualified PDF.EOL as EOL (Style(..), charset, parser)
|
||||
|
@ -19,39 +19,33 @@ import PDF.Object (
|
|||
, Structure(..)
|
||||
, eofMarker, magicNumber, structure
|
||||
)
|
||||
import qualified PDF.Output as Output (render, string)
|
||||
import PDF.Output (Output(..), nextLine)
|
||||
import Text.Parsec
|
||||
import Text.Parsec.ByteString.Lazy (Parser)
|
||||
import Text.Parsec.Pos (newPos)
|
||||
import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
|
||||
import qualified PDF.Output as Output (render, line)
|
||||
import PDF.Output (Output(..))
|
||||
import PDF.Parser (Parser, runParser, string, takeAll)
|
||||
import Text.Printf (printf)
|
||||
|
||||
data Document = Document {
|
||||
pdfVersion :: String
|
||||
, eolStyle :: EOL.Style
|
||||
, contents :: [Content]
|
||||
, updates :: [Content]
|
||||
} deriving Show
|
||||
|
||||
instance Output Document where
|
||||
output (Document {pdfVersion, contents}) =
|
||||
Output.string (printf "%%PDF-%s" pdfVersion)
|
||||
`nextLine` output contents
|
||||
output (Document {pdfVersion, updates}) =
|
||||
Output.line (printf "%%PDF-%s" pdfVersion)
|
||||
`mappend` output updates
|
||||
|
||||
render :: Document -> ByteString
|
||||
render :: Document -> Lazy.ByteString
|
||||
render document@(Document {eolStyle}) = Output.render eolStyle document
|
||||
|
||||
version :: Parser String
|
||||
version = string magicNumber *> many (noneOf EOL.charset)
|
||||
version :: Parser () String
|
||||
version = BS.unpack <$>
|
||||
(string magicNumber *> takeAll (not . (`elem` EOL.charset)))
|
||||
|
||||
parseError :: String -> Either ParseError a
|
||||
parseError errorMessage =
|
||||
Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0)
|
||||
check :: Bool -> String -> Either String ()
|
||||
check test errorMessage = if test then return () else Left errorMessage
|
||||
|
||||
check :: Bool -> String -> Either ParseError ()
|
||||
check test errorMessage = if test then return () else parseError errorMessage
|
||||
|
||||
readStartXref :: EOL.Style -> ByteString -> Either ParseError Int64
|
||||
readStartXref :: EOL.Style -> ByteString -> Either String Int
|
||||
readStartXref eolStyle input =
|
||||
check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
|
||||
"Badly formed document : missing EOF marker at the end"
|
||||
|
@ -68,7 +62,7 @@ readStartXref eolStyle input =
|
|||
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
|
||||
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
|
||||
|
||||
findNextLine :: ByteString -> Int64
|
||||
findNextLine :: ByteString -> Int
|
||||
findNextLine input =
|
||||
let (line, eolPrefixed) = BS.span notInEol input in
|
||||
let nextNotInEol = BS.findIndex notInEol eolPrefixed in
|
||||
|
@ -76,7 +70,7 @@ findNextLine input =
|
|||
where
|
||||
notInEol = not . (`elem` EOL.charset)
|
||||
|
||||
findNextSection :: Int64 -> ByteString -> Int64
|
||||
findNextSection :: Int -> ByteString -> Int
|
||||
findNextSection offset input =
|
||||
case BS.findIndex (== BS.head eofMarker) input of
|
||||
Nothing -> 0
|
||||
|
@ -87,9 +81,9 @@ findNextSection offset input =
|
|||
then newOffset + findNextLine newInput
|
||||
else findNextSection (newOffset + 1) (BS.drop 1 newInput)
|
||||
|
||||
readStructures :: Int64 -> ByteString -> Either ParseError [InputStructure]
|
||||
readStructures :: Int -> ByteString -> Either String [InputStructure]
|
||||
readStructures startXref input =
|
||||
parse structure ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
|
||||
runParser structure () (BS.drop startXref input) >>= stopOrFollow
|
||||
where
|
||||
stopOrFollow s@(Structure {trailer}) =
|
||||
case Map.lookup (Name "Prev") trailer of
|
||||
|
@ -98,12 +92,12 @@ readStructures startXref input =
|
|||
let offset = truncate newStartXref in
|
||||
let startOffset = findNextSection offset (BS.drop offset input) in
|
||||
(InputStructure startOffset s:) <$> (readStructures offset input)
|
||||
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
|
||||
Just v -> Left $ "Bad value for Prev entry in trailer: " ++ show v
|
||||
|
||||
parseDocument :: ByteString -> Either ParseError Document
|
||||
parseDocument :: ByteString -> Either String Document
|
||||
parseDocument input = do
|
||||
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> EOL.parser) "" input
|
||||
(pdfVersion, eolStyle) <- runParser ((,) <$> version <*> EOL.parser) () input
|
||||
startXref <- readStartXref eolStyle input
|
||||
structuresRead <- readStructures startXref input
|
||||
let contents = populate input <$> structuresRead
|
||||
return $ Document {pdfVersion, eolStyle, contents}
|
||||
let updates = populate input <$> structuresRead
|
||||
return $ Document {pdfVersion, eolStyle, updates}
|
||||
|
|
|
@ -1,21 +1,24 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
module PDF.Body (
|
||||
populate
|
||||
) where
|
||||
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
|
||||
import Data.Int (Int64)
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad.State (get, gets, modify)
|
||||
import Data.ByteString.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
|
||||
import Data.Map ((!))
|
||||
import qualified Data.Map as Map (empty, insert, lookup)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import PDF.Object (
|
||||
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
|
||||
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
|
||||
, Parser, Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..)
|
||||
, Structure(..), XRefEntry(..), XRefSection
|
||||
, blank, dictionary, directObject, integer, line
|
||||
)
|
||||
import Text.Parsec
|
||||
import PDF.Output (ObjectId(..), Offset(..))
|
||||
import PDF.Parser (Parser, (<?>), block, char, on, option, runParser, takeAll)
|
||||
|
||||
data UserState = UserState {
|
||||
input :: ByteString
|
||||
|
@ -26,9 +29,9 @@ data UserState = UserState {
|
|||
type SParser = Parser UserState
|
||||
|
||||
modifyFlow :: (Flow -> Flow) -> SParser ()
|
||||
modifyFlow f = modifyState $ \state -> state {flow = f $ flow state}
|
||||
modifyFlow f = modify $ \state -> state {flow = f $ flow state}
|
||||
|
||||
addObject :: Int -> Object -> SParser ()
|
||||
addObject :: ObjectId -> Object -> SParser ()
|
||||
addObject objectId newObject = modifyFlow $ \flow -> flow {
|
||||
tmpObjects = Map.insert objectId newObject $ tmpObjects flow
|
||||
}
|
||||
|
@ -39,39 +42,27 @@ pushOccurrence newOccurrence = modifyFlow $ \flow -> flow {
|
|||
}
|
||||
|
||||
comment :: Parser u String
|
||||
comment = char '%' *> many (noneOf EOL.charset) <* EOL.parser
|
||||
comment = BS.unpack <$> (option "" afterPercent <* EOL.parser)
|
||||
where
|
||||
afterPercent = BS.cons <$> char '%' <*> takeAll (not . (`elem` EOL.charset))
|
||||
|
||||
lookupOffset :: Int -> XRefSection -> Maybe Int64
|
||||
lookupOffset _ [] = Nothing
|
||||
lookupOffset objectId (xrefSubSection:others) =
|
||||
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
|
||||
let index = objectId - firstObjectId in
|
||||
if index >= 0 && index < entriesNumber
|
||||
then
|
||||
case Map.lookup index entries of
|
||||
Just (InUse {offset}) -> Just offset
|
||||
_ -> Nothing
|
||||
else lookupOffset objectId others
|
||||
|
||||
getOffset :: Int -> SParser Int64
|
||||
getOffset objectId = do
|
||||
table <- xreferences <$> getState
|
||||
case lookupOffset objectId table of
|
||||
lookupOffset :: ObjectId -> SParser Offset
|
||||
lookupOffset objectId = do
|
||||
table <- gets xreferences
|
||||
case Map.lookup objectId table >>= entryOffset of
|
||||
Nothing -> fail $
|
||||
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
|
||||
Just offset -> return offset
|
||||
where
|
||||
entryOffset (InUse {offset}) = Just offset
|
||||
entryOffset _ = Nothing
|
||||
|
||||
on :: Monad m => ParsecT s u m a -> s -> ParsecT s u m a
|
||||
on parser input = do
|
||||
originalInput <- getInput
|
||||
setInput input >> parser <* setInput originalInput
|
||||
|
||||
loadNumber :: Int -> SParser Float
|
||||
loadNumber :: ObjectId -> SParser Double
|
||||
loadNumber objectId = do
|
||||
offset <- getOffset objectId
|
||||
objectStart <- BS.drop offset . input <$> getState
|
||||
indirectObjCoordinates `on` objectStart >> return ()
|
||||
objectValue <- (!objectId) . tmpObjects . flow <$> getState
|
||||
offset <- getOffset <$> lookupOffset objectId
|
||||
objectStart <- BS.drop offset <$> gets input
|
||||
indirectObjCoordinates `on` (objectStart :: ByteString) >> return ()
|
||||
objectValue <- (!objectId) . tmpObjects <$> gets flow
|
||||
case objectValue of
|
||||
Direct (NumberObject (Number n)) -> return n
|
||||
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
|
||||
|
@ -79,11 +70,11 @@ loadNumber objectId = do
|
|||
invalidValue :: Object -> String
|
||||
invalidValue v = "Invalid value " ++ show v
|
||||
|
||||
getSize :: Maybe DirectObject -> SParser Float
|
||||
getSize :: Maybe DirectObject -> SParser Double
|
||||
getSize Nothing = fail "Missing '/Length' key on stream"
|
||||
getSize (Just (NumberObject (Number size))) = return size
|
||||
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
|
||||
Flow {tmpObjects} <- flow <$> getState
|
||||
Flow {tmpObjects} <- gets flow
|
||||
case Map.lookup objectId tmpObjects of
|
||||
Nothing -> loadNumber objectId
|
||||
Just (Direct (NumberObject (Number size))) -> return size
|
||||
|
@ -92,32 +83,33 @@ getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
|
|||
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
|
||||
|
||||
streamObject :: SParser Object
|
||||
streamObject = try $ do
|
||||
streamObject = do
|
||||
header <- dictionary <* blank
|
||||
size <- getSize (Map.lookup (Name "Length") header)
|
||||
streamContent <- BS.pack <$> stream (truncate size)
|
||||
streamContent <- stream (truncate size)
|
||||
return $ Stream {header, streamContent}
|
||||
where
|
||||
stream size = line "stream" *> count size anyChar <* blank <* line "endstream"
|
||||
stream size = line "stream" *> block size <* blank <* line "endstream"
|
||||
|
||||
object :: SParser Object
|
||||
object = streamObject <|> Direct <$> directObject
|
||||
|
||||
indirectObjCoordinates :: SParser IndirectObjCoordinates
|
||||
indirectObjCoordinates = do
|
||||
objectId <- integer
|
||||
objectId <- ObjectId <$> integer
|
||||
coordinates <- IndirectObjCoordinates objectId <$> integer
|
||||
objectValue <- line "obj" *> object <* blank <* line "endobj"
|
||||
addObject objectId objectValue
|
||||
return coordinates
|
||||
|
||||
occurrence :: SParser Occurrence
|
||||
occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates
|
||||
occurrence =
|
||||
Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
|
||||
|
||||
populate :: ByteString -> InputStructure -> Content
|
||||
populate input structure =
|
||||
let bodyInput = BS.drop (startOffset structure) input in
|
||||
case runParser recurseOnOccurrences initialState "" bodyInput of
|
||||
case runParser recurseOnOccurrences initialState bodyInput of
|
||||
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
|
||||
Right finalState ->
|
||||
let Flow {occurrencesStack, tmpObjects} = flow finalState in
|
||||
|
@ -126,7 +118,7 @@ populate input structure =
|
|||
}
|
||||
where
|
||||
docStructure = inputStructure structure
|
||||
xreferences = xrefSection docStructure
|
||||
xreferences = xRef docStructure
|
||||
initialState = UserState {
|
||||
input, xreferences, flow = Flow {
|
||||
occurrencesStack = [], tmpObjects = Map.empty
|
||||
|
@ -135,4 +127,4 @@ populate input structure =
|
|||
|
||||
recurseOnOccurrences :: SParser UserState
|
||||
recurseOnOccurrences =
|
||||
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState
|
||||
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> get
|
||||
|
|
|
@ -1,19 +1,20 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
module PDF.EOL (
|
||||
Style(..)
|
||||
, charset
|
||||
, parser
|
||||
) where
|
||||
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import Text.Parsec ((<|>), Parsec, string, try)
|
||||
import Control.Applicative ((<|>))
|
||||
import PDF.Parser (Parser, string)
|
||||
|
||||
data Style = CR | LF | CRLF deriving Show
|
||||
|
||||
charset :: String
|
||||
charset = "\r\n"
|
||||
|
||||
parser :: Parsec ByteString u Style
|
||||
parser :: Parser s Style
|
||||
parser =
|
||||
try (string "\r\n" >> return CRLF)
|
||||
(string "\r\n" >> return CRLF)
|
||||
<|> (string "\r" >> return CR)
|
||||
<|> (string "\n" >> return LF)
|
||||
|
|
|
@ -5,17 +5,16 @@ module PDF.Object (
|
|||
Content(..)
|
||||
, DirectObject(..)
|
||||
, Flow(..)
|
||||
, IndexedObjects
|
||||
, IndirectObjCoordinates(..)
|
||||
, InputStructure(..)
|
||||
, Name(..)
|
||||
, Number(..)
|
||||
, Object(..)
|
||||
, Occurrence(..)
|
||||
, Parser
|
||||
, Structure(..)
|
||||
, XRefEntry(..)
|
||||
, XRefSection
|
||||
, XRefSubSection(..)
|
||||
, blank
|
||||
, dictionary
|
||||
, directObject
|
||||
|
@ -26,25 +25,33 @@ module PDF.Object (
|
|||
, structure
|
||||
) where
|
||||
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import Data.Int (Int64)
|
||||
import Data.Map (Map, (!), mapWithKey)
|
||||
import qualified Data.Map as Map (elems, fromList, toList)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import qualified PDF.Output as Output (string)
|
||||
import PDF.Output (
|
||||
OBuilder, Offset(..), Output(..)
|
||||
, byteString, getOffsets, join, newLine, nextLine, saveOffset
|
||||
import Control.Applicative ((<|>))
|
||||
import Data.ByteString.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (
|
||||
concat, cons, pack, singleton, unpack
|
||||
)
|
||||
import Data.Map (Map, (!), mapWithKey)
|
||||
import qualified Data.Map as Map (
|
||||
delete, empty, fromList, lookup, minViewWithKey, toList, union
|
||||
)
|
||||
import qualified PDF.EOL as EOL (charset, parser)
|
||||
import qualified PDF.Output as Output (concat, line, string)
|
||||
import PDF.Output (
|
||||
OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
|
||||
, byteString, getObjectId, getOffset, getOffsets, join, newLine
|
||||
, saveOffset
|
||||
)
|
||||
import PDF.Parser (
|
||||
Parser, (<?>)
|
||||
, char, choice, count, decNumber, hexNumber, many, octDigit, oneOf, option
|
||||
, sepBy, string, takeAll, takeAll1
|
||||
)
|
||||
import Text.Parsec
|
||||
import Text.Printf (printf)
|
||||
|
||||
type Parser u = Parsec ByteString u
|
||||
|
||||
line :: String -> Parser u ()
|
||||
line l = string l *> EOL.parser *> return ()
|
||||
line l = (string (BS.pack l) *> blank *> return ()) <?> printf "line «%s»" l
|
||||
|
||||
magicNumber :: String
|
||||
magicNumber :: ByteString
|
||||
magicNumber = "%PDF-"
|
||||
|
||||
eofMarker :: ByteString
|
||||
|
@ -53,35 +60,35 @@ eofMarker = "%%EOF"
|
|||
whiteSpaceCharset :: String
|
||||
whiteSpaceCharset = "\0\t\12 "
|
||||
|
||||
whiteSpace :: Parser u ()
|
||||
whiteSpace = oneOf whiteSpaceCharset *> return () <|> EOL.parser *> return ()
|
||||
|
||||
blank :: Parser u ()
|
||||
blank = skipMany whiteSpace
|
||||
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> return ()
|
||||
|
||||
delimiterCharset :: String
|
||||
delimiterCharset = "()<>[]{}/%"
|
||||
|
||||
regular :: Parser u Char
|
||||
regular = noneOf $ EOL.charset ++ whiteSpaceCharset ++ delimiterCharset
|
||||
regular :: Char -> Bool
|
||||
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
|
||||
|
||||
integer :: (Read a, Num a) => Parser u a
|
||||
integer = read <$> many1 digit <* whiteSpace
|
||||
integer = read . BS.unpack <$> decNumber <* blank <?> "decimal integer"
|
||||
|
||||
-------------------------------------
|
||||
-- OBJECTS
|
||||
-------------------------------------
|
||||
|
||||
type IndexedObjects = Map ObjectId Object
|
||||
|
||||
--
|
||||
-- Boolean
|
||||
--
|
||||
boolean :: Parser u Bool
|
||||
boolean = (string "true" *> return True) <|> (string "false" *> return False)
|
||||
boolean =
|
||||
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
|
||||
|
||||
--
|
||||
-- Number
|
||||
--
|
||||
newtype Number = Number Float deriving Show
|
||||
newtype Number = Number Double deriving Show
|
||||
|
||||
instance Output Number where
|
||||
output (Number f) = Output.string $
|
||||
|
@ -90,12 +97,13 @@ instance Output Number where
|
|||
_ -> printf "%f" f
|
||||
|
||||
number :: Parser u Number
|
||||
number = Number . read <$>
|
||||
(mappend <$> sign <*> (integerPart <|> ('0':) <$> floatPart))
|
||||
number = Number . read . BS.unpack <$>
|
||||
(mappend <$> sign <*> (integerPart <|> BS.cons '0' <$> floatPart))
|
||||
<?> "number"
|
||||
where
|
||||
sign = string "-" <|> option "" (char '+' >> return "")
|
||||
integerPart = mappend <$> many1 digit <*> option "" floatPart
|
||||
floatPart = (:) <$> char '.' <*> (option "0" $ many1 digit)
|
||||
integerPart = mappend <$> decNumber <*> option "" floatPart
|
||||
floatPart = BS.cons <$> char '.' <*> (option "0" $ decNumber)
|
||||
|
||||
--
|
||||
-- StringObject
|
||||
|
@ -108,14 +116,18 @@ instance Output StringObject where
|
|||
|
||||
stringObject :: Parser u StringObject
|
||||
stringObject =
|
||||
Literal <$> (char '(' *> (concat <$> many literalStringBlock) <* char ')')
|
||||
<|> Hexadecimal <$> (char '<' *> many hexDigit <* char '>')
|
||||
Literal . BS.unpack <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
|
||||
<|> Hexadecimal . BS.unpack <$> (char '<' *> hexNumber <* char '>')
|
||||
<?> "string object (literal or hexadecimal)"
|
||||
where
|
||||
literalStringBlock = many1 (noneOf "\\()") <|> matchingParenthesis <|> escapedChar
|
||||
literalString = many literalStringBlock
|
||||
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
|
||||
normalChar = not . (`elem` ("\\()" :: String))
|
||||
matchingParenthesis =
|
||||
(++) <$> ((:) <$> char '(' <*> literalStringBlock) <*> string ")"
|
||||
escapedChar = (:) <$> char '\\' <*> ((:[]) <$> oneOf "nrtbf()\\" <|> octalCode)
|
||||
octalCode = choice $ (\n -> count n octDigit) <$> [1..3]
|
||||
mappend <$> (BS.cons <$> char '(' <*> literalStringBlock) <*> string ")"
|
||||
escapedChar =
|
||||
BS.cons <$> char '\\' <*> (BS.singleton <$> oneOf "nrtbf()\\" <|> octalCode)
|
||||
octalCode = choice $ (\n -> BS.pack <$> count n octDigit) <$> [1..3]
|
||||
|
||||
--
|
||||
-- Name
|
||||
|
@ -123,16 +135,17 @@ stringObject =
|
|||
newtype Name = Name String deriving (Eq, Ord, Show)
|
||||
|
||||
instance Output Name where
|
||||
output (Name n) = "/" `mappend` Output.string n
|
||||
output (Name n) = Output.string ('/':n)
|
||||
|
||||
name :: Parser u Name
|
||||
name = Name <$> (char '/' *> many regular)
|
||||
name = Name . BS.unpack <$> (char '/' *> takeAll regular) <?> "name"
|
||||
|
||||
--
|
||||
-- Array
|
||||
--
|
||||
array :: Parser u [DirectObject]
|
||||
array = char '[' *> blank *> directObject `endBy` blank <* char ']'
|
||||
array =
|
||||
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
|
||||
|
||||
--
|
||||
-- Dictionary
|
||||
|
@ -145,31 +158,32 @@ instance Output Dictionary where
|
|||
where
|
||||
keyValues = join " " $ outputKeyVal <$> Map.toList dict
|
||||
outputKeyVal :: (Name, DirectObject) -> OBuilder
|
||||
outputKeyVal (key, val) = output key `mappend` " " `mappend` output val
|
||||
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
|
||||
|
||||
dictionary :: Parser u Dictionary
|
||||
dictionary =
|
||||
try (string "<<" *> blank *> keyValPairs <* string ">>")
|
||||
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
|
||||
where
|
||||
keyVal = (,) <$> name <* blank <*> directObject
|
||||
keyValPairs = Map.fromList <$> keyVal `endBy` blank
|
||||
keyValPairs = Map.fromList <$> keyVal `sepBy` blank <* blank
|
||||
|
||||
--
|
||||
-- Null
|
||||
--
|
||||
nullObject :: Parser u ()
|
||||
nullObject = string "null" *> return ()
|
||||
nullObject = string "null" *> return () <?> "null object"
|
||||
|
||||
--
|
||||
-- Reference
|
||||
--
|
||||
data IndirectObjCoordinates = IndirectObjCoordinates {
|
||||
objectId :: Int
|
||||
objectId :: ObjectId
|
||||
, versionNumber :: Int
|
||||
} deriving Show
|
||||
|
||||
reference :: Parser u IndirectObjCoordinates
|
||||
reference = IndirectObjCoordinates <$> integer <*> integer <* char 'R'
|
||||
reference = IndirectObjCoordinates
|
||||
<$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
|
||||
|
||||
--
|
||||
-- DirectObject
|
||||
|
@ -190,22 +204,23 @@ instance Output DirectObject where
|
|||
output (NumberObject n) = output n
|
||||
output (StringObject s) = output s
|
||||
output (NameObject n) = output n
|
||||
output (Array a) = "[" `mappend` join " " a `mappend` "]"
|
||||
output (Array a) = Output.concat ["[", join " " a, "]"]
|
||||
output (Dictionary d) = output d
|
||||
output (Null) = "null"
|
||||
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
|
||||
Output.string (printf "%d %d R" objectId versionNumber)
|
||||
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
|
||||
|
||||
directObject :: Parser u DirectObject
|
||||
directObject =
|
||||
Boolean <$> try boolean
|
||||
<|> Reference <$> try reference {- defined before Number because Number is a prefix of it -}
|
||||
<|> NumberObject <$> try number
|
||||
<|> StringObject <$> try stringObject
|
||||
<|> NameObject <$> try name
|
||||
<|> Array <$> try array
|
||||
<|> Dictionary <$> try dictionary
|
||||
<|> const Null <$> try nullObject
|
||||
Boolean <$> boolean
|
||||
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
|
||||
<|> NumberObject <$> number
|
||||
<|> StringObject <$> stringObject
|
||||
<|> NameObject <$> name
|
||||
<|> Array <$> array
|
||||
<|> Dictionary <$> dictionary
|
||||
<|> const Null <$> nullObject
|
||||
<?> "direct object"
|
||||
|
||||
--
|
||||
-- Object
|
||||
|
@ -220,26 +235,26 @@ data Object =
|
|||
|
||||
instance Output Object where
|
||||
output (Direct d) = output d
|
||||
output (Stream {header, streamContent}) =
|
||||
output header
|
||||
`nextLine` "stream"
|
||||
`nextLine` byteString streamContent
|
||||
`mappend` "endstream"
|
||||
|
||||
output (Stream {header, streamContent}) = Output.concat [
|
||||
output header, newLine
|
||||
, Output.line "stream"
|
||||
, byteString streamContent
|
||||
, "endstream"
|
||||
]
|
||||
|
||||
--
|
||||
-- Occurrence
|
||||
--
|
||||
data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
|
||||
|
||||
outputOccurrence :: Map Int Object -> Occurrence -> OBuilder
|
||||
outputOccurrence _ (Comment c) =
|
||||
Output.string (printf "%%%s" c) `mappend` newLine
|
||||
outputOccurrence :: IndexedObjects -> Occurrence -> OBuilder
|
||||
outputOccurrence _ (Comment c) = Output.line c
|
||||
outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNumber})) =
|
||||
saveOffset (ObjectId objectId)
|
||||
>> Output.string (printf "%d %d obj" objectId versionNumber)
|
||||
`nextLine` output (objects ! objectId)
|
||||
`nextLine` "endobj" `mappend` newLine
|
||||
saveOffset (Object objectId) >> Output.concat [
|
||||
Output.line (printf "%d %d obj" (getObjectId objectId) versionNumber)
|
||||
, output (objects ! objectId), newLine
|
||||
, Output.line "endobj"
|
||||
]
|
||||
|
||||
-------------------------------------
|
||||
-- XREF TABLE
|
||||
|
@ -249,89 +264,107 @@ outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNum
|
|||
-- XRefEntry
|
||||
--
|
||||
data XRefEntry = InUse {
|
||||
offset :: Int64
|
||||
offset :: Offset
|
||||
, generation :: Int
|
||||
} | Free {
|
||||
nextFree :: Int64
|
||||
nextFree :: ObjectId
|
||||
, generation :: Int
|
||||
} deriving Show
|
||||
|
||||
instance Output XRefEntry where
|
||||
output (InUse {offset, generation}) =
|
||||
Output.string (printf "%010d %05d n " offset generation) `mappend` newLine
|
||||
Output.line (printf "%010d %05d n " (getOffset offset) generation)
|
||||
output (Free {nextFree, generation}) =
|
||||
Output.string (printf "%010d %05d f " nextFree generation) `mappend` newLine
|
||||
Output.line (printf "%010d %05d f " (getObjectId nextFree) generation)
|
||||
|
||||
entry :: Parser u XRefEntry
|
||||
entry = do
|
||||
(big, small) <- (,) <$> integer <*> integer
|
||||
(inUse big small <|> free big small) <* blank
|
||||
(inUse big small <|> free big small <?> "XRef entry") <* blank
|
||||
where
|
||||
inUse :: Int64 -> Int -> Parser u XRefEntry
|
||||
inUse offset generation = char 'n' *> return (InUse {offset, generation})
|
||||
free :: Int64 -> Int -> Parser u XRefEntry
|
||||
free nextFree generation = char 'f' *> return (Free {nextFree, generation})
|
||||
|
||||
inUse :: Int -> Int -> Parser u XRefEntry
|
||||
inUse big generation =
|
||||
char 'n' *> return (InUse {offset = Offset big, generation})
|
||||
free :: Int -> Int -> Parser u XRefEntry
|
||||
free big generation =
|
||||
char 'f' *> return (Free {nextFree = ObjectId big, generation})
|
||||
|
||||
--
|
||||
-- XRefSubSection
|
||||
--
|
||||
data XRefSubSection = XRefSubSection {
|
||||
firstObjectId :: Int
|
||||
, entriesNumber :: Int
|
||||
, entries :: Map Int XRefEntry
|
||||
firstObjectId :: ObjectId
|
||||
, entries :: [XRefEntry]
|
||||
} deriving Show
|
||||
|
||||
instance Output XRefSubSection where
|
||||
output (XRefSubSection {firstObjectId, entriesNumber, entries}) =
|
||||
Output.string (printf "%d %d" firstObjectId entriesNumber)
|
||||
`nextLine` output (Map.elems entries)
|
||||
output (XRefSubSection {firstObjectId, entries}) =
|
||||
Output.line (printf "%d %d" (getObjectId firstObjectId) (length entries))
|
||||
`mappend` output entries
|
||||
|
||||
xrefSubSection :: Parser u XRefSubSection
|
||||
xrefSubSection = do
|
||||
(firstObjectId, entriesNumber) <- (,) <$> integer <*> integer
|
||||
entries <- Map.fromList . zip [firstObjectId..] <$> count entriesNumber entry
|
||||
return $ XRefSubSection {firstObjectId, entriesNumber, entries}
|
||||
xRefSubSection :: Parser u XRefSubSection
|
||||
xRefSubSection = do
|
||||
(firstId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
|
||||
entries <- count entriesNumber entry
|
||||
return $ XRefSubSection {firstObjectId = ObjectId firstId, entries}
|
||||
|
||||
type XRefSection = [XRefSubSection]
|
||||
type XRefSection = Map ObjectId XRefEntry
|
||||
|
||||
instance Output XRefSection where
|
||||
output = output . sections
|
||||
where
|
||||
sections tmp =
|
||||
case Map.minViewWithKey tmp of
|
||||
Nothing -> []
|
||||
Just ((objectId@(ObjectId value), firstEntry), rest) ->
|
||||
let (subSection, sndRest) = section objectId [firstEntry] (value + 1) rest in
|
||||
subSection : sections sndRest
|
||||
section firstObjectId stack nextValue tmp =
|
||||
let nextId = ObjectId nextValue in
|
||||
case Map.lookup nextId tmp of
|
||||
Nothing -> (XRefSubSection {firstObjectId, entries = reverse stack}, tmp)
|
||||
Just nextEntry ->
|
||||
section firstObjectId (nextEntry:stack) (nextValue + 1) (Map.delete nextId tmp)
|
||||
|
||||
|
||||
xRefSection :: Parser u XRefSection
|
||||
xRefSection = foldr addSubsection Map.empty <$>
|
||||
(line "xref" *> xRefSubSection `sepBy` many EOL.parser)
|
||||
where
|
||||
addSubsection (XRefSubSection {firstObjectId, entries}) =
|
||||
Map.union . Map.fromList $ zip ([firstObjectId..]) entries
|
||||
|
||||
--
|
||||
-- Structure
|
||||
--
|
||||
data InputStructure = InputStructure {
|
||||
startOffset :: Int64
|
||||
startOffset :: Int
|
||||
, inputStructure :: Structure
|
||||
}
|
||||
|
||||
data Structure = Structure {
|
||||
xrefSection :: XRefSection
|
||||
xRef :: XRefSection
|
||||
, trailer :: Dictionary
|
||||
} deriving Show
|
||||
|
||||
structure :: Parser u Structure
|
||||
structure =
|
||||
Structure
|
||||
<$> (line "xref" *> xrefSubSection `sepBy` EOL.parser)
|
||||
<*> (line "trailer" *> dictionary <* EOL.parser)
|
||||
<$> xRefSection
|
||||
<*> (string "trailer" *> blank *> dictionary <* EOL.parser)
|
||||
|
||||
updateXrefs :: XRefSection -> Map Offset Int64 -> (XRefSection, Int64)
|
||||
updateXrefs xrefSection offsets = (
|
||||
updateSubSection <$> xrefSection
|
||||
, offsets ! StartXRef
|
||||
)
|
||||
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
|
||||
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
|
||||
where
|
||||
updateSubSection subSection@(XRefSubSection {firstObjectId, entries}) =
|
||||
subSection {entries = mapWithKey (updateEntry firstObjectId) entries}
|
||||
updateEntry firstObjectId index e@(InUse {}) =
|
||||
e {offset = offsets ! (ObjectId $ firstObjectId + index)}
|
||||
updateEntry _ _ e = e
|
||||
updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
|
||||
updateEntry _ e = e
|
||||
|
||||
--
|
||||
-- Flow
|
||||
--
|
||||
data Flow = Flow {
|
||||
occurrencesStack :: [Occurrence]
|
||||
, tmpObjects :: Map Int Object
|
||||
, tmpObjects :: IndexedObjects
|
||||
} deriving Show
|
||||
|
||||
--
|
||||
|
@ -339,25 +372,26 @@ data Flow = Flow {
|
|||
--
|
||||
data Content = Content {
|
||||
occurrences :: [Occurrence]
|
||||
, objects :: Map Int Object
|
||||
, objects :: IndexedObjects
|
||||
, docStructure :: Structure
|
||||
} deriving Show
|
||||
|
||||
outputBody :: ([Occurrence], Map Int Object) -> OBuilder
|
||||
outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
|
||||
outputBody (occurrences, objects) =
|
||||
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
|
||||
|
||||
instance Output Content where
|
||||
output (Content {occurrences, objects, docStructure}) =
|
||||
fmap (updateXrefs xrefSection) <$> getOffsets (outputBody (occurrences, objects))
|
||||
>>= \(body, (xref, startXRef)) ->
|
||||
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
|
||||
>>= \(body, (xref, startXRef)) -> Output.concat [
|
||||
body
|
||||
`mappend` "xref"
|
||||
`nextLine` output xref
|
||||
`mappend` "trailer"
|
||||
`nextLine` output trailer
|
||||
`nextLine` "startxref"
|
||||
`nextLine` (Output.string (printf "%d" startXRef))
|
||||
`nextLine` byteString eofMarker
|
||||
, Output.line "xref"
|
||||
, output xref
|
||||
, Output.line "trailer"
|
||||
, output trailer, newLine
|
||||
, Output.line "startxref"
|
||||
, Output.line (printf "%d" (getOffset startXRef))
|
||||
, byteString eofMarker
|
||||
]
|
||||
where
|
||||
Structure {xrefSection, trailer} = docStructure
|
||||
Structure {xRef, trailer} = docStructure
|
||||
|
|
|
@ -2,47 +2,44 @@
|
|||
{-# LANGUAGE MultiParamTypeClasses #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE AllowAmbiguousTypes #-}
|
||||
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||
module PDF.Output (
|
||||
OBuilder
|
||||
, ObjectId(..)
|
||||
, OContext(..)
|
||||
, Offset(..)
|
||||
, Output(..)
|
||||
, Resource(..)
|
||||
, byteString
|
||||
, char
|
||||
, concat
|
||||
, getOffsets
|
||||
, join
|
||||
, lift
|
||||
, line
|
||||
, newLine
|
||||
, nextLine
|
||||
, saveOffset
|
||||
, string
|
||||
, render
|
||||
) where
|
||||
|
||||
--import Data.ByteString.Builder (Builder, char8, lazyByteString, string8)
|
||||
import Data.ByteString.Builder (Builder, char8, lazyByteString, string8, toLazyByteString)
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Lazy.Char8 as BS (length)
|
||||
import Data.Int (Int64)
|
||||
import Data.ByteString.Builder (Builder, char8, string8, toLazyByteString)
|
||||
import qualified Data.ByteString.Builder as B (byteString)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (length)
|
||||
import qualified Data.ByteString.Lazy as Lazy (ByteString)
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map (singleton)
|
||||
-- #if MIN_VERSION_base(4,9,0)
|
||||
-- import qualified Data.Semigroup as Sem
|
||||
-- #endif
|
||||
import Data.String (IsString(..))
|
||||
--import Control.Monad.Reader (MonadReader(..), Reader, runReader)
|
||||
--import Control.Monad.Reader (MonadReader(..), Reader, runReader)
|
||||
--import Control.Monad.Reader (MonadReader(..), Reader, runReader)
|
||||
import Control.Monad.RWS (RWS, runRWS, ask, get, listen, modify, tell)
|
||||
import qualified PDF.EOL as EOL (Style(..))
|
||||
import Prelude hiding (concat)
|
||||
|
||||
data Offset = StartXRef | ObjectId Int deriving (Eq, Ord)
|
||||
newtype ObjectId = ObjectId {getObjectId :: Int} deriving (Enum, Eq, Ord, Show)
|
||||
newtype Offset = Offset {getOffset :: Int} deriving (Show)
|
||||
|
||||
{-
|
||||
incrOffset :: (Int64 -> Int64) -> OutputState
|
||||
-}
|
||||
data Resource = StartXRef | Object ObjectId deriving (Eq, Ord)
|
||||
|
||||
newtype OContext a = OContext (RWS EOL.Style (Map Offset Int64) Int64 a)
|
||||
newtype OContext a = OContext (RWS EOL.Style (Map Resource Offset) Offset a)
|
||||
type OBuilder = OContext Builder
|
||||
|
||||
instance Functor OContext where
|
||||
|
@ -55,20 +52,23 @@ instance Applicative OContext where
|
|||
instance Monad OContext where
|
||||
(>>=) (OContext a) f = OContext (a >>= (\x -> let OContext y = f x in y))
|
||||
|
||||
saveOffset :: Offset -> OContext ()
|
||||
saveOffset offset = OContext $
|
||||
get >>= tell . Map.singleton offset
|
||||
saveOffset :: Resource -> OContext ()
|
||||
saveOffset resource = OContext $
|
||||
get >>= tell . Map.singleton resource
|
||||
|
||||
lift :: (a -> Builder) -> a -> OBuilder
|
||||
lift f a = return (f a)
|
||||
|
||||
getOffsets :: OBuilder -> OContext (OBuilder, Map Offset Int64)
|
||||
getOffsets :: OBuilder -> OContext (OBuilder, Map Resource Offset)
|
||||
getOffsets (OContext builder) =
|
||||
OContext (listen builder >>= \(a, w) -> return (return a, w))
|
||||
|
||||
append :: OBuilder -> OBuilder -> OBuilder
|
||||
append (OContext a) (OContext b) = OContext (mappend <$> a <*> b)
|
||||
|
||||
concat :: [OBuilder] -> OBuilder
|
||||
concat = foldl mappend mempty
|
||||
|
||||
#if MIN_VERSION_base(4,11,0)
|
||||
instance Semigroup OBuilder where
|
||||
(<>) = append
|
||||
|
@ -94,7 +94,7 @@ instance Output Bool where
|
|||
output True = string "true"
|
||||
|
||||
instance Output a => Output [a] where
|
||||
output = foldl mappend mempty . fmap output
|
||||
output = concat . fmap output
|
||||
|
||||
join :: Output a => String -> [a] -> OBuilder
|
||||
join _ [] = mempty
|
||||
|
@ -102,27 +102,30 @@ join _ [a] = output a
|
|||
join separator (a:as) =
|
||||
output a `mappend` string separator `mappend` (join separator as)
|
||||
|
||||
newLine :: OBuilder
|
||||
newLine = OContext $ buildEOL =<< ask
|
||||
where
|
||||
buildEOL EOL.CR = return (char8 '\r') <* modify (+1)
|
||||
buildEOL EOL.LF = return (char8 '\n') <* modify (+1)
|
||||
buildEOL EOL.CRLF = return (string8 "\r\n") <* modify (+2)
|
||||
offset :: (Int -> Int) -> OContext ()
|
||||
offset f = OContext $ modify $ \(Offset o) -> Offset (f o)
|
||||
|
||||
nextLine :: OBuilder -> OBuilder -> OBuilder
|
||||
nextLine a b = a `mappend` newLine `mappend` b
|
||||
newLine :: OBuilder
|
||||
newLine = buildEOL =<< OContext ask
|
||||
where
|
||||
buildEOL EOL.CR = return (char8 '\r') <* offset (+1)
|
||||
buildEOL EOL.LF = return (char8 '\n') <* offset (+1)
|
||||
buildEOL EOL.CRLF = return (string8 "\r\n") <* offset (+2)
|
||||
|
||||
char :: Char -> OBuilder
|
||||
char c = lift char8 c <* OContext (modify (+1))
|
||||
char c = lift char8 c <* offset (+1)
|
||||
|
||||
string :: String -> OBuilder
|
||||
string s = lift string8 s <* OContext (modify (+ toEnum (length s)))
|
||||
string s = lift string8 s <* offset (+ toEnum (length s))
|
||||
|
||||
line :: String -> OBuilder
|
||||
line l = string l `mappend` newLine
|
||||
|
||||
byteString :: ByteString -> OBuilder
|
||||
byteString bs = lift lazyByteString bs <* OContext (modify (+ BS.length bs))
|
||||
byteString bs = lift B.byteString bs <* offset (+ BS.length bs)
|
||||
|
||||
render :: Output a => EOL.Style -> a -> ByteString
|
||||
render :: Output a => EOL.Style -> a -> Lazy.ByteString
|
||||
render eolStyle a =
|
||||
let OContext builder = output a in
|
||||
let (outputByteString, _, _) = runRWS builder eolStyle 0 in
|
||||
let (outputByteString, _, _) = runRWS builder eolStyle (Offset 0) in
|
||||
toLazyByteString outputByteString
|
||||
|
|
98
src/PDF/Parser.hs
Normal file
98
src/PDF/Parser.hs
Normal file
|
@ -0,0 +1,98 @@
|
|||
module PDF.Parser (
|
||||
Parser
|
||||
, (<?>)
|
||||
, block
|
||||
, char
|
||||
, choice
|
||||
, count
|
||||
, decNumber
|
||||
, hexNumber
|
||||
, many
|
||||
, octDigit
|
||||
, on
|
||||
, oneOf
|
||||
, option
|
||||
, runParser
|
||||
, sepBy
|
||||
, string
|
||||
, takeAll
|
||||
, takeAll1
|
||||
) where
|
||||
|
||||
import Control.Applicative ((<|>), empty)
|
||||
import Control.Monad.State (StateT(..), evalStateT)
|
||||
import Control.Monad.Trans (lift)
|
||||
import qualified Data.Attoparsec.ByteString.Char8 as Atto (
|
||||
Parser, char, parseOnly, satisfy, string, take, takeWhile, takeWhile1
|
||||
)
|
||||
import Data.ByteString (ByteString)
|
||||
import Data.Char (toLower)
|
||||
import Data.Set (Set)
|
||||
import qualified Data.Set as Set (fromList, member, unions)
|
||||
|
||||
type Parser s = StateT s Atto.Parser
|
||||
|
||||
(<?>) :: Parser s a -> String -> Parser s a
|
||||
(<?>) parser debugMessage = parser <|> fail debugMessage
|
||||
|
||||
block :: Int -> Parser s ByteString
|
||||
block = lift . Atto.take
|
||||
|
||||
char :: Char -> Parser s Char
|
||||
char = lift . Atto.char
|
||||
|
||||
choice :: [Parser s a] -> Parser s a
|
||||
choice = foldr (<|>) empty
|
||||
|
||||
count :: Int -> Parser s a -> Parser s [a]
|
||||
count 0 _ = return []
|
||||
count n p = (:) <$> p <*> count (n-1) p
|
||||
|
||||
decNumber :: Parser s ByteString
|
||||
decNumber = lift $ Atto.takeWhile1 (`Set.member` digits)
|
||||
|
||||
digits :: Set Char
|
||||
digits = Set.fromList ['0'..'9']
|
||||
|
||||
hexDigits :: Set Char
|
||||
hexDigits = Set.unions [digits, Set.fromList af, Set.fromList $ toLower <$> af]
|
||||
where
|
||||
af = ['A'..'F']
|
||||
|
||||
hexNumber :: Parser s ByteString
|
||||
hexNumber = lift $ Atto.takeWhile1 (`Set.member` hexDigits)
|
||||
|
||||
many :: Parser s a -> Parser s [a]
|
||||
many parser = (:) <$> parser <*> many parser <|> return []
|
||||
|
||||
octDigit :: Parser s Char
|
||||
octDigit = oneOf ['0'..'7']
|
||||
|
||||
on :: Parser s a -> ByteString -> Parser s (Either String a)
|
||||
on (StateT parserF) input = StateT $ \state ->
|
||||
case Atto.parseOnly (parserF state) input of
|
||||
Left errorMsg -> return (Left errorMsg, state)
|
||||
Right (result, newState) -> return (Right result, newState)
|
||||
|
||||
oneOf :: String -> Parser s Char
|
||||
oneOf charSet = lift $ Atto.satisfy (`elem` charSet)
|
||||
|
||||
option :: a -> Parser s a -> Parser s a
|
||||
option defaultValue p = p <|> pure defaultValue
|
||||
|
||||
runParser :: Parser s a -> s -> ByteString -> Either String a
|
||||
runParser parser initState =
|
||||
Atto.parseOnly (evalStateT parser initState)
|
||||
|
||||
sepBy :: Parser s a -> Parser s b -> Parser s [a]
|
||||
sepBy parser separator =
|
||||
option [] $ (:) <$> parser <*> many (separator *> parser)
|
||||
|
||||
string :: ByteString -> Parser s ByteString
|
||||
string = lift . Atto.string
|
||||
|
||||
takeAll :: (Char -> Bool) -> Parser s ByteString
|
||||
takeAll = lift . Atto.takeWhile
|
||||
|
||||
takeAll1 :: (Char -> Bool) -> Parser s ByteString
|
||||
takeAll1 = lift . Atto.takeWhile1
|
47
src/PDF/Update.hs
Normal file
47
src/PDF/Update.hs
Normal file
|
@ -0,0 +1,47 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module PDF.Update (
|
||||
unify
|
||||
) where
|
||||
|
||||
import Data.Map (member)
|
||||
import qualified Data.Map as Map (empty, union)
|
||||
import PDF.Object (
|
||||
Content(..), IndexedObjects, IndirectObjCoordinates(..), Occurrence(..)
|
||||
, Structure(..)
|
||||
)
|
||||
|
||||
emptyContent :: Content
|
||||
emptyContent = Content {
|
||||
docStructure = Structure {xRef = Map.empty, trailer = Map.empty}
|
||||
, objects = Map.empty
|
||||
, occurrences = []
|
||||
}
|
||||
|
||||
unify :: [Content] -> Content
|
||||
unify = foldl complete emptyContent
|
||||
where
|
||||
complete tmpContent older =
|
||||
let mergedObjects = Map.union (objects tmpContent) (objects older) in
|
||||
Content {
|
||||
docStructure =
|
||||
unifyDocStructure (docStructure tmpContent) (docStructure older)
|
||||
, objects = mergedObjects
|
||||
, occurrences =
|
||||
unifyOccurrences mergedObjects (occurrences tmpContent) (occurrences older)
|
||||
}
|
||||
|
||||
unifyDocStructure :: Structure -> Structure -> Structure
|
||||
unifyDocStructure update original = Structure {
|
||||
xRef = Map.union (xRef update) (xRef original)
|
||||
, trailer = Map.union (trailer update) (trailer original)
|
||||
}
|
||||
|
||||
unifyOccurrences :: IndexedObjects -> [Occurrence] -> [Occurrence] -> [Occurrence]
|
||||
unifyOccurrences objects update = foldr addOlder update
|
||||
where
|
||||
addOlder occurrence@(Comment _) existing = occurrence : existing
|
||||
addOlder occurrence@(Indirect indirect) existing =
|
||||
if objectId indirect `member` objects
|
||||
then occurrence : existing
|
||||
else existing
|
||||
|
Loading…
Reference in New Issue
Block a user