Hufflepdf/src/PDF/Object.hs

407 lines
12 KiB
Haskell
Raw Normal View History

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleInstances #-}
module PDF.Object (
Content(..)
, Dictionary
, DirectObject(..)
, Flow(..)
, IndexedObjects
, IndirectObjCoordinates(..)
2019-05-18 09:01:13 +02:00
, InputStructure(..)
, Name(..)
, Number(..)
, Object(..)
, Occurrence(..)
, StringObject(..)
, Structure(..)
, XRefEntry(..)
, XRefSection
, array
, blank
, dictionary
, directObject
, eofMarker
, integer
, line
, magicNumber
, name
, number
, regular
, stringObject
, structure
, toByteString
) where
import Control.Applicative ((<|>), many)
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (concat)
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
2019-05-18 09:01:13 +02:00
import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union
)
import qualified PDF.EOL as EOL (charset, parser)
import qualified PDF.Output as Output (concat, line, string)
import PDF.Output (
OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
, byteString, getObjectId, getOffset, getOffsets, join, newLine
, saveOffset
)
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
import Text.Printf (printf)
line :: MonadParser m => String -> m ()
2019-11-27 18:05:47 +01:00
line l = (string (Char8.pack l) *> blank *> return ()) <?> printf "line «%s»" l
2019-05-24 10:48:09 +02:00
magicNumber :: ByteString
magicNumber = "%PDF-"
eofMarker :: ByteString
eofMarker = "%%EOF"
whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 "
blank :: MonadParser m => m ()
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> pure ()
delimiterCharset :: String
delimiterCharset = "()<>[]{}/%"
2019-05-24 10:48:09 +02:00
regular :: Char -> Bool
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
integer :: (Read a, Num a, MonadParser m) => m a
integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"
-------------------------------------
-- OBJECTS
-------------------------------------
type IndexedObjects = Map ObjectId Object
--
-- Boolean
--
boolean :: MonadParser m => m Bool
2019-05-24 10:48:09 +02:00
boolean =
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
--
-- Number
--
newtype Number = Number Double deriving Show
instance Output Number where
output (Number f) = Output.string $
case properFraction f of
(n, 0) -> printf "%d" (n :: Int)
_ -> printf "%f" f
number :: MonadParser m => m Number
number = Number . read . Char8.unpack <$>
(mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
2019-05-24 10:48:09 +02:00
<?> "number"
where
sign = string "-" <|> option "" (char '+' >> return "")
2019-05-24 10:48:09 +02:00
integerPart = mappend <$> decNumber <*> option "" floatPart
floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
--
-- StringObject
--
data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show
instance Output StringObject where
output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
stringObject :: MonadParser m => m StringObject
stringObject =
Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
2019-05-24 10:48:09 +02:00
<?> "string object (literal or hexadecimal)"
where
2019-05-24 10:48:09 +02:00
literalString = many literalStringBlock
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
normalChar = not . (`elem` ("\\()" :: String))
matchingParenthesis =
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
2019-05-24 10:48:09 +02:00
escapedChar =
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
toByteString :: StringObject -> ByteString
toByteString (Hexadecimal h) = b16ToBytes h
toByteString (Literal s) = unescape s
--
-- Name
--
newtype Name = Name String deriving (Eq, Ord, Show)
instance Output Name where
output (Name n) = Output.string ('/':n)
name :: MonadParser m => m Name
name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
--
-- Array
--
array :: MonadParser m => m [DirectObject]
2019-05-24 10:48:09 +02:00
array =
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
--
-- Dictionary
--
type Dictionary = Map Name DirectObject
instance Output Dictionary where
output dict =
"<<" `mappend` keyValues `mappend` ">>"
where
keyValues = join " " $ outputKeyVal <$> Map.toList dict
outputKeyVal :: (Name, DirectObject) -> OBuilder
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
dictionary :: MonadParser m => m Dictionary
dictionary =
2019-05-24 10:48:09 +02:00
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
where
keyVal = (,) <$> name <* blank <*> directObject
2019-05-24 10:48:09 +02:00
keyValPairs = Map.fromList <$> keyVal `sepBy` blank <* blank
--
-- Null
--
nullObject :: MonadParser m => m ()
2019-05-24 10:48:09 +02:00
nullObject = string "null" *> return () <?> "null object"
--
-- Reference
--
data IndirectObjCoordinates = IndirectObjCoordinates {
objectId :: ObjectId
, versionNumber :: Int
} deriving Show
reference :: MonadParser m => m IndirectObjCoordinates
reference = IndirectObjCoordinates
<$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
--
-- DirectObject
--
data DirectObject =
Boolean Bool
| NumberObject Number
| StringObject StringObject
| NameObject Name
| Array [DirectObject]
| Dictionary Dictionary
| Null
| Reference IndirectObjCoordinates
deriving Show
instance Output DirectObject where
output (Boolean b) = output b
output (NumberObject n) = output n
output (StringObject s) = output s
output (NameObject n) = output n
output (Array a) = Output.concat ["[", join " " a, "]"]
output (Dictionary d) = output d
output (Null) = "null"
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
directObject :: MonadParser m => m DirectObject
directObject =
2019-05-24 10:48:09 +02:00
Boolean <$> boolean
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> number
<|> StringObject <$> stringObject
<|> NameObject <$> name
<|> Array <$> array
<|> Dictionary <$> dictionary
<|> const Null <$> nullObject
<?> "direct object"
--
-- Object
--
data Object =
Direct DirectObject
| Stream {
header :: Dictionary
, streamContent :: ByteString
}
deriving Show
instance Output Object where
output (Direct d) = output d
output (Stream {header, streamContent}) = Output.concat [
output header, newLine
, Output.line "stream"
, byteString streamContent
, "endstream"
]
--
-- Occurrence
--
data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
outputOccurrence :: IndexedObjects -> Occurrence -> OBuilder
outputOccurrence _ (Comment c) = Output.line c
2019-05-16 22:41:14 +02:00
outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNumber})) =
saveOffset (Object objectId) >> Output.concat [
Output.line (printf "%d %d obj" (getObjectId objectId) versionNumber)
, output (objects ! objectId), newLine
, Output.line "endobj"
]
2019-05-16 22:41:14 +02:00
-------------------------------------
-- XREF TABLE
-------------------------------------
--
-- XRefEntry
--
data XRefEntry = InUse {
offset :: Offset
, generation :: Int
} | Free {
nextFree :: ObjectId
, generation :: Int
} deriving Show
instance Output XRefEntry where
output (InUse {offset, generation}) =
Output.line (printf "%010d %05d n " (getOffset offset) generation)
output (Free {nextFree, generation}) =
Output.line (printf "%010d %05d f " (getObjectId nextFree) generation)
entry :: Parser u XRefEntry
entry = do
(big, small) <- (,) <$> integer <*> integer
2019-05-24 10:48:09 +02:00
(inUse big small <|> free big small <?> "XRef entry") <* blank
where
2019-05-24 10:48:09 +02:00
inUse :: Int -> Int -> Parser u XRefEntry
inUse big generation =
char 'n' *> return (InUse {offset = Offset big, generation})
2019-05-24 10:48:09 +02:00
free :: Int -> Int -> Parser u XRefEntry
free big generation =
char 'f' *> return (Free {nextFree = ObjectId big, generation})
--
-- XRefSubSection
--
data XRefSubSection = XRefSubSection {
firstObjectId :: ObjectId
, entries :: [XRefEntry]
} deriving Show
instance Output XRefSubSection where
output (XRefSubSection {firstObjectId, entries}) =
Output.line (printf "%d %d" (getObjectId firstObjectId) (length entries))
`mappend` output entries
xRefSubSection :: Parser u XRefSubSection
xRefSubSection = do
(firstId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
entries <- count entriesNumber entry
return $ XRefSubSection {firstObjectId = ObjectId firstId, entries}
type XRefSection = Map ObjectId XRefEntry
instance Output XRefSection where
output = output . sections
where
sections tmp =
case Map.minViewWithKey tmp of
Nothing -> []
Just ((objectId@(ObjectId value), firstEntry), rest) ->
let (subSection, sndRest) = section objectId [firstEntry] (value + 1) rest in
subSection : sections sndRest
section firstObjectId stack nextValue tmp =
let nextId = ObjectId nextValue in
case Map.lookup nextId tmp of
Nothing -> (XRefSubSection {firstObjectId, entries = reverse stack}, tmp)
Just nextEntry ->
section firstObjectId (nextEntry:stack) (nextValue + 1) (Map.delete nextId tmp)
xRefSection :: Parser u XRefSection
xRefSection = foldr addSubsection Map.empty <$>
(line "xref" *> xRefSubSection `sepBy` many EOL.parser)
where
addSubsection (XRefSubSection {firstObjectId, entries}) =
Map.union . Map.fromList $ zip ([firstObjectId..]) entries
--
-- Structure
--
2019-05-18 09:01:13 +02:00
data InputStructure = InputStructure {
2019-05-24 10:48:09 +02:00
startOffset :: Int
2019-05-18 09:01:13 +02:00
, inputStructure :: Structure
}
data Structure = Structure {
xRef :: XRefSection
2019-05-18 09:01:13 +02:00
, trailer :: Dictionary
} deriving Show
structure :: Parser u Structure
structure =
2019-05-18 09:01:13 +02:00
Structure
<$> xRefSection
<*> (string "trailer" *> blank *> dictionary <* EOL.parser)
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
2019-05-18 09:01:13 +02:00
where
updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
updateEntry _ e = e
--
-- Flow
--
data Flow = Flow {
occurrencesStack :: [Occurrence]
, tmpObjects :: IndexedObjects
} deriving Show
--
-- Content
--
data Content = Content {
occurrences :: [Occurrence]
, objects :: IndexedObjects
2019-05-18 09:01:13 +02:00
, docStructure :: Structure
} deriving Show
outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
outputBody (occurrences, objects) =
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
instance Output Content where
2019-05-18 09:01:13 +02:00
output (Content {occurrences, objects, docStructure}) =
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
>>= \(body, (xref, startXRef)) -> Output.concat [
body
, Output.line "xref"
, output xref
, Output.line "trailer"
, output trailer, newLine
, Output.line "startxref"
, Output.line (printf "%d" (getOffset startXRef))
, byteString eofMarker
]
2019-05-18 09:01:13 +02:00
where
Structure {xRef, trailer} = docStructure