Hufflepdf/src/PDF/Object.hs

446 lines
13 KiB
Haskell
Raw Normal View History

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.Object (
Content(..)
, Dictionary
, DirectObject(..)
, Flow(..)
, IndexedObjects
, IndirectObjCoordinates(..)
2019-05-18 09:01:13 +02:00
, InputStructure(..)
, Name(..)
, Number(..)
, Object(..)
, Occurrence(..)
, StringObject(..)
, Structure(..)
, XRefEntry(..)
, XRefSection
, (//)
, array
, blank
, dictionary
, directObject
, eofMarker
, integer
, line
, magicNumber
, name
, number
, regular
, stringObject
, structure
, toByteString
) where
import Control.Applicative ((<|>), many)
import Control.Monad.Reader (MonadReader(..))
import Control.Monad.Fail (MonadFail(..))
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (concat)
import qualified Data.ByteString.Char8 as Char8 (cons, pack, singleton, unpack)
import Data.ByteString.Char8.Util (B16Int(..), b16ToBytes, unescape)
2019-05-18 09:01:13 +02:00
import Data.Map (Map, (!), mapWithKey)
import qualified Data.Map as Map (
delete, empty, fromList, lookup, minViewWithKey, toList, union
)
import qualified PDF.EOL as EOL (charset, parser)
import qualified PDF.Output as Output (concat, line, string)
import PDF.Output (
OBuilder, ObjectId(..), Offset(..), Output(..), Resource(..)
, byteString, getObjectId, getOffset, getOffsets, join, newLine
, saveOffset
)
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
import Prelude hiding (fail)
import Text.Printf (printf)
line :: MonadParser m => String -> m ()
2019-11-27 18:05:47 +01:00
line l = (string (Char8.pack l) *> blank *> return ()) <?> printf "line «%s»" l
2019-05-24 10:48:09 +02:00
magicNumber :: ByteString
magicNumber = "%PDF-"
eofMarker :: ByteString
eofMarker = "%%EOF"
whiteSpaceCharset :: String
whiteSpaceCharset = "\0\t\12 "
blank :: MonadParser m => m ()
blank = takeAll (`elem` (EOL.charset ++ whiteSpaceCharset)) *> pure ()
delimiterCharset :: String
delimiterCharset = "()<>[]{}/%"
2019-05-24 10:48:09 +02:00
regular :: Char -> Bool
regular = not . (`elem` (EOL.charset ++ whiteSpaceCharset ++ delimiterCharset))
integer :: (Read a, Num a, MonadParser m) => m a
integer = read . Char8.unpack <$> decNumber <* blank <?> "decimal integer"
-------------------------------------
-- OBJECTS
-------------------------------------
type IndexedObjects = Map ObjectId Object
--
-- Boolean
--
boolean :: MonadParser m => m Bool
2019-05-24 10:48:09 +02:00
boolean =
(string "true" *> return True) <|> (string "false" *> return False) <?> "boolean"
--
-- Number
--
newtype Number = Number Double deriving Show
instance Output Number where
output (Number f) = Output.string $
case properFraction f of
(n, 0) -> printf "%d" (n :: Int)
_ -> printf "%f" f
number :: MonadParser m => m Number
number = Number . read . Char8.unpack <$>
(mappend <$> sign <*> (integerPart <|> Char8.cons '0' <$> floatPart))
2019-05-24 10:48:09 +02:00
<?> "number"
where
sign = string "-" <|> option "" (char '+' >> return "")
2019-05-24 10:48:09 +02:00
integerPart = mappend <$> decNumber <*> option "" floatPart
floatPart = Char8.cons <$> char '.' <*> (option "0" $ decNumber)
--
-- StringObject
--
data StringObject = Literal ByteString | Hexadecimal B16Int deriving Show
instance Output StringObject where
output (Literal s) = Output.string (printf "(%s)" (Char8.unpack s))
output (Hexadecimal (B16Int n)) = Output.string (printf "<%s>" (Char8.unpack n))
stringObject :: MonadParser m => m StringObject
stringObject =
Literal <$> (char '(' *> (BS.concat <$> literalString) <* char ')')
<|> Hexadecimal <$> (char '<' *> hexNumber <* char '>')
2019-05-24 10:48:09 +02:00
<?> "string object (literal or hexadecimal)"
where
2019-05-24 10:48:09 +02:00
literalString = many literalStringBlock
literalStringBlock = takeAll1 normalChar <|> matchingParenthesis <|> escapedChar
normalChar = not . (`elem` ("\\()" :: String))
matchingParenthesis =
mappend <$> (Char8.cons <$> char '(' <*> literalStringBlock) <*> string ")"
2019-05-24 10:48:09 +02:00
escapedChar =
Char8.cons <$> char '\\' <*> (Char8.singleton <$> oneOf "nrtbf()\\\n" <|> octalCode)
octalCode = choice $ (\n -> Char8.pack <$> count n octDigit) <$> [1..3]
toByteString :: StringObject -> ByteString
toByteString (Hexadecimal h) = b16ToBytes h
toByteString (Literal s) = unescape s
--
-- Name
--
newtype Name = Name String deriving (Eq, Ord, Show)
instance Output Name where
output (Name n) = Output.string ('/':n)
name :: MonadParser m => m Name
name = Name . Char8.unpack <$> (char '/' *> takeAll regular) <?> "name"
--
-- Array
--
array :: MonadParser m => m [DirectObject]
2019-05-24 10:48:09 +02:00
array =
char '[' *> blank *> directObject `sepBy` blank <* blank <* char ']' <?> "array"
--
-- Dictionary
--
type Dictionary = Map Name DirectObject
instance Output Dictionary where
output dict =
"<<" `mappend` keyValues `mappend` ">>"
where
keyValues = join " " $ outputKeyVal <$> Map.toList dict
outputKeyVal :: (Name, DirectObject) -> OBuilder
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
dictionary :: MonadParser m => m Dictionary
dictionary =
2019-05-24 10:48:09 +02:00
string "<<" *> blank *> keyValPairs <* string ">>" <?> "dictionary"
where
keyVal = (,) <$> name <* blank <*> directObject
2019-05-24 10:48:09 +02:00
keyValPairs = Map.fromList <$> keyVal `sepBy` blank <* blank
--
-- Null
--
nullObject :: MonadParser m => m ()
2019-05-24 10:48:09 +02:00
nullObject = string "null" *> return () <?> "null object"
--
-- Reference
--
data IndirectObjCoordinates = IndirectObjCoordinates {
objectId :: ObjectId
, versionNumber :: Int
} deriving Show
reference :: MonadParser m => m IndirectObjCoordinates
reference = IndirectObjCoordinates
<$> (fmap ObjectId integer) <*> integer <* char 'R' <?> "reference to an object"
--
-- DirectObject
--
data DirectObject =
Boolean Bool
| NumberObject Number
| StringObject StringObject
| NameObject Name
| Array [DirectObject]
| Dictionary Dictionary
| Null
| Reference IndirectObjCoordinates
deriving Show
instance Output DirectObject where
output (Boolean b) = output b
output (NumberObject n) = output n
output (StringObject s) = output s
output (NameObject n) = output n
output (Array a) = Output.concat ["[", join " " a, "]"]
output (Dictionary d) = output d
output (Null) = "null"
output (Reference (IndirectObjCoordinates {objectId, versionNumber})) =
Output.string (printf "%d %d R" (getObjectId objectId) versionNumber)
directObject :: MonadParser m => m DirectObject
directObject =
2019-05-24 10:48:09 +02:00
Boolean <$> boolean
<|> Reference <$> reference {- defined before Number because Number is a prefix of it -}
<|> NumberObject <$> number
<|> StringObject <$> stringObject
<|> NameObject <$> name
<|> Array <$> array
<|> Dictionary <$> dictionary
<|> const Null <$> nullObject
<?> "direct object"
--
-- Object
--
data Object =
Direct DirectObject
| Stream {
header :: Dictionary
, streamContent :: ByteString
}
deriving Show
instance Output Object where
output (Direct d) = output d
output (Stream {header, streamContent}) = Output.concat [
output header, newLine
, Output.line "stream"
, byteString streamContent
, "endstream"
]
--
-- Occurrence
--
data Occurrence = Comment String | Indirect IndirectObjCoordinates deriving Show
outputOccurrence :: IndexedObjects -> Occurrence -> OBuilder
outputOccurrence _ (Comment c) = Output.line c
2019-05-16 22:41:14 +02:00
outputOccurrence objects (Indirect (IndirectObjCoordinates {objectId, versionNumber})) =
saveOffset (Object objectId) >> Output.concat [
Output.line (printf "%d %d obj" (getObjectId objectId) versionNumber)
, output (objects ! objectId), newLine
, Output.line "endobj"
]
2019-05-16 22:41:14 +02:00
-------------------------------------
-- XREF TABLE
-------------------------------------
--
-- XRefEntry
--
data XRefEntry = InUse {
offset :: Offset
, generation :: Int
} | Free {
nextFree :: ObjectId
, generation :: Int
} deriving Show
instance Output XRefEntry where
output (InUse {offset, generation}) =
Output.line (printf "%010d %05d n " (getOffset offset) generation)
output (Free {nextFree, generation}) =
Output.line (printf "%010d %05d f " (getObjectId nextFree) generation)
entry :: Parser u XRefEntry
entry = do
(big, small) <- (,) <$> integer <*> integer
2019-05-24 10:48:09 +02:00
(inUse big small <|> free big small <?> "XRef entry") <* blank
where
2019-05-24 10:48:09 +02:00
inUse :: Int -> Int -> Parser u XRefEntry
inUse big generation =
char 'n' *> return (InUse {offset = Offset big, generation})
2019-05-24 10:48:09 +02:00
free :: Int -> Int -> Parser u XRefEntry
free big generation =
char 'f' *> return (Free {nextFree = ObjectId big, generation})
--
-- XRefSubSection
--
data XRefSubSection = XRefSubSection {
firstObjectId :: ObjectId
, entries :: [XRefEntry]
} deriving Show
instance Output XRefSubSection where
output (XRefSubSection {firstObjectId, entries}) =
Output.line (printf "%d %d" (getObjectId firstObjectId) (length entries))
`mappend` output entries
xRefSubSection :: Parser u XRefSubSection
xRefSubSection = do
(firstId, entriesNumber) <- (,) <$> integer <*> integer <?> "XRef subsection"
entries <- count entriesNumber entry
return $ XRefSubSection {firstObjectId = ObjectId firstId, entries}
type XRefSection = Map ObjectId XRefEntry
instance Output XRefSection where
output = output . sections
where
sections tmp =
case Map.minViewWithKey tmp of
Nothing -> []
Just ((objectId@(ObjectId value), firstEntry), rest) ->
let (subSection, sndRest) = section objectId [firstEntry] (value + 1) rest in
subSection : sections sndRest
section firstObjectId stack nextValue tmp =
let nextId = ObjectId nextValue in
case Map.lookup nextId tmp of
Nothing -> (XRefSubSection {firstObjectId, entries = reverse stack}, tmp)
Just nextEntry ->
section firstObjectId (nextEntry:stack) (nextValue + 1) (Map.delete nextId tmp)
xRefSection :: Parser u XRefSection
xRefSection = foldr addSubsection Map.empty <$>
(line "xref" *> xRefSubSection `sepBy` many EOL.parser)
where
addSubsection (XRefSubSection {firstObjectId, entries}) =
Map.union . Map.fromList $ zip ([firstObjectId..]) entries
--
-- Structure
--
2019-05-18 09:01:13 +02:00
data InputStructure = InputStructure {
2019-05-24 10:48:09 +02:00
startOffset :: Int
2019-05-18 09:01:13 +02:00
, inputStructure :: Structure
}
data Structure = Structure {
xRef :: XRefSection
2019-05-18 09:01:13 +02:00
, trailer :: Dictionary
} deriving Show
structure :: Parser u Structure
structure =
2019-05-18 09:01:13 +02:00
Structure
<$> xRefSection
<*> (string "trailer" *> blank *> dictionary <* EOL.parser)
updateXrefs :: XRefSection -> Map Resource Offset -> (XRefSection, Offset)
updateXrefs xRef offsets = (mapWithKey updateEntry xRef, offsets ! StartXRef)
2019-05-18 09:01:13 +02:00
where
updateEntry objectId e@(InUse {}) = e {offset = offsets ! (Object objectId)}
updateEntry _ e = e
--
-- Flow
--
data Flow = Flow {
occurrencesStack :: [Occurrence]
, tmpObjects :: IndexedObjects
} deriving Show
--
-- Content
--
data Content = Content {
occurrences :: [Occurrence]
, objects :: IndexedObjects
2019-05-18 09:01:13 +02:00
, docStructure :: Structure
} deriving Show
outputBody :: ([Occurrence], IndexedObjects) -> OBuilder
outputBody (occurrences, objects) =
output (outputOccurrence objects <$> occurrences) <* saveOffset StartXRef
instance Output Content where
2019-05-18 09:01:13 +02:00
output (Content {occurrences, objects, docStructure}) =
fmap (updateXrefs xRef) <$> getOffsets (outputBody (occurrences, objects))
>>= \(body, (xref, startXRef)) -> Output.concat [
body
, Output.line "xref"
, output xref
, Output.line "trailer"
, output trailer, newLine
, Output.line "startxref"
, Output.line (printf "%d" (getOffset startXRef))
, byteString eofMarker
]
2019-05-18 09:01:13 +02:00
where
Structure {xRef, trailer} = docStructure
type PDFContent m = (MonadReader Content m, MonadFail m)
--
-- Navigation
--
key :: MonadFail m => String -> Dictionary -> m DirectObject
key keyName dictionary =
maybe (fail errorMessage) return (Map.lookup (Name keyName) dictionary)
where
errorMessage =
printf "Key %s not found in dictionary %s" keyName (show dictionary)
dict :: PDFContent m => Object -> m Dictionary
dict (Direct (Dictionary aDict)) = return aDict
dict obj = fail $ "Not a dictionary : " ++ show obj
getObject :: PDFContent m => ObjectId -> m Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
getResource :: PDFContent m => DirectObject -> m Dictionary
getResource (Dictionary aDict) = return aDict
getResource (Reference (IndirectObjCoordinates {objectId})) =
getObject objectId >>= dict
getResource directObject =
fail $ "Not a resource (dictionary or reference) : " ++ show directObject
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
(//) aDict [] = return $ Dictionary aDict
(//) aDict [field] = key field aDict
(//) aDict (field:fields) = key field aDict >>= getResource >>= (// fields)