Hufflepdf/src/PDF/Body.hs

117 lines
4.0 KiB
Haskell

{-# LANGUAGE NamedFieldPuns #-}
module PDF.Body where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
import Data.Int (Int64)
import qualified Data.Map as Map (insert, lookup)
import PDF.Object (
Content(..), DirectObject(..), IndirectObjCoordinates(..), Object(..)
, Occurrence(..), Parser, XRefEntry(..), XRefSection, XRefSubSection(..)
, eol, eolCharset, dictionary, directObject, integer, line
)
import Text.Parsec
data UserState = UserState {
input :: ByteString
, content :: Content
}
type SParser = Parser UserState
modifyContent :: (Content -> Content) -> SParser ()
modifyContent f = modifyState $ \state -> state {content = f $ content state}
addObject :: Int -> Object -> SParser ()
addObject objectId newObject = modifyContent $ \content -> content {
objects = Map.insert objectId newObject $ objects content
}
pushOccurrence :: Occurrence -> SParser ()
pushOccurrence newOccurrence = modifyContent $ \content -> content {
body = newOccurrence : (body content)
}
comment :: Parser u String
comment = char '%' *> many (noneOf eolCharset) <* eol
lookupOffset :: Int -> XRefSection -> Maybe Int64
lookupOffset _ [] = Nothing
lookupOffset objectId (xrefSubSection:others) =
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
let index = objectId - firstObjectId in
if index >= 0 && index < entriesNumber
then
case Map.lookup index entries of
Just (InUse {offset}) -> Just offset
_ -> Nothing
else lookupOffset objectId others
getOffset :: Int -> SParser Int64
getOffset objectId = do
Content {xrefSection} <- content <$> getState
case lookupOffset objectId xrefSection of
Nothing -> fail $
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
Just offset -> return offset
loadNumber :: Int -> SParser Float
loadNumber objectId = do
offset <- getOffset objectId
state <- getState
let objectStart = BS.drop offset (input state)
case runParser object state ("object@" ++ show offset) objectStart of
Left _ -> fail $ "Could not parse length object@" ++ show offset
Right number@(Direct (Number n)) ->
addObject objectId number >> return n
Right obj -> fail $
"Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
invalidValue :: Object -> String
invalidValue v = "Invalid value " ++ show v
getSize :: Maybe DirectObject -> SParser Float
getSize Nothing = fail "Missing '/Length' key on stream"
getSize (Just (Number size)) = return size
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
Content {objects} <- content <$> getState
case Map.lookup objectId objects of
Nothing -> loadNumber objectId
Just (Direct (Number size)) -> return size
Just v -> fail $
invalidValue v ++ " for obj " ++ show objectId ++ "used as /Length"
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
object :: SParser Object
object =
Direct <$> directObject
<|> do
header <- dictionary
size <- getSize (Map.lookup "Length" header)
streamContent <- BS.pack <$> stream (truncate size)
return $ Stream {header, streamContent}
where
stream size = line "stream" *> count size anyChar <* eol <* line "endstream"
occurrence :: SParser Occurrence
occurrence = Comment <$> comment <|> indirectObj
where
indirectObj = do
objectId <- integer
coordinates <- IndirectObjCoordinates objectId <$> integer
objectValue <- line "obj" *> object <* eol <* line "endobj"
addObject objectId objectValue
return $ Indirect coordinates
populate :: ByteString -> Content -> Content
populate input initialContent =
case runParser recurseOnOccurrences initialState "" input of
Left _ -> initialContent
Right finalState -> content finalState
where
initialState = UserState {input, content = initialContent}
recurseOnOccurrences :: SParser UserState
recurseOnOccurrences =
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState