Hufflepdf/src/PDF/Body.hs

139 lines
4.8 KiB
Haskell
Raw Normal View History

{-# LANGUAGE NamedFieldPuns #-}
module PDF.Body (
populate
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
import Data.Int (Int64)
import Data.Map ((!))
import qualified Data.Map as Map (empty, insert, lookup)
2019-05-16 22:41:14 +02:00
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Object (
2019-05-18 09:01:13 +02:00
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
, Parser, Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..)
2019-05-16 22:41:14 +02:00
, blank, dictionary, directObject, integer, line
)
import Text.Parsec
data UserState = UserState {
input :: ByteString
2019-05-18 09:01:13 +02:00
, xreferences :: XRefSection
, flow :: Flow
}
type SParser = Parser UserState
modifyFlow :: (Flow -> Flow) -> SParser ()
modifyFlow f = modifyState $ \state -> state {flow = f $ flow state}
addObject :: Int -> Object -> SParser ()
addObject objectId newObject = modifyFlow $ \flow -> flow {
tmpObjects = Map.insert objectId newObject $ tmpObjects flow
}
pushOccurrence :: Occurrence -> SParser ()
pushOccurrence newOccurrence = modifyFlow $ \flow -> flow {
occurrencesStack = newOccurrence : (occurrencesStack flow)
}
comment :: Parser u String
2019-05-16 22:41:14 +02:00
comment = char '%' *> many (noneOf EOL.charset) <* EOL.parser
lookupOffset :: Int -> XRefSection -> Maybe Int64
lookupOffset _ [] = Nothing
lookupOffset objectId (xrefSubSection:others) =
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
let index = objectId - firstObjectId in
if index >= 0 && index < entriesNumber
then
case Map.lookup index entries of
Just (InUse {offset}) -> Just offset
_ -> Nothing
else lookupOffset objectId others
getOffset :: Int -> SParser Int64
getOffset objectId = do
2019-05-18 09:01:13 +02:00
table <- xreferences <$> getState
case lookupOffset objectId table of
Nothing -> fail $
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
Just offset -> return offset
on :: Monad m => ParsecT s u m a -> s -> ParsecT s u m a
on parser input = do
originalInput <- getInput
setInput input >> parser <* setInput originalInput
loadNumber :: Int -> SParser Float
loadNumber objectId = do
offset <- getOffset objectId
objectStart <- BS.drop offset . input <$> getState
indirectObjCoordinates `on` objectStart >> return ()
objectValue <- (!objectId) . tmpObjects . flow <$> getState
case objectValue of
Direct (NumberObject (Number n)) -> return n
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
invalidValue :: Object -> String
invalidValue v = "Invalid value " ++ show v
getSize :: Maybe DirectObject -> SParser Float
getSize Nothing = fail "Missing '/Length' key on stream"
getSize (Just (NumberObject (Number size))) = return size
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
Flow {tmpObjects} <- flow <$> getState
case Map.lookup objectId tmpObjects of
Nothing -> loadNumber objectId
Just (Direct (NumberObject (Number size))) -> return size
Just v -> fail $
invalidValue v ++ " for obj " ++ show objectId ++ "used as /Length"
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
streamObject :: SParser Object
streamObject = try $ do
header <- dictionary <* blank
size <- getSize (Map.lookup (Name "Length") header)
streamContent <- BS.pack <$> stream (truncate size)
return $ Stream {header, streamContent}
where
stream size = line "stream" *> count size anyChar <* blank <* line "endstream"
object :: SParser Object
object = streamObject <|> Direct <$> directObject
indirectObjCoordinates :: SParser IndirectObjCoordinates
indirectObjCoordinates = do
objectId <- integer
coordinates <- IndirectObjCoordinates objectId <$> integer
objectValue <- line "obj" *> object <* blank <* line "endobj"
addObject objectId objectValue
return coordinates
occurrence :: SParser Occurrence
occurrence = Comment <$> comment <|> Indirect <$> indirectObjCoordinates
2019-05-18 09:01:13 +02:00
populate :: ByteString -> InputStructure -> Content
populate input structure =
let bodyInput = BS.drop (startOffset structure) input in
case runParser recurseOnOccurrences initialState "" bodyInput of
2019-05-18 09:01:13 +02:00
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
Right finalState ->
let Flow {occurrencesStack, tmpObjects} = flow finalState in
2019-05-18 09:01:13 +02:00
Content {
occurrences = reverse occurrencesStack, objects = tmpObjects, docStructure
}
where
2019-05-18 09:01:13 +02:00
docStructure = inputStructure structure
xreferences = xrefSection docStructure
initialState = UserState {
2019-05-18 09:01:13 +02:00
input, xreferences, flow = Flow {
occurrencesStack = [], tmpObjects = Map.empty
}
}
recurseOnOccurrences :: SParser UserState
recurseOnOccurrences =
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> getState