Hufflepdf/src/PDF/Body.hs

132 lines
4.7 KiB
Haskell
Raw Normal View History

{-# LANGUAGE NamedFieldPuns #-}
2019-05-24 10:48:09 +02:00
{-# LANGUAGE OverloadedStrings #-}
module PDF.Body (
populate
) where
2019-05-24 10:48:09 +02:00
import Control.Applicative ((<|>))
import Control.Monad.State (get, gets, modify)
import Data.Attoparsec.ByteString.Char8 (option)
2019-05-24 10:48:09 +02:00
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (cons, drop, unpack)
import Data.Map ((!))
import qualified Data.Map as Map (empty, insert, lookup)
2019-05-16 22:41:14 +02:00
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Object (
2019-05-18 09:01:13 +02:00
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
, Structure(..), XRefEntry(..), XRefSection
2019-05-16 22:41:14 +02:00
, blank, dictionary, directObject, integer, line
)
import PDF.Output (ObjectId(..), Offset(..))
import PDF.Parser (Parser, (<?>), block, char, evalParser, on, takeAll)
data UserState = UserState {
input :: ByteString
2019-05-18 09:01:13 +02:00
, xreferences :: XRefSection
, flow :: Flow
}
type SParser = Parser UserState
modifyFlow :: (Flow -> Flow) -> SParser ()
2019-05-24 10:48:09 +02:00
modifyFlow f = modify $ \state -> state {flow = f $ flow state}
addObject :: ObjectId -> Object -> SParser ()
addObject objectId newObject = modifyFlow $ \flow -> flow {
tmpObjects = Map.insert objectId newObject $ tmpObjects flow
}
pushOccurrence :: Occurrence -> SParser ()
pushOccurrence newOccurrence = modifyFlow $ \flow -> flow {
occurrencesStack = newOccurrence : (occurrencesStack flow)
}
comment :: Parser u String
comment = BS.unpack <$> (option "" afterPercent <* EOL.parser)
where
afterPercent = BS.cons <$> char '%' <*> takeAll (not . (`elem` EOL.charset))
lookupOffset :: ObjectId -> SParser Offset
lookupOffset objectId = do
2019-05-24 10:48:09 +02:00
table <- gets xreferences
case Map.lookup objectId table >>= entryOffset of
Nothing -> fail $
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
Just offset -> return offset
where
entryOffset (InUse {offset}) = Just offset
entryOffset _ = Nothing
loadNumber :: ObjectId -> SParser Double
loadNumber objectId = do
offset <- getOffset <$> lookupOffset objectId
2019-05-24 10:48:09 +02:00
objectStart <- BS.drop offset <$> gets input
indirectObjCoordinates `on` (objectStart :: ByteString) >> return ()
objectValue <- (!objectId) . tmpObjects <$> gets flow
case objectValue of
Direct (NumberObject (Number n)) -> return n
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
invalidValue :: Object -> String
invalidValue v = "Invalid value " ++ show v
getSize :: Maybe DirectObject -> SParser Double
getSize Nothing = fail "Missing '/Length' key on stream"
getSize (Just (NumberObject (Number size))) = return size
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
2019-05-24 10:48:09 +02:00
Flow {tmpObjects} <- gets flow
case Map.lookup objectId tmpObjects of
Nothing -> loadNumber objectId
Just (Direct (NumberObject (Number size))) -> return size
Just v -> fail $
invalidValue v ++ " for obj " ++ show objectId ++ "used as /Length"
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
streamObject :: SParser Object
2019-05-24 10:48:09 +02:00
streamObject = do
header <- dictionary <* blank
size <- getSize (Map.lookup (Name "Length") header)
2019-05-24 10:48:09 +02:00
streamContent <- stream (truncate size)
return $ Stream {header, streamContent}
where
2019-05-24 10:48:09 +02:00
stream size = line "stream" *> block size <* blank <* line "endstream"
object :: SParser Object
object = streamObject <|> Direct <$> directObject
indirectObjCoordinates :: SParser IndirectObjCoordinates
indirectObjCoordinates = do
objectId <- ObjectId <$> integer
coordinates <- IndirectObjCoordinates objectId <$> integer
objectValue <- line "obj" *> object <* blank <* line "endobj"
addObject objectId objectValue
return coordinates
occurrence :: SParser Occurrence
occurrence =
Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
2019-05-18 09:01:13 +02:00
populate :: ByteString -> InputStructure -> Content
populate input structure =
let bodyInput = BS.drop (startOffset structure) input in
case evalParser recurseOnOccurrences initialState bodyInput of
2019-05-18 09:01:13 +02:00
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
Right finalState ->
let Flow {occurrencesStack, tmpObjects} = flow finalState in
2019-05-18 09:01:13 +02:00
Content {
occurrences = reverse occurrencesStack, objects = tmpObjects, docStructure
}
where
2019-05-18 09:01:13 +02:00
docStructure = inputStructure structure
xreferences = xRef docStructure
initialState = UserState {
2019-05-18 09:01:13 +02:00
input, xreferences, flow = Flow {
occurrencesStack = [], tmpObjects = Map.empty
}
}
recurseOnOccurrences :: SParser UserState
recurseOnOccurrences =
2019-05-24 10:48:09 +02:00
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> get