Hufflepdf/src/PDF/Body.hs

138 lines
4.8 KiB
Haskell
Raw Normal View History

{-# LANGUAGE NamedFieldPuns #-}
2019-05-24 10:48:09 +02:00
{-# LANGUAGE OverloadedStrings #-}
module PDF.Body (
populate
) where
2019-05-24 10:48:09 +02:00
import Control.Applicative ((<|>))
import Control.Monad.State (get, gets, modify)
import Data.ByteString.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (drop, unpack)
import Data.Map ((!))
import qualified Data.Map as Map (empty, insert, lookup)
2019-05-16 22:41:14 +02:00
import qualified PDF.EOL as EOL (charset, parser)
import PDF.Object (
2019-05-18 09:01:13 +02:00
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
2019-05-24 10:48:09 +02:00
, Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..)
2019-05-16 22:41:14 +02:00
, blank, dictionary, directObject, integer, line
)
import PDF.Parser (Parser, (<?>), block, char, on, option, runParser, takeAll)
data UserState = UserState {
input :: ByteString
2019-05-18 09:01:13 +02:00
, xreferences :: XRefSection
, flow :: Flow
}
type SParser = Parser UserState
modifyFlow :: (Flow -> Flow) -> SParser ()
2019-05-24 10:48:09 +02:00
modifyFlow f = modify $ \state -> state {flow = f $ flow state}
addObject :: Int -> Object -> SParser ()
addObject objectId newObject = modifyFlow $ \flow -> flow {
tmpObjects = Map.insert objectId newObject $ tmpObjects flow
}
pushOccurrence :: Occurrence -> SParser ()
pushOccurrence newOccurrence = modifyFlow $ \flow -> flow {
occurrencesStack = newOccurrence : (occurrencesStack flow)
}
comment :: Parser u String
2019-05-24 10:48:09 +02:00
comment = BS.unpack <$>
(char '%' *> takeAll (not . (`elem` EOL.charset)) <* EOL.parser)
2019-05-24 10:48:09 +02:00
lookupOffset :: Int -> XRefSection -> Maybe Int
lookupOffset _ [] = Nothing
lookupOffset objectId (xrefSubSection:others) =
let XRefSubSection {firstObjectId, entriesNumber, entries} = xrefSubSection in
let index = objectId - firstObjectId in
if index >= 0 && index < entriesNumber
then
case Map.lookup index entries of
Just (InUse {offset}) -> Just offset
_ -> Nothing
else lookupOffset objectId others
2019-05-24 10:48:09 +02:00
getOffset :: Int -> SParser Int
getOffset objectId = do
2019-05-24 10:48:09 +02:00
table <- gets xreferences
2019-05-18 09:01:13 +02:00
case lookupOffset objectId table of
Nothing -> fail $
"obj " ++ show objectId ++ " is referenced but missing in XRef table"
Just offset -> return offset
loadNumber :: Int -> SParser Float
loadNumber objectId = do
offset <- getOffset objectId
2019-05-24 10:48:09 +02:00
objectStart <- BS.drop offset <$> gets input
indirectObjCoordinates `on` (objectStart :: ByteString) >> return ()
objectValue <- (!objectId) . tmpObjects <$> gets flow
case objectValue of
Direct (NumberObject (Number n)) -> return n
obj -> fail $ "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
invalidValue :: Object -> String
invalidValue v = "Invalid value " ++ show v
getSize :: Maybe DirectObject -> SParser Float
getSize Nothing = fail "Missing '/Length' key on stream"
getSize (Just (NumberObject (Number size))) = return size
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
2019-05-24 10:48:09 +02:00
Flow {tmpObjects} <- gets flow
case Map.lookup objectId tmpObjects of
Nothing -> loadNumber objectId
Just (Direct (NumberObject (Number size))) -> return size
Just v -> fail $
invalidValue v ++ " for obj " ++ show objectId ++ "used as /Length"
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
streamObject :: SParser Object
2019-05-24 10:48:09 +02:00
streamObject = do
header <- dictionary <* blank
size <- getSize (Map.lookup (Name "Length") header)
2019-05-24 10:48:09 +02:00
streamContent <- stream (truncate size)
return $ Stream {header, streamContent}
where
2019-05-24 10:48:09 +02:00
stream size = line "stream" *> block size <* blank <* line "endstream"
object :: SParser Object
object = streamObject <|> Direct <$> directObject
indirectObjCoordinates :: SParser IndirectObjCoordinates
indirectObjCoordinates = do
objectId <- integer
coordinates <- IndirectObjCoordinates objectId <$> integer
objectValue <- line "obj" *> object <* blank <* line "endobj"
addObject objectId objectValue
return coordinates
occurrence :: SParser Occurrence
occurrence =
Comment <$> comment <|> Indirect <$> indirectObjCoordinates <?> "comment or object"
2019-05-18 09:01:13 +02:00
populate :: ByteString -> InputStructure -> Content
populate input structure =
let bodyInput = BS.drop (startOffset structure) input in
2019-05-24 10:48:09 +02:00
case runParser recurseOnOccurrences initialState bodyInput of
2019-05-18 09:01:13 +02:00
Left _ -> Content {occurrences = [], objects = Map.empty, docStructure}
Right finalState ->
let Flow {occurrencesStack, tmpObjects} = flow finalState in
2019-05-18 09:01:13 +02:00
Content {
occurrences = reverse occurrencesStack, objects = tmpObjects, docStructure
}
where
2019-05-18 09:01:13 +02:00
docStructure = inputStructure structure
xreferences = xrefSection docStructure
initialState = UserState {
2019-05-18 09:01:13 +02:00
input, xreferences, flow = Flow {
occurrencesStack = [], tmpObjects = Map.empty
}
}
recurseOnOccurrences :: SParser UserState
recurseOnOccurrences =
2019-05-24 10:48:09 +02:00
(occurrence >>= pushOccurrence >> recurseOnOccurrences) <|> get