2019-05-14 18:42:11 +02:00
|
|
|
{-# LANGUAGE NamedFieldPuns #-}
|
2019-05-15 19:12:38 +02:00
|
|
|
module PDF.Body (
|
|
|
|
populate
|
|
|
|
) where
|
2019-05-14 18:42:11 +02:00
|
|
|
|
|
|
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
|
|
|
import qualified Data.ByteString.Lazy.Char8 as BS (drop, pack)
|
|
|
|
import Data.Int (Int64)
|
2019-05-15 15:03:55 +02:00
|
|
|
import Data.Map ((!))
|
2019-05-17 16:14:06 +02:00
|
|
|
import qualified Data.Map as Map (empty, insert, lookup)
|
2019-05-16 22:41:14 +02:00
|
|
|
import qualified PDF.EOL as EOL (charset, parser)
|
2019-05-14 18:42:11 +02:00
|
|
|
import PDF.Object (
|
2019-05-18 09:01:13 +02:00
|
|
|
Content(..), DirectObject(..), Flow(..), IndirectObjCoordinates(..)
|
|
|
|
, InputStructure(..), Name(..), Number(..), Object(..), Occurrence(..)
|
|
|
|
, Parser, Structure(..), XRefEntry(..), XRefSection, XRefSubSection(..)
|
2019-05-16 22:41:14 +02:00
|
|
|
, blank, dictionary, directObject, integer, line
|
2019-05-14 18:42:11 +02:00
|
|
|
)
|
|
|
|
import Text.Parsec
|
|
|
|
|
|
|
|
-- | State carried by the body parser while it walks through the document.
data UserState = UserState
  { input :: ByteString
    -- ^ The input given to 'populate', kept so that an object can be parsed
    -- directly at the offset found for it in the XRef table.
  , xreferences :: XRefSection
    -- ^ Cross-reference table used to turn an object id into an offset.
  , flow :: Flow
    -- ^ Occurrences and objects accumulated so far.
  }
|
|
|
|
|
2019-05-15 09:04:17 +02:00
|
|
|
-- | Parser whose user state is a 'UserState'.
type SParser = Parser UserState
|
2019-05-14 18:42:11 +02:00
|
|
|
|
2019-05-17 16:14:06 +02:00
|
|
|
-- | Apply a transformation to the 'Flow' stored in the parser's user state,
-- leaving the other fields of the state untouched.
modifyFlow :: (Flow -> Flow) -> SParser ()
modifyFlow update = modifyState applyToFlow
  where
    applyToFlow userState = userState {flow = update (flow userState)}
|
2019-05-14 18:42:11 +02:00
|
|
|
|
|
|
|
-- | Record an object under its id in the map of objects parsed so far.
-- Since this is a plain 'Map.insert', an object already stored under the
-- same id is replaced.
addObject :: Int -> Object -> SParser ()
addObject objectId newObject = modifyFlow register
  where
    register current =
      current {tmpObjects = Map.insert objectId newObject (tmpObjects current)}
|
|
|
|
|
|
|
|
-- | Push an occurrence on top of the stack of occurrences met so far.
-- The most recent occurrence therefore comes first in the stack.
pushOccurrence :: Occurrence -> SParser ()
pushOccurrence newOccurrence = modifyFlow stack
  where
    stack current =
      current {occurrencesStack = newOccurrence : occurrencesStack current}
|
|
|
|
|
2019-05-15 09:04:17 +02:00
|
|
|
-- | Parse a comment: a @%@ character, then every character that is not in
-- 'EOL.charset', then an end of line as recognised by 'EOL.parser'.
-- Only the text between the @%@ and the end of line is returned.
comment :: Parser u String
comment = do
  _ <- char '%'
  text <- many (noneOf EOL.charset)
  _ <- EOL.parser
  return text
|
2019-05-14 18:42:11 +02:00
|
|
|
|
|
|
|
-- | Find the offset of an object in a cross-reference section.
--
-- The subsections are visited in order; the first one whose id range
-- contains the requested id decides the result: the offset if the matching
-- entry is 'InUse', 'Nothing' otherwise (later subsections are not looked
-- at in that case). 'Nothing' is also returned when no subsection covers
-- the id.
lookupOffset :: Int -> XRefSection -> Maybe Int64
lookupOffset _ [] = Nothing
lookupOffset objectId (subSection:others)
  | inRange = inUseOffset (Map.lookup index entries)
  | otherwise = lookupOffset objectId others
  where
    XRefSubSection {firstObjectId, entriesNumber, entries} = subSection
    -- Position of the object relative to the start of this subsection
    index = objectId - firstObjectId
    inRange = 0 <= index && index < entriesNumber
    inUseOffset (Just (InUse {offset})) = Just offset
    inUseOffset _ = Nothing
|
|
|
|
|
|
|
|
-- | Look up the offset of an object in the XRef table held in the parser
-- state. The parser fails when 'lookupOffset' finds no offset for the id.
getOffset :: Int -> SParser Int64
getOffset objectId =
  getState >>= maybe missing return . lookupOffset objectId . xreferences
  where
    missing = fail $
      "obj " ++ show objectId ++ " is referenced but missing in XRef table"
|
|
|
|
|
2019-05-15 15:03:55 +02:00
|
|
|
-- | Run a parser against another input than the current one.
--
-- The current input is saved, replaced by the given one for the duration of
-- the parser, and put back once the parser has succeeded. The result is the
-- one of the parser.
on :: Monad m => ParsecT s u m a -> s -> ParsecT s u m a
on parser otherInput = do
  originalInput <- getInput
  setInput otherInput
  result <- parser
  setInput originalInput
  return result
|
|
|
|
|
2019-05-14 18:42:11 +02:00
|
|
|
-- | Load an object that has not been parsed yet and that is expected to be
-- a number.
--
-- The offset of the object is read from the XRef table, the indirect object
-- located there is parsed (which stores it in the objects map), then the
-- value stored under the requested id is read back.
--
-- The parser fails when the id is missing from the XRef table, when the
-- object found at that offset was stored under a different id, or when the
-- object is not a number.
loadNumber :: Int -> SParser Float
loadNumber objectId = do
  offset <- getOffset objectId
  objectStart <- BS.drop offset . input <$> getState
  _ <- indirectObjCoordinates `on` objectStart
  -- A total lookup: the object parsed at that offset registers itself under
  -- the id it declares, which may differ from the one asked for if the XRef
  -- table is wrong. That must be a parse failure, not a runtime exception.
  parsed <- Map.lookup objectId . tmpObjects . flow <$> getState
  case parsed of
    Just (Direct (NumberObject (Number n))) -> return n
    Just obj -> fail $
      "Expected obj@" ++ show offset ++ " (" ++ show obj ++ ") to be a Number"
    Nothing -> fail $
      "obj@" ++ show offset ++ " is not obj " ++ show objectId
        ++ " as announced by the XRef table"
|
2019-05-14 18:42:11 +02:00
|
|
|
|
|
|
|
-- | Start of an error message about an object that has an unexpected value.
invalidValue :: Object -> String
invalidValue = ("Invalid value " ++) . show
|
|
|
|
|
|
|
|
-- | Compute the size of a stream from the value found under its @Length@
-- key (the result of looking the key up in the stream's dictionary).
--
-- * no value: the parser fails;
-- * a number: it is the size;
-- * a reference: the size is the number stored under the referenced id in
--   the objects parsed so far, or, if the object has not been parsed yet,
--   the number loaded by 'loadNumber';
-- * anything else (including a reference to something that is not a
--   number): the parser fails.
getSize :: Maybe DirectObject -> SParser Float
getSize Nothing = fail "Missing '/Length' key on stream"
getSize (Just (NumberObject (Number size))) = return size
getSize (Just (Reference (IndirectObjCoordinates {objectId}))) = do
  Flow {tmpObjects} <- flow <$> getState
  case Map.lookup objectId tmpObjects of
    -- Not met yet: go and parse it where the XRef table says it is
    Nothing -> loadNumber objectId
    Just (Direct (NumberObject (Number size))) -> return size
    Just v -> fail $
      invalidValue v ++ " for obj " ++ show objectId ++ " used as /Length"
getSize (Just v) = fail $ invalidValue (Direct v) ++ " for /Length"
|
|
|
|
|
2019-05-15 15:03:55 +02:00
|
|
|
-- | Parse a stream object: a dictionary followed by the stream's content
-- enclosed between a @stream@ line and an @endstream@ line.
--
-- The number of characters read as content is the size computed by
-- 'getSize' from the @Length@ key of the dictionary, truncated to an
-- integer. The whole parser is wrapped in 'try', so a failure consumes no
-- input.
streamObject :: SParser Object
streamObject = try $ do
  header <- dictionary
  _ <- blank
  size <- getSize $ Map.lookup (Name "Length") header
  rawContent <- rawStream (truncate size)
  return $ Stream {header, streamContent = BS.pack rawContent}
  where
    rawStream size =
      between (line "stream") (blank *> line "endstream") (count size anyChar)
|
|
|
|
|
|
|
|
-- | Parse an object: a stream if possible, otherwise a direct object
-- wrapped in 'Direct'.
object :: SParser Object
object = streamObject <|> fmap Direct directObject
|
|
|
|
|
|
|
|
-- | Parse an indirect object definition: two integers, an @obj@ line, the
-- object itself and an @endobj@ line.
--
-- The object is stored under its id (the first integer) in the objects map
-- as a side effect; the value returned is only the pair of integers, as an
-- 'IndirectObjCoordinates'.
indirectObjCoordinates :: SParser IndirectObjCoordinates
indirectObjCoordinates = do
  objectId <- integer
  -- Second coordinate; presumably the generation number of the PDF
  -- specification, the field's name is not visible from this module.
  secondCoordinate <- integer
  objectValue <- between (line "obj") (blank *> line "endobj") object
  addObject objectId objectValue
  return $ IndirectObjCoordinates objectId secondCoordinate
|
2019-05-14 18:42:11 +02:00
|
|
|
|
|
|
|
-- | Parse one element of the body: either a comment or an indirect object
-- definition (of which only the coordinates are kept in the occurrence).
occurrence :: SParser Occurrence
occurrence = fmap Comment comment <|> fmap Indirect indirectObjCoordinates
|
2019-05-14 18:42:11 +02:00
|
|
|
|
2019-05-18 09:01:13 +02:00
|
|
|
-- | Parse the body of a document and gather what was found in a 'Content'.
--
-- Parsing starts at the offset given by the input structure and reads
-- occurrences (comments and indirect objects) for as long as it can. The
-- resulting 'Content' lists the occurrences in the order they were met,
-- along with the map of all the objects parsed.
--
-- If the parser fails, the error is discarded and a 'Content' with no
-- occurrences and no objects is returned.
populate :: ByteString -> InputStructure -> Content
populate input structure =
  either (const unparsed) (fromFlow . flow) parsingResult
  where
    docStructure = inputStructure structure
    bodyInput = BS.drop (startOffset structure) input
    initialState = UserState
      { input
      , xreferences = xrefSection docStructure
      , flow = Flow {occurrencesStack = [], tmpObjects = Map.empty}
      }
    parsingResult = runParser recurseOnOccurrences initialState "" bodyInput
    -- What is returned when the body could not be parsed
    unparsed = Content {occurrences = [], objects = Map.empty, docStructure}
    -- Occurrences were pushed on a stack, hence the 'reverse' to get them
    -- back in the order of the document
    fromFlow (Flow {occurrencesStack, tmpObjects}) = Content
      { occurrences = reverse occurrencesStack
      , objects = tmpObjects
      , docStructure
      }

    -- Read occurrences one after the other, pushing each of them on the
    -- stack, and return the final state once no more can be read.
    recurseOnOccurrences :: SParser UserState
    recurseOnOccurrences = readNext <|> getState
      where
        readNext = do
          occurrence >>= pushOccurrence
          recurseOnOccurrences
|