{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE NamedFieldPuns #-} module PDF ( parseDocument , Document(..) , Content(..) , DirectObject(..) ) where import Data.ByteString.Lazy.Char8 (ByteString) import qualified Data.ByteString.Lazy.Char8 as BS ( drop, findIndex, head, isPrefixOf, last, length, span, unpack ) import Data.ByteString.Lazy.Char8.Util (previous, subBS) import Data.Int (Int64) import qualified Data.Map as Map (lookup) import PDF.Object (Content(..), DirectObject(..), EOLStyle(..), content, eol, eolCharset) import PDF.Body (populate) import Text.Parsec import Text.Parsec.ByteString.Lazy (Parser) import Text.Parsec.Pos (newPos) import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage) data Document = Document { pdfVersion :: String , contents :: [Content] } deriving Show version :: Parser String version = string magicNumber *> many (noneOf eolCharset) parseError :: String -> Either ParseError a parseError errorMessage = Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0) check :: Bool -> String -> Either ParseError () check test errorMessage = if test then return () else parseError errorMessage readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64 readStartXref eolStyle input = check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input)) "Badly formed document : missing EOF marker at the end" >> return (read . BS.unpack $ subBS startXrefPosition startXrefLength input) where (eolOffset, eolLastByte) = case eolStyle of CRLF -> (2, '\n') CR -> (1, '\r') _ -> (1, '\n') eofMarkerPosition = BS.length input - BS.length eofMarker - if BS.last input == BS.last eofMarker then 0 else eolOffset startXrefPosition = previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1 startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition nextLine :: ByteString -> Int64 nextLine input = let (line, eolPrefixed) = BS.span notInEol input in let nextNotInEol = BS.findIndex notInEol eolPrefixed in BS.length line + (maybe (BS.length eolPrefixed) id nextNotInEol) where notInEol = not . (`elem` eolCharset) nextSection :: Int64 -> ByteString -> Int64 nextSection offset input = case BS.findIndex (== BS.head eofMarker) input of Nothing -> 0 Just delta -> let newInput = BS.drop delta input in let newOffset = offset + delta in if BS.isPrefixOf eofMarker newInput then newOffset + nextLine newInput else nextSection (newOffset + 1) (BS.drop 1 newInput) iterateContents :: Int64 -> ByteString -> Either ParseError [Content] iterateContents startXref input = parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow where stopOrFollow c@(Content {trailer}) = case Map.lookup "Prev" trailer of Nothing -> Right [c {startOffset = nextLine input}] Just (Number newStartXref) -> let offset = truncate newStartXref in let startOffset = nextSection offset (BS.drop offset input) in (c {startOffset}:) <$> (iterateContents offset input) Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v parseDocument :: ByteString -> Either ParseError Document parseDocument input = do (pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input startXref <- readStartXref eolStyle input structures <- iterateContents startXref input let contents = populate input <$> structures return $ Document {pdfVersion, contents}