2019-05-13 08:05:28 +02:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
{-# LANGUAGE NamedFieldPuns #-}
|
|
|
|
module PDF (
|
2019-05-13 18:22:05 +02:00
|
|
|
parseDocument
|
|
|
|
, Document(..)
|
|
|
|
, Content(..)
|
|
|
|
, DirectObject(..)
|
2019-05-13 08:05:28 +02:00
|
|
|
) where
|
|
|
|
|
|
|
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
2019-05-13 18:22:05 +02:00
|
|
|
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, unpack)
|
2019-05-13 08:05:28 +02:00
|
|
|
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
|
|
|
|
import Data.Int (Int64)
|
2019-05-13 18:22:05 +02:00
|
|
|
import qualified Data.Map as Map (lookup)
|
|
|
|
import PDF.Object (Content(..), DirectObject(..), EOLStyle(..), content, eol, eolCharset)
|
2019-05-13 08:05:28 +02:00
|
|
|
import Text.Parsec
|
|
|
|
import Text.Parsec.ByteString.Lazy (Parser)
|
|
|
|
import Text.Parsec.Pos (newPos)
|
|
|
|
import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
|
|
|
|
|
|
|
|
data Document = Document {
|
|
|
|
pdfVersion :: String
|
2019-05-13 18:22:05 +02:00
|
|
|
, contents :: [Content]
|
2019-05-13 08:05:28 +02:00
|
|
|
} deriving Show
|
|
|
|
|
2019-05-13 18:22:05 +02:00
|
|
|
magicNumber :: String
|
|
|
|
magicNumber = "%PDF-"
|
2019-05-13 08:05:28 +02:00
|
|
|
|
|
|
|
version :: Parser String
|
|
|
|
version = string magicNumber *> many (noneOf eolCharset)
|
|
|
|
|
|
|
|
eofMarker :: ByteString
|
|
|
|
eofMarker = "%%EOF"
|
|
|
|
|
2019-05-13 18:22:05 +02:00
|
|
|
parseError :: String -> Either ParseError a
|
|
|
|
parseError errorMessage =
|
|
|
|
Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0)
|
|
|
|
|
2019-05-13 08:05:28 +02:00
|
|
|
check :: Bool -> String -> Either ParseError ()
|
2019-05-13 18:22:05 +02:00
|
|
|
check test errorMessage = if test then return () else parseError errorMessage
|
2019-05-13 08:05:28 +02:00
|
|
|
|
|
|
|
readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64
|
|
|
|
readStartXref eolStyle input =
|
|
|
|
check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
|
|
|
|
"Badly formed document : missing EOF marker at the end"
|
|
|
|
>> return (read . BS.unpack $ subBS startXrefPosition startXrefLength input)
|
|
|
|
where
|
|
|
|
(eolOffset, eolLastByte) = case eolStyle of
|
|
|
|
CRLF -> (2, '\n')
|
|
|
|
CR -> (1, '\r')
|
|
|
|
_ -> (1, '\n')
|
|
|
|
eofMarkerPosition =
|
|
|
|
BS.length input - BS.length eofMarker
|
|
|
|
- if BS.last input == BS.last eofMarker then 0 else eolOffset
|
|
|
|
startXrefPosition =
|
2019-05-13 11:34:15 +02:00
|
|
|
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
|
2019-05-13 08:05:28 +02:00
|
|
|
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
|
|
|
|
|
|
|
|
parseDocument :: ByteString -> Either ParseError Document
|
|
|
|
parseDocument input = do
|
|
|
|
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
|
|
|
|
startXref <- readStartXref eolStyle input
|
2019-05-13 18:22:05 +02:00
|
|
|
contents <- iterateContents startXref input
|
|
|
|
return $ Document {pdfVersion, contents}
|
|
|
|
|
|
|
|
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
|
|
|
|
iterateContents startXref input =
|
|
|
|
parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
|
|
|
|
where
|
|
|
|
stopOrFollow c@(Content {trailer}) =
|
|
|
|
case Map.lookup "Prev" trailer of
|
|
|
|
Nothing -> Right [c]
|
|
|
|
Just (Number f) -> (c:) <$> (iterateContents (truncate f) input)
|
|
|
|
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
|
2019-05-13 08:05:28 +02:00
|
|
|
|