Hufflepdf/src/PDF.hs

78 lines
2.7 KiB
Haskell

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NamedFieldPuns #-}
module PDF (
parseDocument
, Document(..)
, Content(..)
, DirectObject(..)
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (drop, isPrefixOf, last, length, unpack)
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
import Data.Int (Int64)
import qualified Data.Map as Map (lookup)
import PDF.Object (Content(..), DirectObject(..), EOLStyle(..), content, eol, eolCharset)
import PDF.Body (populate)
import Text.Parsec
import Text.Parsec.ByteString.Lazy (Parser)
import Text.Parsec.Pos (newPos)
import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
data Document = Document {
pdfVersion :: String
, contents :: [Content]
} deriving Show
magicNumber :: String
magicNumber = "%PDF-"
version :: Parser String
version = string magicNumber *> many (noneOf eolCharset)
eofMarker :: ByteString
eofMarker = "%%EOF"
parseError :: String -> Either ParseError a
parseError errorMessage =
Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0)
check :: Bool -> String -> Either ParseError ()
check test errorMessage = if test then return () else parseError errorMessage
readStartXref :: EOLStyle -> ByteString -> Either ParseError Int64
readStartXref eolStyle input =
check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
"Badly formed document : missing EOF marker at the end"
>> return (read . BS.unpack $ subBS startXrefPosition startXrefLength input)
where
(eolOffset, eolLastByte) = case eolStyle of
CRLF -> (2, '\n')
CR -> (1, '\r')
_ -> (1, '\n')
eofMarkerPosition =
BS.length input - BS.length eofMarker
- if BS.last input == BS.last eofMarker then 0 else eolOffset
startXrefPosition =
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
parseDocument :: ByteString -> Either ParseError Document
parseDocument input = do
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> eol) "" input
startXref <- readStartXref eolStyle input
structures <- iterateContents startXref input
let contents = populate input <$> structures
return $ Document {pdfVersion, contents}
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
iterateContents startXref input =
parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
where
stopOrFollow c@(Content {trailer}) =
case Map.lookup "Prev" trailer of
Nothing -> Right [c]
Just (Number f) -> (c:) <$> (iterateContents (truncate f) input)
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v