Hufflepdf/src/PDF.hs

109 lines
4.0 KiB
Haskell
Raw Normal View History

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NamedFieldPuns #-}
module PDF (
2019-05-16 22:41:14 +02:00
Document(..)
, Content(..)
, DirectObject(..)
2019-05-16 22:41:14 +02:00
, parseDocument
, render
) where
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Lazy.Char8 as BS (
drop, findIndex, head, isPrefixOf, last, length, span, unpack
)
import Data.ByteString.Lazy.Char8.Util (previous, subBS)
import Data.Int (Int64)
import qualified Data.Map as Map (lookup)
import PDF.Body (populate)
2019-05-16 22:41:14 +02:00
import qualified PDF.EOL as EOL (Style(..), charset, parser)
import PDF.Object (Content(..), DirectObject(..), content, eofMarker, magicNumber)
import qualified PDF.Output as Output (render, string)
import PDF.Output (Output(..), nextLine)
import Text.Parsec
import Text.Parsec.ByteString.Lazy (Parser)
import Text.Parsec.Pos (newPos)
import Text.ParserCombinators.Parsec.Error (Message(..), newErrorMessage)
2019-05-16 22:41:14 +02:00
import Text.Printf (printf)
data Document = Document {
pdfVersion :: String
2019-05-16 22:41:14 +02:00
, eolStyle :: EOL.Style
, contents :: [Content]
} deriving Show
2019-05-16 22:41:14 +02:00
instance Output Document where
output (Document {pdfVersion, contents}) =
Output.string (printf "%%PDF-%s" pdfVersion)
`nextLine` output contents
render :: Document -> ByteString
render document@(Document {eolStyle}) = Output.render eolStyle document
version :: Parser String
2019-05-16 22:41:14 +02:00
version = string magicNumber *> many (noneOf EOL.charset)
parseError :: String -> Either ParseError a
parseError errorMessage =
Left $ newErrorMessage (Message errorMessage) (newPos "" 0 0)
check :: Bool -> String -> Either ParseError ()
check test errorMessage = if test then return () else parseError errorMessage
2019-05-16 22:41:14 +02:00
readStartXref :: EOL.Style -> ByteString -> Either ParseError Int64
readStartXref eolStyle input =
check (eofMarker `BS.isPrefixOf` (BS.drop eofMarkerPosition input))
"Badly formed document : missing EOF marker at the end"
>> return (read . BS.unpack $ subBS startXrefPosition startXrefLength input)
where
(eolOffset, eolLastByte) = case eolStyle of
2019-05-16 22:41:14 +02:00
EOL.CRLF -> (2, '\n')
EOL.CR -> (1, '\r')
_ -> (1, '\n')
eofMarkerPosition =
BS.length input - BS.length eofMarker
- if BS.last input == BS.last eofMarker then 0 else eolOffset
startXrefPosition =
previous eolLastByte (eofMarkerPosition - eolOffset - 1) input + 1
startXrefLength = eofMarkerPosition - eolOffset - startXrefPosition
2019-05-16 22:41:14 +02:00
findNextLine :: ByteString -> Int64
findNextLine input =
let (line, eolPrefixed) = BS.span notInEol input in
let nextNotInEol = BS.findIndex notInEol eolPrefixed in
BS.length line + (maybe (BS.length eolPrefixed) id nextNotInEol)
where
2019-05-16 22:41:14 +02:00
notInEol = not . (`elem` EOL.charset)
2019-05-16 22:41:14 +02:00
findNextSection :: Int64 -> ByteString -> Int64
findNextSection offset input =
case BS.findIndex (== BS.head eofMarker) input of
Nothing -> 0
Just delta ->
let newInput = BS.drop delta input in
let newOffset = offset + delta in
if BS.isPrefixOf eofMarker newInput
2019-05-16 22:41:14 +02:00
then newOffset + findNextLine newInput
else findNextSection (newOffset + 1) (BS.drop 1 newInput)
iterateContents :: Int64 -> ByteString -> Either ParseError [Content]
iterateContents startXref input =
parse content ('@':show startXref) (BS.drop startXref input) >>= stopOrFollow
where
stopOrFollow c@(Content {trailer}) =
case Map.lookup "Prev" trailer of
2019-05-16 22:41:14 +02:00
Nothing -> Right [c {startOffset = findNextLine input}]
Just (Number newStartXref) ->
let offset = truncate newStartXref in
2019-05-16 22:41:14 +02:00
let startOffset = findNextSection offset (BS.drop offset input) in
(c {startOffset}:) <$> (iterateContents offset input)
Just v -> parseError $ "Bad value for Prev entry in trailer: " ++ show v
parseDocument :: ByteString -> Either ParseError Document
parseDocument input = do
2019-05-16 22:41:14 +02:00
(pdfVersion, eolStyle) <- parse ((,) <$> version <*> EOL.parser) "" input
startXref <- readStartXref eolStyle input
structures <- iterateContents startXref input
let contents = populate input <$> structures
2019-05-16 22:41:14 +02:00
return $ Document {pdfVersion, eolStyle, contents}