WIP: still trying to figure things out, moved to a separate submodule for Navigation, proper naming is hell

This commit is contained in:
Tissevert 2020-02-11 08:29:08 +01:00
parent e77bbbcda9
commit aed7af376a
5 changed files with 88 additions and 64 deletions

View File

@ -21,6 +21,7 @@ library
, PDF.Content
, PDF.EOL
, PDF.Object
, PDF.Object.Navigation
, PDF.Output
, PDF.Parser
, PDF.Pages

View File

@ -12,6 +12,7 @@ module PDF.CMap (
) where
import Control.Applicative ((<|>), many)
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.State (modify)
import Data.Attoparsec.ByteString.Char8 (count)
import Data.ByteString (ByteString)
@ -32,6 +33,7 @@ import PDF.Object (
, blank, directObject, integer, line, stringObject
)
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
import Prelude hiding (fail)
type CMappers = Map Name CMap
type Mapping = Map ByteString Text
@ -70,8 +72,8 @@ matches :: ByteString -> CRange -> Bool
matches code (CRange {fromSequence, toSequence}) =
fromSequence <= code && code <= toSequence
cMap :: ByteString -> Either String Font
cMap = fmap (toFont . snd) <$> runParser
cMap :: MonadFail m => ByteString -> m Font
cMap = either fail (return . toFont . snd) . runParser
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
emptyCMap
where

View File

@ -1,8 +1,6 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.Object (
Content(..)
, Dictionary
@ -19,7 +17,6 @@ module PDF.Object (
, Structure(..)
, XRefEntry(..)
, XRefSection
, (//)
, array
, blank
, dictionary
@ -37,8 +34,6 @@ module PDF.Object (
) where
import Control.Applicative ((<|>), many)
import Control.Monad.Reader (MonadReader(..))
import Control.Monad.Fail (MonadFail(..))
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS (concat)
@ -56,7 +51,6 @@ import PDF.Output (
, saveOffset
)
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
import Prelude hiding (fail)
import Text.Printf (printf)
line :: MonadParser m => String -> m ()
@ -168,10 +162,10 @@ array =
type Dictionary = Map Name DirectObject
instance Output Dictionary where
output dict =
output aDictionary =
"<<" `mappend` keyValues `mappend` ">>"
where
keyValues = join " " $ outputKeyVal <$> Map.toList dict
keyValues = join " " $ outputKeyVal <$> Map.toList aDictionary
outputKeyVal :: (Name, DirectObject) -> OBuilder
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
@ -410,36 +404,3 @@ instance Output Content where
]
where
Structure {xRef, trailer} = docStructure
type PDFContent m = (MonadReader Content m, MonadFail m)
--
-- Navigation
--
key :: MonadFail m => String -> Dictionary -> m DirectObject
key keyName dictionary =
maybe (fail errorMessage) return (Map.lookup (Name keyName) dictionary)
where
errorMessage =
printf "Key %s not found in dictionary %s" keyName (show dictionary)
dict :: PDFContent m => Object -> m Dictionary
dict (Direct (Dictionary aDict)) = return aDict
dict obj = fail $ "Not a dictionary : " ++ show obj
getObject :: PDFContent m => ObjectId -> m Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
getResource :: PDFContent m => DirectObject -> m Dictionary
getResource (Dictionary aDict) = return aDict
getResource (Reference (IndirectObjCoordinates {objectId})) =
getObject objectId >>= dict
getResource directObject =
fail $ "Not a resource (dictionary or reference) : " ++ show directObject
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
(//) aDict [] = return $ Dictionary aDict
(//) aDict [field] = key field aDict
(//) aDict (field:fields) = key field aDict >>= getResource >>= (// fields)

View File

@ -0,0 +1,70 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
module PDF.Object.Navigation (
(//)
, getDictionary
, getField
, getObject
, openStream
, origin
) where
import Codec.Compression.Zlib (decompress)
import Control.Monad.Reader (MonadReader(..))
import Control.Monad.Fail (MonadFail(..))
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map ((!))
import qualified Data.Map as Map (lookup)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..), Object(..), Structure(..)
)
import PDF.Output (ObjectId)
import Prelude hiding (fail)
import Text.Printf (printf)
type PDFContent m = (MonadReader Content m, MonadFail m)
expected :: (PDFContent m, Show a) => String -> a -> m b
expected name = fail . printf "Not a %s: %s" name . show
getField :: MonadFail m => String -> Dictionary -> m DirectObject
getField key aDictionary =
maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary)
where
errorMessage =
printf "Key %s not found in dictionary %s" key (show aDictionary)
castDictionary :: PDFContent m => Object -> m Dictionary
castDictionary (Direct (Dictionary aDict)) = return aDict
castDictionary obj = expected "dictionary : " obj
getObject :: PDFContent m => ObjectId -> m Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
getDictionary :: PDFContent m => DirectObject -> m Dictionary
getDictionary (Dictionary aDictionary) = return aDictionary
getDictionary (Reference (IndirectObjCoordinates {objectId})) =
getObject objectId >>= castDictionary
getDictionary aDirectObject =
expected "resource (dictionary or reference)" aDirectObject
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
(//) aDict [] = return $ Dictionary aDict
(//) aDict [key] = getField key aDict
(//) aDict (key:keys) = getField key aDict >>= getDictionary >>= (// keys)
origin :: PDFContent m => m Dictionary
origin = trailer . docStructure <$> ask
openStream :: PDFContent m => Object -> m ByteString
openStream (Stream {header, streamContent}) = return $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent
openStream obj = expected "stream" obj

View File

@ -5,13 +5,13 @@ module PDF.Pages (
, getAll
) where
import Codec.Compression.Zlib (decompress)
import Control.Applicative ((<|>))
import Control.Monad (foldM)
import Control.Monad.Except (Except, runExcept)
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
@ -24,11 +24,13 @@ import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Object.Navigation ((//), getDictionary, origin, stream)
import PDF.Output (ObjectId(..))
import Prelude hiding (fail)
import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type T = RWST Content () CachedFonts (Either String)
type T = RWST Content () CachedFonts (Except String)
data Page = Page {
contents :: [Text]
, source :: ObjectId
@ -42,17 +44,6 @@ infixl 1 //=
(//=) :: Either String a -> (a -> T b) -> T b
x //= f = lift x >>= f
expected :: Show a => String -> a -> Either String b
expected name = Left . printf "Not a %s: %s" name . show
stream :: Object -> Either String ByteString
stream (Stream {header, streamContent}) = Right $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent
stream obj = expected "stream" obj
getResource :: DirectObject -> T Dictionary
getResource (Dictionary dictionary) = return dictionary
getResource (Reference (IndirectObjCoordinates {objectId})) =
@ -81,10 +72,10 @@ loadFont objectId = getObject objectId \\= dict >>= tryMappings
tryMappings dictionary =
loadCMap dictionary
<|> lift (key "Encoding" dictionary >>= loadEncoding)
<|> lift (Left $ unknownFormat (show objectId) (show dictionary))
<|> (fail $ unknownFormat (show objectId) (show dictionary))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadCMap dictionary =
key "ToUnicode" dictionary //= follow \\= stream \\= cMap
getField "ToUnicode" dictionary >>= follow \\= stream >>= cMap
loadEncoding (NameObject (Name name)) = encoding name
loadEncoding directObject =
Left . printf "Encoding must be a name, not that : %s" $ show directObject
@ -129,8 +120,7 @@ dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
pagesList :: T [ObjectId]
pagesList = do
root <- dictObject "Root" . trailer . docStructure =<< ask
pages <- dictObject "Pages" root
pages <- origin >>= (// ["Root", "Pages"]) >>= getDictionary
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids
_ -> return []
@ -158,7 +148,7 @@ loadPage source =
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
getAll :: Content -> Either String (Map Int Page)
getAll content = fst <$> evalRWST getPages content Map.empty
getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty
where
numbered = Map.fromList . zip [1..]
getPages = numbered <$> (mapM loadPage =<< pagesList)
@ -166,8 +156,8 @@ getAll content = fst <$> evalRWST getPages content Map.empty
get :: Content -> Int -> Either String Page
get content pageNumber
| pageNumber < 1 = Left "Pages start at 1"
| otherwise = fst <$> evalRWST getPage content Map.empty
| otherwise = runExcept $ fst <$> evalRWST getPage content Map.empty
where
firstPage [] = lift $ Left "Page is out of bounds"
firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage