WIP: still trying to figure things out, moved to a separate submodule for Navigation, proper naming is hell
This commit is contained in:
parent
e77bbbcda9
commit
aed7af376a
|
@ -21,6 +21,7 @@ library
|
|||
, PDF.Content
|
||||
, PDF.EOL
|
||||
, PDF.Object
|
||||
, PDF.Object.Navigation
|
||||
, PDF.Output
|
||||
, PDF.Parser
|
||||
, PDF.Pages
|
||||
|
|
|
@ -12,6 +12,7 @@ module PDF.CMap (
|
|||
) where
|
||||
|
||||
import Control.Applicative ((<|>), many)
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.State (modify)
|
||||
import Data.Attoparsec.ByteString.Char8 (count)
|
||||
import Data.ByteString (ByteString)
|
||||
|
@ -32,6 +33,7 @@ import PDF.Object (
|
|||
, blank, directObject, integer, line, stringObject
|
||||
)
|
||||
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
||||
import Prelude hiding (fail)
|
||||
|
||||
type CMappers = Map Name CMap
|
||||
type Mapping = Map ByteString Text
|
||||
|
@ -70,8 +72,8 @@ matches :: ByteString -> CRange -> Bool
|
|||
matches code (CRange {fromSequence, toSequence}) =
|
||||
fromSequence <= code && code <= toSequence
|
||||
|
||||
cMap :: ByteString -> Either String Font
|
||||
cMap = fmap (toFont . snd) <$> runParser
|
||||
cMap :: MonadFail m => ByteString -> m Font
|
||||
cMap = either fail (return . toFont . snd) . runParser
|
||||
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
||||
emptyCMap
|
||||
where
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
{-# LANGUAGE ConstraintKinds #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
module PDF.Object (
|
||||
Content(..)
|
||||
, Dictionary
|
||||
|
@ -19,7 +17,6 @@ module PDF.Object (
|
|||
, Structure(..)
|
||||
, XRefEntry(..)
|
||||
, XRefSection
|
||||
, (//)
|
||||
, array
|
||||
, blank
|
||||
, dictionary
|
||||
|
@ -37,8 +34,6 @@ module PDF.Object (
|
|||
) where
|
||||
|
||||
import Control.Applicative ((<|>), many)
|
||||
import Control.Monad.Reader (MonadReader(..))
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS (concat)
|
||||
|
@ -56,7 +51,6 @@ import PDF.Output (
|
|||
, saveOffset
|
||||
)
|
||||
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
|
||||
import Prelude hiding (fail)
|
||||
import Text.Printf (printf)
|
||||
|
||||
line :: MonadParser m => String -> m ()
|
||||
|
@ -168,10 +162,10 @@ array =
|
|||
type Dictionary = Map Name DirectObject
|
||||
|
||||
instance Output Dictionary where
|
||||
output dict =
|
||||
output aDictionary =
|
||||
"<<" `mappend` keyValues `mappend` ">>"
|
||||
where
|
||||
keyValues = join " " $ outputKeyVal <$> Map.toList dict
|
||||
keyValues = join " " $ outputKeyVal <$> Map.toList aDictionary
|
||||
outputKeyVal :: (Name, DirectObject) -> OBuilder
|
||||
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
|
||||
|
||||
|
@ -410,36 +404,3 @@ instance Output Content where
|
|||
]
|
||||
where
|
||||
Structure {xRef, trailer} = docStructure
|
||||
|
||||
type PDFContent m = (MonadReader Content m, MonadFail m)
|
||||
|
||||
--
|
||||
-- Navigation
|
||||
--
|
||||
key :: MonadFail m => String -> Dictionary -> m DirectObject
|
||||
key keyName dictionary =
|
||||
maybe (fail errorMessage) return (Map.lookup (Name keyName) dictionary)
|
||||
where
|
||||
errorMessage =
|
||||
printf "Key %s not found in dictionary %s" keyName (show dictionary)
|
||||
|
||||
dict :: PDFContent m => Object -> m Dictionary
|
||||
dict (Direct (Dictionary aDict)) = return aDict
|
||||
dict obj = fail $ "Not a dictionary : " ++ show obj
|
||||
|
||||
getObject :: PDFContent m => ObjectId -> m Object
|
||||
getObject objectId = do
|
||||
content <- ask
|
||||
return (objects content ! objectId)
|
||||
|
||||
getResource :: PDFContent m => DirectObject -> m Dictionary
|
||||
getResource (Dictionary aDict) = return aDict
|
||||
getResource (Reference (IndirectObjCoordinates {objectId})) =
|
||||
getObject objectId >>= dict
|
||||
getResource directObject =
|
||||
fail $ "Not a resource (dictionary or reference) : " ++ show directObject
|
||||
|
||||
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
|
||||
(//) aDict [] = return $ Dictionary aDict
|
||||
(//) aDict [field] = key field aDict
|
||||
(//) aDict (field:fields) = key field aDict >>= getResource >>= (// fields)
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE ConstraintKinds #-}
|
||||
{-# LANGUAGE FlexibleContexts #-}
|
||||
module PDF.Object.Navigation (
|
||||
(//)
|
||||
, getDictionary
|
||||
, getField
|
||||
, getObject
|
||||
, openStream
|
||||
, origin
|
||||
) where
|
||||
|
||||
import Codec.Compression.Zlib (decompress)
|
||||
import Control.Monad.Reader (MonadReader(..))
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||
import Data.Map ((!))
|
||||
import qualified Data.Map as Map (lookup)
|
||||
import PDF.Object (
|
||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Name(..), Object(..), Structure(..)
|
||||
)
|
||||
import PDF.Output (ObjectId)
|
||||
import Prelude hiding (fail)
|
||||
import Text.Printf (printf)
|
||||
|
||||
type PDFContent m = (MonadReader Content m, MonadFail m)
|
||||
|
||||
expected :: (PDFContent m, Show a) => String -> a -> m b
|
||||
expected name = fail . printf "Not a %s: %s" name . show
|
||||
|
||||
getField :: MonadFail m => String -> Dictionary -> m DirectObject
|
||||
getField key aDictionary =
|
||||
maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary)
|
||||
where
|
||||
errorMessage =
|
||||
printf "Key %s not found in dictionary %s" key (show aDictionary)
|
||||
|
||||
castDictionary :: PDFContent m => Object -> m Dictionary
|
||||
castDictionary (Direct (Dictionary aDict)) = return aDict
|
||||
castDictionary obj = expected "dictionary : " obj
|
||||
|
||||
getObject :: PDFContent m => ObjectId -> m Object
|
||||
getObject objectId = do
|
||||
content <- ask
|
||||
return (objects content ! objectId)
|
||||
|
||||
getDictionary :: PDFContent m => DirectObject -> m Dictionary
|
||||
getDictionary (Dictionary aDictionary) = return aDictionary
|
||||
getDictionary (Reference (IndirectObjCoordinates {objectId})) =
|
||||
getObject objectId >>= castDictionary
|
||||
getDictionary aDirectObject =
|
||||
expected "resource (dictionary or reference)" aDirectObject
|
||||
|
||||
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
|
||||
(//) aDict [] = return $ Dictionary aDict
|
||||
(//) aDict [key] = getField key aDict
|
||||
(//) aDict (key:keys) = getField key aDict >>= getDictionary >>= (// keys)
|
||||
|
||||
origin :: PDFContent m => m Dictionary
|
||||
origin = trailer . docStructure <$> ask
|
||||
|
||||
openStream :: PDFContent m => Object -> m ByteString
|
||||
openStream (Stream {header, streamContent}) = return $
|
||||
case Map.lookup (Name "Filter") header of
|
||||
Just (NameObject (Name "FlateDecode")) ->
|
||||
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
||||
_ -> streamContent
|
||||
openStream obj = expected "stream" obj
|
|
@ -5,13 +5,13 @@ module PDF.Pages (
|
|||
, getAll
|
||||
) where
|
||||
|
||||
import Codec.Compression.Zlib (decompress)
|
||||
import Control.Applicative ((<|>))
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.Except (Except, runExcept)
|
||||
import Control.Monad.Fail (MonadFail(..))
|
||||
import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
|
||||
import qualified Control.Monad.RWS as RWS (get)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||
import Data.Text (Text)
|
||||
|
@ -24,11 +24,13 @@ import PDF.Object (
|
|||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Object(..), Name(..), Structure(..)
|
||||
,)
|
||||
import PDF.Object.Navigation ((//), getDictionary, origin, stream)
|
||||
import PDF.Output (ObjectId(..))
|
||||
import Prelude hiding (fail)
|
||||
import Text.Printf (printf)
|
||||
|
||||
type CachedFonts = Map ObjectId Font
|
||||
type T = RWST Content () CachedFonts (Either String)
|
||||
type T = RWST Content () CachedFonts (Except String)
|
||||
data Page = Page {
|
||||
contents :: [Text]
|
||||
, source :: ObjectId
|
||||
|
@ -42,17 +44,6 @@ infixl 1 //=
|
|||
(//=) :: Either String a -> (a -> T b) -> T b
|
||||
x //= f = lift x >>= f
|
||||
|
||||
expected :: Show a => String -> a -> Either String b
|
||||
expected name = Left . printf "Not a %s: %s" name . show
|
||||
|
||||
stream :: Object -> Either String ByteString
|
||||
stream (Stream {header, streamContent}) = Right $
|
||||
case Map.lookup (Name "Filter") header of
|
||||
Just (NameObject (Name "FlateDecode")) ->
|
||||
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
||||
_ -> streamContent
|
||||
stream obj = expected "stream" obj
|
||||
|
||||
getResource :: DirectObject -> T Dictionary
|
||||
getResource (Dictionary dictionary) = return dictionary
|
||||
getResource (Reference (IndirectObjCoordinates {objectId})) =
|
||||
|
@ -81,10 +72,10 @@ loadFont objectId = getObject objectId \\= dict >>= tryMappings
|
|||
tryMappings dictionary =
|
||||
loadCMap dictionary
|
||||
<|> lift (key "Encoding" dictionary >>= loadEncoding)
|
||||
<|> lift (Left $ unknownFormat (show objectId) (show dictionary))
|
||||
<|> (fail $ unknownFormat (show objectId) (show dictionary))
|
||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||
loadCMap dictionary =
|
||||
key "ToUnicode" dictionary //= follow \\= stream \\= cMap
|
||||
getField "ToUnicode" dictionary >>= follow \\= stream >>= cMap
|
||||
loadEncoding (NameObject (Name name)) = encoding name
|
||||
loadEncoding directObject =
|
||||
Left . printf "Encoding must be a name, not that : %s" $ show directObject
|
||||
|
@ -129,8 +120,7 @@ dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
|
|||
|
||||
pagesList :: T [ObjectId]
|
||||
pagesList = do
|
||||
root <- dictObject "Root" . trailer . docStructure =<< ask
|
||||
pages <- dictObject "Pages" root
|
||||
pages <- origin >>= (// ["Root", "Pages"]) >>= getDictionary
|
||||
case Map.lookup (Name "Kids") pages of
|
||||
Just (Array kids) -> return $ getReferences kids
|
||||
_ -> return []
|
||||
|
@ -158,7 +148,7 @@ loadPage source =
|
|||
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
|
||||
|
||||
getAll :: Content -> Either String (Map Int Page)
|
||||
getAll content = fst <$> evalRWST getPages content Map.empty
|
||||
getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty
|
||||
where
|
||||
numbered = Map.fromList . zip [1..]
|
||||
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
||||
|
@ -166,8 +156,8 @@ getAll content = fst <$> evalRWST getPages content Map.empty
|
|||
get :: Content -> Int -> Either String Page
|
||||
get content pageNumber
|
||||
| pageNumber < 1 = Left "Pages start at 1"
|
||||
| otherwise = fst <$> evalRWST getPage content Map.empty
|
||||
| otherwise = runExcept $ fst <$> evalRWST getPage content Map.empty
|
||||
where
|
||||
firstPage [] = lift $ Left "Page is out of bounds"
|
||||
firstPage [] = fail "Page is out of bounds"
|
||||
firstPage (p:_) = loadPage p
|
||||
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|
||||
|
|
Loading…
Reference in New Issue