WIP: still trying to figure things out, moved to a separate submodule for Navigation, proper naming is hell
This commit is contained in:
parent
e77bbbcda9
commit
aed7af376a
|
@ -21,6 +21,7 @@ library
|
||||||
, PDF.Content
|
, PDF.Content
|
||||||
, PDF.EOL
|
, PDF.EOL
|
||||||
, PDF.Object
|
, PDF.Object
|
||||||
|
, PDF.Object.Navigation
|
||||||
, PDF.Output
|
, PDF.Output
|
||||||
, PDF.Parser
|
, PDF.Parser
|
||||||
, PDF.Pages
|
, PDF.Pages
|
||||||
|
|
|
@ -12,6 +12,7 @@ module PDF.CMap (
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>), many)
|
import Control.Applicative ((<|>), many)
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.State (modify)
|
import Control.Monad.State (modify)
|
||||||
import Data.Attoparsec.ByteString.Char8 (count)
|
import Data.Attoparsec.ByteString.Char8 (count)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
|
@ -32,6 +33,7 @@ import PDF.Object (
|
||||||
, blank, directObject, integer, line, stringObject
|
, blank, directObject, integer, line, stringObject
|
||||||
)
|
)
|
||||||
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
import PDF.Parser (MonadParser, Parser, runParser, takeAll)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
|
||||||
type CMappers = Map Name CMap
|
type CMappers = Map Name CMap
|
||||||
type Mapping = Map ByteString Text
|
type Mapping = Map ByteString Text
|
||||||
|
@ -70,8 +72,8 @@ matches :: ByteString -> CRange -> Bool
|
||||||
matches code (CRange {fromSequence, toSequence}) =
|
matches code (CRange {fromSequence, toSequence}) =
|
||||||
fromSequence <= code && code <= toSequence
|
fromSequence <= code && code <= toSequence
|
||||||
|
|
||||||
cMap :: ByteString -> Either String Font
|
cMap :: MonadFail m => ByteString -> m Font
|
||||||
cMap = fmap (toFont . snd) <$> runParser
|
cMap = either fail (return . toFont . snd) . runParser
|
||||||
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
(many (codeRanges <|> cMapRange <|> cMapChar <|> ignoredLine))
|
||||||
emptyCMap
|
emptyCMap
|
||||||
where
|
where
|
||||||
|
|
|
@ -1,8 +1,6 @@
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
{-# LANGUAGE ConstraintKinds #-}
|
|
||||||
{-# LANGUAGE FlexibleContexts #-}
|
|
||||||
module PDF.Object (
|
module PDF.Object (
|
||||||
Content(..)
|
Content(..)
|
||||||
, Dictionary
|
, Dictionary
|
||||||
|
@ -19,7 +17,6 @@ module PDF.Object (
|
||||||
, Structure(..)
|
, Structure(..)
|
||||||
, XRefEntry(..)
|
, XRefEntry(..)
|
||||||
, XRefSection
|
, XRefSection
|
||||||
, (//)
|
|
||||||
, array
|
, array
|
||||||
, blank
|
, blank
|
||||||
, dictionary
|
, dictionary
|
||||||
|
@ -37,8 +34,6 @@ module PDF.Object (
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Control.Applicative ((<|>), many)
|
import Control.Applicative ((<|>), many)
|
||||||
import Control.Monad.Reader (MonadReader(..))
|
|
||||||
import Control.Monad.Fail (MonadFail(..))
|
|
||||||
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
import Data.Attoparsec.ByteString.Char8 (choice, count, option, sepBy)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString as BS (concat)
|
import qualified Data.ByteString as BS (concat)
|
||||||
|
@ -56,7 +51,6 @@ import PDF.Output (
|
||||||
, saveOffset
|
, saveOffset
|
||||||
)
|
)
|
||||||
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
|
import PDF.Parser (MonadParser(..), Parser, (<?>), octDigit, oneOf)
|
||||||
import Prelude hiding (fail)
|
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
line :: MonadParser m => String -> m ()
|
line :: MonadParser m => String -> m ()
|
||||||
|
@ -168,10 +162,10 @@ array =
|
||||||
type Dictionary = Map Name DirectObject
|
type Dictionary = Map Name DirectObject
|
||||||
|
|
||||||
instance Output Dictionary where
|
instance Output Dictionary where
|
||||||
output dict =
|
output aDictionary =
|
||||||
"<<" `mappend` keyValues `mappend` ">>"
|
"<<" `mappend` keyValues `mappend` ">>"
|
||||||
where
|
where
|
||||||
keyValues = join " " $ outputKeyVal <$> Map.toList dict
|
keyValues = join " " $ outputKeyVal <$> Map.toList aDictionary
|
||||||
outputKeyVal :: (Name, DirectObject) -> OBuilder
|
outputKeyVal :: (Name, DirectObject) -> OBuilder
|
||||||
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
|
outputKeyVal (key, val) = Output.concat [output key, " ", output val]
|
||||||
|
|
||||||
|
@ -410,36 +404,3 @@ instance Output Content where
|
||||||
]
|
]
|
||||||
where
|
where
|
||||||
Structure {xRef, trailer} = docStructure
|
Structure {xRef, trailer} = docStructure
|
||||||
|
|
||||||
type PDFContent m = (MonadReader Content m, MonadFail m)
|
|
||||||
|
|
||||||
--
|
|
||||||
-- Navigation
|
|
||||||
--
|
|
||||||
key :: MonadFail m => String -> Dictionary -> m DirectObject
|
|
||||||
key keyName dictionary =
|
|
||||||
maybe (fail errorMessage) return (Map.lookup (Name keyName) dictionary)
|
|
||||||
where
|
|
||||||
errorMessage =
|
|
||||||
printf "Key %s not found in dictionary %s" keyName (show dictionary)
|
|
||||||
|
|
||||||
dict :: PDFContent m => Object -> m Dictionary
|
|
||||||
dict (Direct (Dictionary aDict)) = return aDict
|
|
||||||
dict obj = fail $ "Not a dictionary : " ++ show obj
|
|
||||||
|
|
||||||
getObject :: PDFContent m => ObjectId -> m Object
|
|
||||||
getObject objectId = do
|
|
||||||
content <- ask
|
|
||||||
return (objects content ! objectId)
|
|
||||||
|
|
||||||
getResource :: PDFContent m => DirectObject -> m Dictionary
|
|
||||||
getResource (Dictionary aDict) = return aDict
|
|
||||||
getResource (Reference (IndirectObjCoordinates {objectId})) =
|
|
||||||
getObject objectId >>= dict
|
|
||||||
getResource directObject =
|
|
||||||
fail $ "Not a resource (dictionary or reference) : " ++ show directObject
|
|
||||||
|
|
||||||
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
|
|
||||||
(//) aDict [] = return $ Dictionary aDict
|
|
||||||
(//) aDict [field] = key field aDict
|
|
||||||
(//) aDict (field:fields) = key field aDict >>= getResource >>= (// fields)
|
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE ConstraintKinds #-}
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module PDF.Object.Navigation (
|
||||||
|
(//)
|
||||||
|
, getDictionary
|
||||||
|
, getField
|
||||||
|
, getObject
|
||||||
|
, openStream
|
||||||
|
, origin
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Codec.Compression.Zlib (decompress)
|
||||||
|
import Control.Monad.Reader (MonadReader(..))
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||||
|
import Data.Map ((!))
|
||||||
|
import qualified Data.Map as Map (lookup)
|
||||||
|
import PDF.Object (
|
||||||
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
|
, Name(..), Object(..), Structure(..)
|
||||||
|
)
|
||||||
|
import PDF.Output (ObjectId)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
type PDFContent m = (MonadReader Content m, MonadFail m)
|
||||||
|
|
||||||
|
expected :: (PDFContent m, Show a) => String -> a -> m b
|
||||||
|
expected name = fail . printf "Not a %s: %s" name . show
|
||||||
|
|
||||||
|
getField :: MonadFail m => String -> Dictionary -> m DirectObject
|
||||||
|
getField key aDictionary =
|
||||||
|
maybe (fail errorMessage) return (Map.lookup (Name key) aDictionary)
|
||||||
|
where
|
||||||
|
errorMessage =
|
||||||
|
printf "Key %s not found in dictionary %s" key (show aDictionary)
|
||||||
|
|
||||||
|
castDictionary :: PDFContent m => Object -> m Dictionary
|
||||||
|
castDictionary (Direct (Dictionary aDict)) = return aDict
|
||||||
|
castDictionary obj = expected "dictionary : " obj
|
||||||
|
|
||||||
|
getObject :: PDFContent m => ObjectId -> m Object
|
||||||
|
getObject objectId = do
|
||||||
|
content <- ask
|
||||||
|
return (objects content ! objectId)
|
||||||
|
|
||||||
|
getDictionary :: PDFContent m => DirectObject -> m Dictionary
|
||||||
|
getDictionary (Dictionary aDictionary) = return aDictionary
|
||||||
|
getDictionary (Reference (IndirectObjCoordinates {objectId})) =
|
||||||
|
getObject objectId >>= castDictionary
|
||||||
|
getDictionary aDirectObject =
|
||||||
|
expected "resource (dictionary or reference)" aDirectObject
|
||||||
|
|
||||||
|
(//) :: PDFContent m => Dictionary -> [String] -> m DirectObject
|
||||||
|
(//) aDict [] = return $ Dictionary aDict
|
||||||
|
(//) aDict [key] = getField key aDict
|
||||||
|
(//) aDict (key:keys) = getField key aDict >>= getDictionary >>= (// keys)
|
||||||
|
|
||||||
|
origin :: PDFContent m => m Dictionary
|
||||||
|
origin = trailer . docStructure <$> ask
|
||||||
|
|
||||||
|
openStream :: PDFContent m => Object -> m ByteString
|
||||||
|
openStream (Stream {header, streamContent}) = return $
|
||||||
|
case Map.lookup (Name "Filter") header of
|
||||||
|
Just (NameObject (Name "FlateDecode")) ->
|
||||||
|
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
||||||
|
_ -> streamContent
|
||||||
|
openStream obj = expected "stream" obj
|
|
@ -5,13 +5,13 @@ module PDF.Pages (
|
||||||
, getAll
|
, getAll
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Codec.Compression.Zlib (decompress)
|
|
||||||
import Control.Applicative ((<|>))
|
import Control.Applicative ((<|>))
|
||||||
import Control.Monad (foldM)
|
import Control.Monad (foldM)
|
||||||
|
import Control.Monad.Except (Except, runExcept)
|
||||||
|
import Control.Monad.Fail (MonadFail(..))
|
||||||
import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
|
import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
|
||||||
import qualified Control.Monad.RWS as RWS (get)
|
import qualified Control.Monad.RWS as RWS (get)
|
||||||
import Data.ByteString (ByteString)
|
import Data.ByteString (ByteString)
|
||||||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
|
||||||
import Data.Map (Map, (!))
|
import Data.Map (Map, (!))
|
||||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||||
import Data.Text (Text)
|
import Data.Text (Text)
|
||||||
|
@ -24,11 +24,13 @@ import PDF.Object (
|
||||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
, Object(..), Name(..), Structure(..)
|
, Object(..), Name(..), Structure(..)
|
||||||
,)
|
,)
|
||||||
|
import PDF.Object.Navigation ((//), getDictionary, origin, stream)
|
||||||
import PDF.Output (ObjectId(..))
|
import PDF.Output (ObjectId(..))
|
||||||
|
import Prelude hiding (fail)
|
||||||
import Text.Printf (printf)
|
import Text.Printf (printf)
|
||||||
|
|
||||||
type CachedFonts = Map ObjectId Font
|
type CachedFonts = Map ObjectId Font
|
||||||
type T = RWST Content () CachedFonts (Either String)
|
type T = RWST Content () CachedFonts (Except String)
|
||||||
data Page = Page {
|
data Page = Page {
|
||||||
contents :: [Text]
|
contents :: [Text]
|
||||||
, source :: ObjectId
|
, source :: ObjectId
|
||||||
|
@ -42,17 +44,6 @@ infixl 1 //=
|
||||||
(//=) :: Either String a -> (a -> T b) -> T b
|
(//=) :: Either String a -> (a -> T b) -> T b
|
||||||
x //= f = lift x >>= f
|
x //= f = lift x >>= f
|
||||||
|
|
||||||
expected :: Show a => String -> a -> Either String b
|
|
||||||
expected name = Left . printf "Not a %s: %s" name . show
|
|
||||||
|
|
||||||
stream :: Object -> Either String ByteString
|
|
||||||
stream (Stream {header, streamContent}) = Right $
|
|
||||||
case Map.lookup (Name "Filter") header of
|
|
||||||
Just (NameObject (Name "FlateDecode")) ->
|
|
||||||
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
|
||||||
_ -> streamContent
|
|
||||||
stream obj = expected "stream" obj
|
|
||||||
|
|
||||||
getResource :: DirectObject -> T Dictionary
|
getResource :: DirectObject -> T Dictionary
|
||||||
getResource (Dictionary dictionary) = return dictionary
|
getResource (Dictionary dictionary) = return dictionary
|
||||||
getResource (Reference (IndirectObjCoordinates {objectId})) =
|
getResource (Reference (IndirectObjCoordinates {objectId})) =
|
||||||
|
@ -81,10 +72,10 @@ loadFont objectId = getObject objectId \\= dict >>= tryMappings
|
||||||
tryMappings dictionary =
|
tryMappings dictionary =
|
||||||
loadCMap dictionary
|
loadCMap dictionary
|
||||||
<|> lift (key "Encoding" dictionary >>= loadEncoding)
|
<|> lift (key "Encoding" dictionary >>= loadEncoding)
|
||||||
<|> lift (Left $ unknownFormat (show objectId) (show dictionary))
|
<|> (fail $ unknownFormat (show objectId) (show dictionary))
|
||||||
unknownFormat = printf "Unknown font format for object #%s : %s"
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
||||||
loadCMap dictionary =
|
loadCMap dictionary =
|
||||||
key "ToUnicode" dictionary //= follow \\= stream \\= cMap
|
getField "ToUnicode" dictionary >>= follow \\= stream >>= cMap
|
||||||
loadEncoding (NameObject (Name name)) = encoding name
|
loadEncoding (NameObject (Name name)) = encoding name
|
||||||
loadEncoding directObject =
|
loadEncoding directObject =
|
||||||
Left . printf "Encoding must be a name, not that : %s" $ show directObject
|
Left . printf "Encoding must be a name, not that : %s" $ show directObject
|
||||||
|
@ -129,8 +120,7 @@ dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
|
||||||
|
|
||||||
pagesList :: T [ObjectId]
|
pagesList :: T [ObjectId]
|
||||||
pagesList = do
|
pagesList = do
|
||||||
root <- dictObject "Root" . trailer . docStructure =<< ask
|
pages <- origin >>= (// ["Root", "Pages"]) >>= getDictionary
|
||||||
pages <- dictObject "Pages" root
|
|
||||||
case Map.lookup (Name "Kids") pages of
|
case Map.lookup (Name "Kids") pages of
|
||||||
Just (Array kids) -> return $ getReferences kids
|
Just (Array kids) -> return $ getReferences kids
|
||||||
_ -> return []
|
_ -> return []
|
||||||
|
@ -158,7 +148,7 @@ loadPage source =
|
||||||
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
|
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
|
||||||
|
|
||||||
getAll :: Content -> Either String (Map Int Page)
|
getAll :: Content -> Either String (Map Int Page)
|
||||||
getAll content = fst <$> evalRWST getPages content Map.empty
|
getAll content = runExcept $ fst <$> evalRWST getPages content Map.empty
|
||||||
where
|
where
|
||||||
numbered = Map.fromList . zip [1..]
|
numbered = Map.fromList . zip [1..]
|
||||||
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
||||||
|
@ -166,8 +156,8 @@ getAll content = fst <$> evalRWST getPages content Map.empty
|
||||||
get :: Content -> Int -> Either String Page
|
get :: Content -> Int -> Either String Page
|
||||||
get content pageNumber
|
get content pageNumber
|
||||||
| pageNumber < 1 = Left "Pages start at 1"
|
| pageNumber < 1 = Left "Pages start at 1"
|
||||||
| otherwise = fst <$> evalRWST getPage content Map.empty
|
| otherwise = runExcept $ fst <$> evalRWST getPage content Map.empty
|
||||||
where
|
where
|
||||||
firstPage [] = lift $ Left "Page is out of bounds"
|
firstPage [] = fail "Page is out of bounds"
|
||||||
firstPage (p:_) = loadPage p
|
firstPage (p:_) = loadPage p
|
||||||
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|
||||||
|
|
Loading…
Reference in New Issue