162 lines
5.7 KiB
Haskell
Executable File
162 lines
5.7 KiB
Haskell
Executable File
{-# LANGUAGE NamedFieldPuns #-}
|
|
{-# LANGUAGE TypeSynonymInstances #-}
|
|
{-# LANGUAGE FlexibleInstances #-}
|
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
|
{-# LANGUAGE ConstraintKinds #-}
|
|
{-# LANGUAGE FlexibleContexts #-}
|
|
{-# LANGUAGE UndecidableInstances #-}
|
|
module PDF.Pages (
|
|
Contents(..)
|
|
, FontCache
|
|
, Page(..)
|
|
, PageNumber(..)
|
|
, Pages(..)
|
|
, Text_(..)
|
|
, cacheFonts
|
|
, withResources
|
|
) where
|
|
|
|
import Control.Applicative (Alternative, (<|>))
|
|
import Control.Monad (foldM)
|
|
import Control.Monad.Fail (MonadFail(..))
|
|
import Control.Monad.Reader (ReaderT, runReaderT)
|
|
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
|
|
import qualified Control.Monad.RWS as RWS (get)
|
|
import Data.OrderedMap (OrderedMap, build)
|
|
import Data.Map (Map, foldrWithKey, mapWithKey)
|
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
|
import Data.Text (Text)
|
|
import PDF.Box (Box(..))
|
|
import PDF.CMap (cMap)
|
|
import PDF.Content (Content(..), Id(..), Indexed)
|
|
import qualified PDF.Content as Content (parse)
|
|
import PDF.Content.Text (Chunks(..))
|
|
import PDF.Encoding (encoding)
|
|
import PDF.Font (Font, FontSet)
|
|
import PDF.Layer (Layer(..))
|
|
import PDF.Object (
|
|
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
|
, Name(..), Object(..)
|
|
,)
|
|
import PDF.Object.Navigation (
|
|
Clear(..), PDFContent, (//), (>./), (>//), getDictionary
|
|
, getKey, objectById, origin
|
|
)
|
|
import PDF.Output (ObjectId(..))
|
|
import Prelude hiding (fail)
|
|
import Text.Printf (printf)
|
|
|
|
type CachedFonts = Map ObjectId Font
|
|
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
|
data Page = Page {
|
|
--contents :: [(ObjectId, Content)]
|
|
contents :: OrderedMap ObjectId Content
|
|
, resources :: Dictionary
|
|
, source :: ObjectId
|
|
}
|
|
|
|
--loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
|
|
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
|
|
loadContents directObject =
|
|
sequenceA . build loadContent $ objectIds directObject
|
|
where
|
|
loadContent :: PDFContent m => ObjectId -> m Content
|
|
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
|
|
objectIds (Array l) = l >>= getReference
|
|
objectIds dirObj = getReference dirObj
|
|
--build f = fmap (\k -> f k >>= \v -> return (k, v))
|
|
|
|
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
|
getFontDictionary pageObj =
|
|
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
|
|
|
|
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
|
|
cache loader objectId =
|
|
(maybe load return . Map.lookup objectId) =<< RWS.get
|
|
where
|
|
load = do
|
|
value <- loader objectId
|
|
modify $ Map.insert objectId value
|
|
return value
|
|
|
|
loadFont :: FontCache m => ObjectId -> m Font
|
|
loadFont objectId = objectById objectId >>= tryMappings
|
|
where
|
|
tryMappings object =
|
|
(object >./ "ToUnicode" >>= r Clear >>= cMap)
|
|
<|> (object >./ "Encoding" >>= loadEncoding)
|
|
<|> (fail $ unknownFormat (show objectId) (show object))
|
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
|
loadEncoding :: MonadFail m => Object -> m Font
|
|
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
|
loadEncoding object =
|
|
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
|
|
|
loadResources :: FontCache m => Dictionary -> m FontSet
|
|
loadResources = foldM addFont Map.empty . Map.toList
|
|
where
|
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
|
flip (Map.insert name) output <$> cache loadFont objectId
|
|
addFont output _ = return output
|
|
|
|
getReference :: DirectObject -> [ObjectId]
|
|
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
|
|
getReference _ = []
|
|
|
|
loadPage :: PDFContent m => ObjectId -> m Page
|
|
loadPage source = do
|
|
page <- objectById source
|
|
contents <- getKey "Contents" page >>= loadContents
|
|
resources <- getFontDictionary page
|
|
return $ Page {contents, resources, source}
|
|
|
|
pagesList :: PDFContent m => m [ObjectId]
|
|
pagesList =
|
|
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences)
|
|
<|> return []
|
|
where
|
|
getReferences (Array kids) = kids >>= getReference
|
|
getReferences _ = fail "Not a pages array"
|
|
|
|
data Pages = Pages
|
|
newtype PageNumber = P Int
|
|
data Contents = Contents
|
|
data Text_ = Text_
|
|
|
|
instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where
|
|
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
|
where
|
|
numbered :: [Page] -> Map Int Page
|
|
numbered = Map.fromList . zip [1..]
|
|
|
|
instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where
|
|
r (P i) layer
|
|
| i < 1 = fail "Pages start at 1"
|
|
| otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer
|
|
where
|
|
firstPage [] = fail "Page is out of bounds"
|
|
firstPage (p:_) = loadPage p
|
|
|
|
--instance Monad m => Box m Contents Page [(ObjectId, Content)] where
|
|
instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
|
|
r Contents = return . contents
|
|
|
|
{-
|
|
instance FontCache m => Box m Text_ Page (OrderedMap ObjectId (Indexed Text)) where
|
|
--instance FontCache m => Box m Text_ Page [((ObjectId, Id Text), Text)] where
|
|
r Text_ (Page {contents, resources}) =
|
|
loadResources resources
|
|
>>= runReaderT (mapM (r Chunks) contents)
|
|
-- >>= runReaderT (concat <$> mapM (r Chunks . snd) contents)
|
|
where
|
|
mixIn prefix = foldrWithKey (\k v m -> ((prefix, k), v):m) []
|
|
extractChunks (objectId, content) = mixIn objectId <$> r Chunks content
|
|
-}
|
|
|
|
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
|
cacheFonts = flip evalStateT Map.empty
|
|
|
|
withResources :: FontCache m => (Page -> ReaderT FontSet m b) -> Page -> m b
|
|
withResources f p =
|
|
loadResources (resources p) >>= runReaderT (f p)
|