Hufflepdf/src/PDF/Pages.hs

162 lines
5.7 KiB
Haskell
Executable File

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE UndecidableInstances #-}
module PDF.Pages (
Contents(..)
, FontCache
, Page(..)
, PageNumber(..)
, Pages(..)
, Text_(..)
, cacheFonts
, withResources
) where
import Control.Applicative (Alternative, (<|>))
import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (ReaderT, runReaderT)
import Control.Monad.State (MonadState, StateT(..), evalStateT, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.OrderedMap (OrderedMap, build)
import Data.Map (Map, foldrWithKey, mapWithKey)
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.Box (Box(..))
import PDF.CMap (cMap)
import PDF.Content (Content(..), Id(..), Indexed)
import qualified PDF.Content as Content (parse)
import PDF.Content.Text (Chunks(..))
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Layer (Layer(..))
import PDF.Object (
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..), Object(..)
,)
import PDF.Object.Navigation (
Clear(..), PDFContent, (//), (>./), (>//), getDictionary
, getKey, objectById, origin
)
import PDF.Output (ObjectId(..))
import Prelude hiding (fail)
import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type FontCache m = (MonadState CachedFonts m, PDFContent m)
data Page = Page {
--contents :: [(ObjectId, Content)]
contents :: OrderedMap ObjectId Content
, resources :: Dictionary
, source :: ObjectId
}
--loadContents :: PDFContent m => DirectObject -> m [(ObjectId, Content)]
loadContents :: PDFContent m => DirectObject -> m (OrderedMap ObjectId Content)
loadContents directObject =
sequenceA . build loadContent $ objectIds directObject
where
loadContent :: PDFContent m => ObjectId -> m Content
loadContent objectId = objectById objectId >>= r Clear >>= Content.parse
objectIds (Array l) = l >>= getReference
objectIds dirObj = getReference dirObj
--build f = fmap (\k -> f k >>= \v -> return (k, v))
getFontDictionary :: PDFContent m => Object -> m Dictionary
getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary) <|> return Map.empty
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
where
load = do
value <- loader objectId
modify $ Map.insert objectId value
return value
loadFont :: FontCache m => ObjectId -> m Font
loadFont objectId = objectById objectId >>= tryMappings
where
tryMappings object =
(object >./ "ToUnicode" >>= r Clear >>= cMap)
<|> (object >./ "Encoding" >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show object))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadEncoding :: MonadFail m => Object -> m Font
loadEncoding (Direct (NameObject (Name name))) = encoding name
loadEncoding object =
fail $ printf "Encoding must be a name, not that : %s" $ show object
loadResources :: FontCache m => Dictionary -> m FontSet
loadResources = foldM addFont Map.empty . Map.toList
where
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output
getReference :: DirectObject -> [ObjectId]
getReference (Reference (IndirectObjCoordinates {objectId})) = [objectId]
getReference _ = []
loadPage :: PDFContent m => ObjectId -> m Page
loadPage source = do
page <- objectById source
contents <- getKey "Contents" page >>= loadContents
resources <- getFontDictionary page
return $ Page {contents, resources, source}
pagesList :: PDFContent m => m [ObjectId]
pagesList =
(origin // ["Root", "Pages"] >>= getKey "Kids" >>= return . getReferences)
<|> return []
where
getReferences (Array kids) = kids >>= getReference
getReferences _ = fail "Not a pages array"
data Pages = Pages
newtype PageNumber = P Int
data Contents = Contents
data Text_ = Text_
instance (Alternative m, MonadFail m) => Box m Pages Layer (Map Int Page) where
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
where
numbered :: [Page] -> Map Int Page
numbered = Map.fromList . zip [1..]
instance (Alternative m, MonadFail m) => Box m PageNumber Layer Page where
r (P i) layer
| i < 1 = fail "Pages start at 1"
| otherwise = runReaderT (drop (i - 1) <$> pagesList >>= firstPage) layer
where
firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p
--instance Monad m => Box m Contents Page [(ObjectId, Content)] where
instance Monad m => Box m Contents Page (OrderedMap ObjectId Content) where
r Contents = return . contents
{-
instance FontCache m => Box m Text_ Page (OrderedMap ObjectId (Indexed Text)) where
--instance FontCache m => Box m Text_ Page [((ObjectId, Id Text), Text)] where
r Text_ (Page {contents, resources}) =
loadResources resources
>>= runReaderT (mapM (r Chunks) contents)
-- >>= runReaderT (concat <$> mapM (r Chunks . snd) contents)
where
mixIn prefix = foldrWithKey (\k v m -> ((prefix, k), v):m) []
extractChunks (objectId, content) = mixIn objectId <$> r Chunks content
-}
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
cacheFonts = flip evalStateT Map.empty
withResources :: FontCache m => (Page -> ReaderT FontSet m b) -> Page -> m b
withResources f p =
loadResources (resources p) >>= runReaderT (f p)