Hufflepdf/src/PDF/Pages.hs

155 lines
5.1 KiB
Haskell
Executable File

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE UndecidableInstances #-}
module PDF.Pages (
Page(..)
, PageNumber(..)
, Pages
, cacheFonts
--, get
--, getAll
) where
import Control.Applicative (Alternative(..))
import Control.Applicative ((<|>))
import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.Reader (runReaderT)
import Control.Monad.State (MonadState, StateT(..), evalStateT)
import Control.Monad.RWS (RWST(..), evalRWST, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.Box (Box(..))
import PDF.CMap (cMap)
import qualified PDF.Content as Content (parse)
import PDF.Content.Text (renderText)
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Layer (Layer(..))
import PDF.Object (
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..), Object(..)
,)
import PDF.Object.Navigation (
Error(..), PDFContent, StreamContent(..), (./), (//), (>./), (>//)
, castObject, getDictionary, objectById, origin
)
import PDF.Output (ObjectId(..))
import Prelude hiding (fail)
import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type T = RWST Layer () CachedFonts Error
type PageContext = StateT CachedFonts
type FontCache m = (MonadState CachedFonts m, PDFContent m)
--type PDFContent m = (MonadState Layer m, MonadFail m, MonadState CachedFonts m)
data Page = Page {
contents :: [Text]
, source :: ObjectId
}
getFontDictionary :: PDFContent m => Object -> m Dictionary
getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary)
<|> return Map.empty
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
where
load = do
value <- loader objectId
modify $ Map.insert objectId value
return value
loadFont :: FontCache m => ObjectId -> m Font
loadFont objectId = objectById objectId >>= tryMappings
where
tryMappings object =
(object >./ "ToUnicode" >>= r StreamContent >>= cMap)
<|> (object >./ "Encoding" >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show object))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadEncoding :: MonadFail m => Object -> m Font
loadEncoding (Direct (NameObject (Name name))) = encoding name
loadEncoding object =
fail $ printf "Encoding must be a name, not that : %s" $ show object
loadFonts :: FontCache m => Dictionary -> m FontSet
loadFonts = foldM addFont Map.empty . Map.toList
where
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output
several :: PDFContent m => Object -> m [Object]
several (Direct (Array l)) = mapM castObject l
several object = return [object]
pagesList :: PDFContent m => m [ObjectId]
pagesList = do
pages <- origin // ["Root", "Pages"] >>= getDictionary
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids
_ -> return []
getReferences :: [DirectObject] -> [ObjectId]
getReferences objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractText :: FontCache m => Object -> m [Text]
extractText pageObj = do
fonts <- loadFonts =<< getFontDictionary pageObj
objects <- pageObj >./ "Contents" >>= several
concat <$> mapM (loadContent fonts) objects
where
loadContent fonts object =
r StreamContent object
>>= (either fail return . Content.parse)
>>= renderText fonts
loadPage :: FontCache m => ObjectId -> m Page
loadPage source = do
contents <- extractText =<< objectById source
return $ Page {contents, source}
data Pages = Pages
newtype PageNumber = P Int
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
where
numbered :: [Page] -> Map Int Page
numbered = Map.fromList . zip [1..]
instance Box T PageNumber Layer Page
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
cacheFonts = flip evalStateT Map.empty
{-
getAll :: Layer -> Either String (Map Int Page)
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
where
numbered = Map.fromList . zip [1..]
getPages = numbered <$> (mapM loadPage =<< pagesList)
get :: Layer -> Int -> Either String Page
get content pageNumber
| pageNumber < 1 = Left "Pages start at 1"
| otherwise = runError $ fst <$> evalRWST getPage content Map.empty
where
firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
-}