135 lines
4.5 KiB
Haskell
Executable File
135 lines
4.5 KiB
Haskell
Executable File
{-# LANGUAGE NamedFieldPuns #-}
|
|
{-# LANGUAGE TypeSynonymInstances #-}
|
|
{-# LANGUAGE FlexibleInstances #-}
|
|
{-# LANGUAGE MultiParamTypeClasses #-}
|
|
{-# LANGUAGE ConstraintKinds #-}
|
|
{-# LANGUAGE FlexibleContexts #-}
|
|
{-# LANGUAGE UndecidableInstances #-}
|
|
module PDF.Pages (
|
|
Page(..)
|
|
, PageNumber(..)
|
|
, Pages(..)
|
|
, cacheFonts
|
|
) where
|
|
|
|
import Control.Applicative (Alternative(..))
|
|
import Control.Applicative ((<|>))
|
|
import Control.Monad (foldM)
|
|
import Control.Monad.Fail (MonadFail(..))
|
|
import Control.Monad.Reader (runReaderT)
|
|
import Control.Monad.State (MonadState, StateT(..), evalStateT)
|
|
import Control.Monad.RWS (RWST(..), evalRWST, modify)
|
|
import qualified Control.Monad.RWS as RWS (get)
|
|
import Data.Map (Map)
|
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
|
import Data.Text (Text)
|
|
import PDF.Box (Box(..))
|
|
import PDF.CMap (cMap)
|
|
import qualified PDF.Content as Content (parse)
|
|
import PDF.Content.Text (renderText)
|
|
import PDF.Encoding (encoding)
|
|
import PDF.Font (Font, FontSet)
|
|
import PDF.Layer (Layer(..))
|
|
import PDF.Object (
|
|
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
|
, Name(..), Object(..)
|
|
,)
|
|
import PDF.Object.Navigation (
|
|
Error(..), PDFContent, StreamContent(..), (./), (//), (>./), (>//)
|
|
, castObject, getDictionary, objectById, origin
|
|
)
|
|
import PDF.Output (ObjectId(..))
|
|
import Prelude hiding (fail)
|
|
import Text.Printf (printf)
|
|
|
|
type CachedFonts = Map ObjectId Font
|
|
type T = RWST Layer () CachedFonts Error
|
|
type PageContext = StateT CachedFonts
|
|
type FontCache m = (MonadState CachedFonts m, PDFContent m)
|
|
data Page = Page {
|
|
contents :: [Text]
|
|
, source :: ObjectId
|
|
}
|
|
|
|
getFontDictionary :: PDFContent m => Object -> m Dictionary
|
|
getFontDictionary pageObj =
|
|
(pageObj >// ["Resources", "Font"] >>= getDictionary)
|
|
<|> return Map.empty
|
|
|
|
cache :: FontCache m => (ObjectId -> m Font) -> ObjectId -> m Font
|
|
cache loader objectId =
|
|
(maybe load return . Map.lookup objectId) =<< RWS.get
|
|
where
|
|
load = do
|
|
value <- loader objectId
|
|
modify $ Map.insert objectId value
|
|
return value
|
|
|
|
loadFont :: FontCache m => ObjectId -> m Font
|
|
loadFont objectId = objectById objectId >>= tryMappings
|
|
where
|
|
tryMappings object =
|
|
(object >./ "ToUnicode" >>= r StreamContent >>= cMap)
|
|
<|> (object >./ "Encoding" >>= loadEncoding)
|
|
<|> (fail $ unknownFormat (show objectId) (show object))
|
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
|
loadEncoding :: MonadFail m => Object -> m Font
|
|
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
|
loadEncoding object =
|
|
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
|
|
|
loadFonts :: FontCache m => Dictionary -> m FontSet
|
|
loadFonts = foldM addFont Map.empty . Map.toList
|
|
where
|
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
|
flip (Map.insert name) output <$> cache loadFont objectId
|
|
addFont output _ = return output
|
|
|
|
several :: PDFContent m => Object -> m [Object]
|
|
several (Direct (Array l)) = mapM castObject l
|
|
several object = return [object]
|
|
|
|
pagesList :: PDFContent m => m [ObjectId]
|
|
pagesList = do
|
|
pages <- origin // ["Root", "Pages"] >>= getDictionary
|
|
case Map.lookup (Name "Kids") pages of
|
|
Just (Array kids) -> return $ getReferences kids
|
|
_ -> return []
|
|
|
|
getReferences :: [DirectObject] -> [ObjectId]
|
|
getReferences objects = do
|
|
object <- objects
|
|
case object of
|
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
|
_ -> []
|
|
|
|
extractText :: FontCache m => Object -> m [Text]
|
|
extractText pageObj = do
|
|
fonts <- loadFonts =<< getFontDictionary pageObj
|
|
objects <- pageObj >./ "Contents" >>= several
|
|
concat <$> mapM (loadContent fonts) objects
|
|
where
|
|
loadContent fonts object =
|
|
r StreamContent object
|
|
>>= (either fail return . Content.parse)
|
|
>>= renderText fonts
|
|
|
|
loadPage :: FontCache m => ObjectId -> m Page
|
|
loadPage source = do
|
|
contents <- extractText =<< objectById source
|
|
return $ Page {contents, source}
|
|
|
|
data Pages = Pages
|
|
newtype PageNumber = P Int
|
|
|
|
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m Pages Layer (Map Int Page) where
|
|
r Pages layer = runReaderT (numbered <$> (mapM loadPage =<< pagesList)) layer
|
|
where
|
|
numbered :: [Page] -> Map Int Page
|
|
numbered = Map.fromList . zip [1..]
|
|
|
|
instance (MonadFail m, Alternative m, MonadState CachedFonts m) => Box m PageNumber Layer Page where
|
|
|
|
cacheFonts :: Monad m => StateT CachedFonts m a -> m a
|
|
cacheFonts = flip evalStateT Map.empty
|