Hufflepdf/src/PDF/Pages.hs

124 lines
3.9 KiB
Haskell
Executable File

{-# LANGUAGE NamedFieldPuns #-}
module PDF.Pages (
Page(..)
, get
, getAll
) where
import Control.Applicative ((<|>))
import Control.Monad (foldM)
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.RWS (RWST(..), evalRWST, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.CMap (cMap)
import qualified PDF.Content as Content (parse)
import PDF.Content.Text (renderText)
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..), Object(..)
,)
import PDF.Object.Navigation (
Error(..), (//), (>./), (>//), getDictionary, objectById, openStream
, origin
)
import PDF.Output (ObjectId(..))
import Prelude hiding (fail)
import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type T = RWST Content () CachedFonts Error
data Page = Page {
contents :: [Text]
, source :: ObjectId
}
getFontDictionary :: Object -> T Dictionary
getFontDictionary pageObj =
(pageObj >// ["Resources", "Font"] >>= getDictionary)
<|> return Map.empty
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
where
load = do
value <- loader objectId
modify $ Map.insert objectId value
return value
loadFont :: ObjectId -> T Font
loadFont objectId = objectById objectId >>= tryMappings
where
tryMappings object =
(object >./ "ToUnicode" >>= openStream >>= cMap)
<|> (object >./ "Encoding" >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show object))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadEncoding :: Object -> T Font
loadEncoding (Direct (NameObject (Name name))) = encoding name
loadEncoding object =
fail $ printf "Encoding must be a name, not that : %s" $ show object
loadFonts :: Dictionary -> T FontSet
loadFonts = foldM addFont Map.empty . Map.toList
where
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output
several :: Object -> [Object]
several (Direct (Array l)) = Direct <$> l
several object = [object]
pagesList :: T [ObjectId]
pagesList = do
pages <- origin // ["Root", "Pages"] >>= getDictionary
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids
_ -> return []
getReferences :: [DirectObject] -> [ObjectId]
getReferences objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractText :: Object -> T [Text]
extractText pageObj = do
fonts <- loadFonts =<< getFontDictionary pageObj
objects <- several <$> pageObj >./ "Contents"
concat <$> mapM (loadContent fonts) objects
where
loadContent :: FontSet -> Object -> T [Text]
loadContent fonts object =
openStream object
>>= (either fail return . Content.parse)
>>= renderText fonts
loadPage :: ObjectId -> T Page
loadPage source = do
contents <- extractText =<< objectById source
return $ Page {contents, source}
getAll :: Content -> Either String (Map Int Page)
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
where
numbered = Map.fromList . zip [1..]
getPages = numbered <$> (mapM loadPage =<< pagesList)
get :: Content -> Int -> Either String Page
get content pageNumber
| pageNumber < 1 = Left "Pages start at 1"
| otherwise = runError $ fst <$> evalRWST getPage content Map.empty
where
firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage