Hufflepdf/src/PDF/Pages.hs

134 lines
4.3 KiB
Haskell
Executable File

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
module PDF.Pages (
Page(..)
, get
, getAll
) where
import Control.Applicative (Alternative(..), (<|>))
import Control.Monad (MonadPlus(..), foldM)
import Control.Monad.Fail (MonadFail(..))
import Control.Monad.RWS (RWST(..), evalRWST, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.Map (Map)
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.CMap (cMap)
import qualified PDF.Content as Content (parse)
import PDF.Content.Text (renderText)
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Name(..)
,)
import PDF.Object.Navigation (
(//), dictionaryById, getDictionary, getField, follow, openStream, origin
)
import PDF.Output (ObjectId(..))
import Prelude hiding (fail)
import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
newtype Error a = Error {
runError :: Either String a
} deriving (Alternative, Functor, Applicative, Monad, MonadPlus)
type T = RWST Content () CachedFonts Error
data Page = Page {
contents :: [Text]
, source :: ObjectId
}
instance MonadFail Error where
fail = Error . Left
getFontDictionary :: Dictionary -> T Dictionary
getFontDictionary pageDict =
((pageDict // ["Resources", "Font"]) >>= getDictionary)
<|> return Map.empty
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
where
load = do
value <- loader objectId
modify $ Map.insert objectId value
return value
loadFont :: ObjectId -> T Font
loadFont objectId = dictionaryById objectId >>= tryMappings
where
tryMappings dictionary =
loadCMap dictionary
<|> (getField "Encoding" dictionary >>= loadEncoding)
<|> (fail $ unknownFormat (show objectId) (show dictionary))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadCMap :: Dictionary -> T Font
loadCMap dictionary =
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
loadEncoding :: DirectObject -> T Font
loadEncoding (NameObject (Name name)) = encoding name
loadEncoding directObject =
fail $ printf "Encoding must be a name, not that : %s" $ show directObject
loadFonts :: Dictionary -> T FontSet
loadFonts = foldM addFont Map.empty . Map.toList
where
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output
several :: DirectObject -> [DirectObject]
several (Array l) = l
several directObject = [directObject]
pagesList :: T [ObjectId]
pagesList = do
pages <- (origin :: T Dictionary) >>= (// ["Root", "Pages"]) >>= getDictionary
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids
_ -> return []
getReferences :: [DirectObject] -> [ObjectId]
getReferences objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractText :: Dictionary -> T [Text]
extractText pageDict = do
fonts <- loadFonts =<< getFontDictionary pageDict
objects <- several <$> getField "Contents" pageDict
concat <$> mapM (loadContent fonts) objects
where
loadContent :: FontSet -> DirectObject -> T [Text]
loadContent fonts directObject =
follow directObject
>>= openStream
>>= (either fail return . Content.parse)
>>= renderText fonts
loadPage :: ObjectId -> T Page
loadPage source = do
contents <- extractText =<< dictionaryById source
return $ Page {contents, source}
getAll :: Content -> Either String (Map Int Page)
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
where
numbered = Map.fromList . zip [1..]
getPages = numbered <$> (mapM loadPage =<< pagesList)
get :: Content -> Int -> Either String Page
get content pageNumber
| pageNumber < 1 = Left "Pages start at 1"
| otherwise = runError $ fst <$> evalRWST getPage content Map.empty
where
firstPage [] = fail "Page is out of bounds"
firstPage (p:_) = loadPage p
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage