134 lines
4.3 KiB
Haskell
Executable File
134 lines
4.3 KiB
Haskell
Executable File
{-# LANGUAGE NamedFieldPuns #-}
|
|
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
|
module PDF.Pages (
|
|
Page(..)
|
|
, get
|
|
, getAll
|
|
) where
|
|
|
|
import Control.Applicative (Alternative(..), (<|>))
|
|
import Control.Monad (MonadPlus(..), foldM)
|
|
import Control.Monad.Fail (MonadFail(..))
|
|
import Control.Monad.RWS (RWST(..), evalRWST, modify)
|
|
import qualified Control.Monad.RWS as RWS (get)
|
|
import Data.Map (Map)
|
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
|
import Data.Text (Text)
|
|
import PDF.CMap (cMap)
|
|
import qualified PDF.Content as Content (parse)
|
|
import PDF.Content.Text (renderText)
|
|
import PDF.Encoding (encoding)
|
|
import PDF.Font (Font, FontSet)
|
|
import PDF.Object (
|
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
|
, Name(..)
|
|
,)
|
|
import PDF.Object.Navigation (
|
|
(//), dictionaryById, getDictionary, getField, follow, openStream, origin
|
|
)
|
|
import PDF.Output (ObjectId(..))
|
|
import Prelude hiding (fail)
|
|
import Text.Printf (printf)
|
|
|
|
type CachedFonts = Map ObjectId Font
|
|
newtype Error a = Error {
|
|
runError :: Either String a
|
|
} deriving (Alternative, Functor, Applicative, Monad, MonadPlus)
|
|
type T = RWST Content () CachedFonts Error
|
|
data Page = Page {
|
|
contents :: [Text]
|
|
, source :: ObjectId
|
|
}
|
|
|
|
instance MonadFail Error where
|
|
fail = Error . Left
|
|
|
|
getFontDictionary :: Dictionary -> T Dictionary
|
|
getFontDictionary pageDict =
|
|
((pageDict // ["Resources", "Font"]) >>= getDictionary)
|
|
<|> return Map.empty
|
|
|
|
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
|
cache loader objectId =
|
|
(maybe load return . Map.lookup objectId) =<< RWS.get
|
|
where
|
|
load = do
|
|
value <- loader objectId
|
|
modify $ Map.insert objectId value
|
|
return value
|
|
|
|
loadFont :: ObjectId -> T Font
|
|
loadFont objectId = dictionaryById objectId >>= tryMappings
|
|
where
|
|
tryMappings dictionary =
|
|
loadCMap dictionary
|
|
<|> (getField "Encoding" dictionary >>= loadEncoding)
|
|
<|> (fail $ unknownFormat (show objectId) (show dictionary))
|
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
|
loadCMap :: Dictionary -> T Font
|
|
loadCMap dictionary =
|
|
getField "ToUnicode" dictionary >>= follow >>= openStream >>= cMap
|
|
loadEncoding :: DirectObject -> T Font
|
|
loadEncoding (NameObject (Name name)) = encoding name
|
|
loadEncoding directObject =
|
|
fail $ printf "Encoding must be a name, not that : %s" $ show directObject
|
|
|
|
loadFonts :: Dictionary -> T FontSet
|
|
loadFonts = foldM addFont Map.empty . Map.toList
|
|
where
|
|
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
|
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
|
flip (Map.insert name) output <$> cache loadFont objectId
|
|
addFont output _ = return output
|
|
|
|
several :: DirectObject -> [DirectObject]
|
|
several (Array l) = l
|
|
several directObject = [directObject]
|
|
|
|
pagesList :: T [ObjectId]
|
|
pagesList = do
|
|
pages <- (origin :: T Dictionary) >>= (// ["Root", "Pages"]) >>= getDictionary
|
|
case Map.lookup (Name "Kids") pages of
|
|
Just (Array kids) -> return $ getReferences kids
|
|
_ -> return []
|
|
|
|
getReferences :: [DirectObject] -> [ObjectId]
|
|
getReferences objects = do
|
|
object <- objects
|
|
case object of
|
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
|
_ -> []
|
|
|
|
extractText :: Dictionary -> T [Text]
|
|
extractText pageDict = do
|
|
fonts <- loadFonts =<< getFontDictionary pageDict
|
|
objects <- several <$> getField "Contents" pageDict
|
|
concat <$> mapM (loadContent fonts) objects
|
|
where
|
|
loadContent :: FontSet -> DirectObject -> T [Text]
|
|
loadContent fonts directObject =
|
|
follow directObject
|
|
>>= openStream
|
|
>>= (either fail return . Content.parse)
|
|
>>= renderText fonts
|
|
|
|
loadPage :: ObjectId -> T Page
|
|
loadPage source = do
|
|
contents <- extractText =<< dictionaryById source
|
|
return $ Page {contents, source}
|
|
|
|
getAll :: Content -> Either String (Map Int Page)
|
|
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
|
|
where
|
|
numbered = Map.fromList . zip [1..]
|
|
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
|
|
|
get :: Content -> Int -> Either String Page
|
|
get content pageNumber
|
|
| pageNumber < 1 = Left "Pages start at 1"
|
|
| otherwise = runError $ fst <$> evalRWST getPage content Map.empty
|
|
where
|
|
firstPage [] = fail "Page is out of bounds"
|
|
firstPage (p:_) = loadPage p
|
|
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|