2019-11-29 11:51:35 +01:00
|
|
|
{-# LANGUAGE NamedFieldPuns #-}
|
|
|
|
module PDF.Pages (
|
|
|
|
Page(..)
|
|
|
|
, get
|
|
|
|
, getAll
|
|
|
|
) where
|
|
|
|
|
2020-02-15 10:22:42 +01:00
|
|
|
import Control.Applicative ((<|>))
|
|
|
|
import Control.Monad (foldM)
|
2020-02-11 08:29:08 +01:00
|
|
|
import Control.Monad.Fail (MonadFail(..))
|
2020-02-11 22:41:46 +01:00
|
|
|
import Control.Monad.RWS (RWST(..), evalRWST, modify)
|
2019-11-29 11:51:35 +01:00
|
|
|
import qualified Control.Monad.RWS as RWS (get)
|
2020-02-11 22:41:46 +01:00
|
|
|
import Data.Map (Map)
|
2019-11-29 11:51:35 +01:00
|
|
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
2020-02-08 08:15:32 +01:00
|
|
|
import Data.Text (Text)
|
|
|
|
import PDF.CMap (cMap)
|
2020-02-10 10:54:44 +01:00
|
|
|
import qualified PDF.Content as Content (parse)
|
|
|
|
import PDF.Content.Text (renderText)
|
2020-02-08 08:15:32 +01:00
|
|
|
import PDF.Encoding (encoding)
|
|
|
|
import PDF.Font (Font, FontSet)
|
2020-02-17 15:29:59 +01:00
|
|
|
import PDF.Layer (Layer(..))
|
2019-11-29 11:51:35 +01:00
|
|
|
import PDF.Object (
|
2020-02-17 15:29:59 +01:00
|
|
|
Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
2020-02-15 13:51:24 +01:00
|
|
|
, Name(..), Object(..)
|
2019-11-29 11:51:35 +01:00
|
|
|
,)
|
2020-02-11 17:59:15 +01:00
|
|
|
import PDF.Object.Navigation (
|
2020-02-15 13:51:24 +01:00
|
|
|
Error(..), (//), (>./), (>//), getDictionary, objectById, openStream
|
|
|
|
, origin
|
2020-02-11 17:59:15 +01:00
|
|
|
)
|
2019-11-29 11:51:35 +01:00
|
|
|
import PDF.Output (ObjectId(..))
|
2020-02-11 08:29:08 +01:00
|
|
|
import Prelude hiding (fail)
|
2019-11-29 11:51:35 +01:00
|
|
|
import Text.Printf (printf)
|
|
|
|
|
2020-02-08 08:15:32 +01:00
|
|
|
type CachedFonts = Map ObjectId Font
|
2020-02-17 15:29:59 +01:00
|
|
|
type T = RWST Layer () CachedFonts Error
|
2019-11-29 11:51:35 +01:00
|
|
|
data Page = Page {
|
2020-02-08 08:15:32 +01:00
|
|
|
contents :: [Text]
|
2019-11-29 11:51:35 +01:00
|
|
|
, source :: ObjectId
|
|
|
|
}
|
|
|
|
|
2020-02-15 13:51:24 +01:00
|
|
|
getFontDictionary :: Object -> T Dictionary
|
|
|
|
getFontDictionary pageObj =
|
|
|
|
(pageObj >// ["Resources", "Font"] >>= getDictionary)
|
2020-02-11 17:59:15 +01:00
|
|
|
<|> return Map.empty
|
2019-11-29 11:51:35 +01:00
|
|
|
|
2020-02-08 08:15:32 +01:00
|
|
|
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
2019-11-29 11:51:35 +01:00
|
|
|
cache loader objectId =
|
|
|
|
(maybe load return . Map.lookup objectId) =<< RWS.get
|
|
|
|
where
|
|
|
|
load = do
|
|
|
|
value <- loader objectId
|
|
|
|
modify $ Map.insert objectId value
|
|
|
|
return value
|
|
|
|
|
2020-02-08 08:15:32 +01:00
|
|
|
loadFont :: ObjectId -> T Font
|
2020-02-15 13:51:24 +01:00
|
|
|
loadFont objectId = objectById objectId >>= tryMappings
|
2020-02-08 08:15:32 +01:00
|
|
|
where
|
2020-02-15 13:51:24 +01:00
|
|
|
tryMappings object =
|
|
|
|
(object >./ "ToUnicode" >>= openStream >>= cMap)
|
|
|
|
<|> (object >./ "Encoding" >>= loadEncoding)
|
|
|
|
<|> (fail $ unknownFormat (show objectId) (show object))
|
2020-02-08 08:15:32 +01:00
|
|
|
unknownFormat = printf "Unknown font format for object #%s : %s"
|
2020-02-15 13:51:24 +01:00
|
|
|
loadEncoding :: Object -> T Font
|
|
|
|
loadEncoding (Direct (NameObject (Name name))) = encoding name
|
|
|
|
loadEncoding object =
|
|
|
|
fail $ printf "Encoding must be a name, not that : %s" $ show object
|
2020-02-08 08:15:32 +01:00
|
|
|
|
|
|
|
loadFonts :: Dictionary -> T FontSet
|
|
|
|
loadFonts = foldM addFont Map.empty . Map.toList
|
2019-11-29 11:51:35 +01:00
|
|
|
where
|
2020-02-08 08:15:32 +01:00
|
|
|
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
|
|
|
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
2019-11-29 11:51:35 +01:00
|
|
|
flip (Map.insert name) output <$> cache loadFont objectId
|
2020-02-08 08:15:32 +01:00
|
|
|
addFont output _ = return output
|
2019-11-29 11:51:35 +01:00
|
|
|
|
2020-02-15 13:51:24 +01:00
|
|
|
several :: Object -> [Object]
|
|
|
|
several (Direct (Array l)) = Direct <$> l
|
|
|
|
several object = [object]
|
2019-11-29 11:51:35 +01:00
|
|
|
|
|
|
|
pagesList :: T [ObjectId]
|
|
|
|
pagesList = do
|
2020-02-15 13:51:24 +01:00
|
|
|
pages <- origin // ["Root", "Pages"] >>= getDictionary
|
2019-11-29 11:51:35 +01:00
|
|
|
case Map.lookup (Name "Kids") pages of
|
|
|
|
Just (Array kids) -> return $ getReferences kids
|
|
|
|
_ -> return []
|
|
|
|
|
|
|
|
getReferences :: [DirectObject] -> [ObjectId]
|
|
|
|
getReferences objects = do
|
|
|
|
object <- objects
|
|
|
|
case object of
|
|
|
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
|
|
|
_ -> []
|
|
|
|
|
2020-02-15 13:51:24 +01:00
|
|
|
extractText :: Object -> T [Text]
|
|
|
|
extractText pageObj = do
|
|
|
|
fonts <- loadFonts =<< getFontDictionary pageObj
|
|
|
|
objects <- several <$> pageObj >./ "Contents"
|
2020-02-11 22:41:46 +01:00
|
|
|
concat <$> mapM (loadContent fonts) objects
|
2019-11-29 11:51:35 +01:00
|
|
|
where
|
2020-02-15 13:51:24 +01:00
|
|
|
loadContent :: FontSet -> Object -> T [Text]
|
|
|
|
loadContent fonts object =
|
|
|
|
openStream object
|
2020-02-11 22:41:46 +01:00
|
|
|
>>= (either fail return . Content.parse)
|
2020-02-11 17:59:15 +01:00
|
|
|
>>= renderText fonts
|
2019-11-29 11:51:35 +01:00
|
|
|
|
|
|
|
loadPage :: ObjectId -> T Page
|
2020-02-11 17:59:15 +01:00
|
|
|
loadPage source = do
|
2020-02-15 13:51:24 +01:00
|
|
|
contents <- extractText =<< objectById source
|
2020-02-11 17:59:15 +01:00
|
|
|
return $ Page {contents, source}
|
2019-11-29 11:51:35 +01:00
|
|
|
|
2020-02-17 15:29:59 +01:00
|
|
|
getAll :: Layer -> Either String (Map Int Page)
|
2020-02-11 22:41:46 +01:00
|
|
|
getAll content = runError $ fst <$> evalRWST getPages content Map.empty
|
2019-11-29 11:51:35 +01:00
|
|
|
where
|
|
|
|
numbered = Map.fromList . zip [1..]
|
|
|
|
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
|
|
|
|
2020-02-17 15:29:59 +01:00
|
|
|
get :: Layer -> Int -> Either String Page
|
2019-11-29 11:51:35 +01:00
|
|
|
get content pageNumber
|
|
|
|
| pageNumber < 1 = Left "Pages start at 1"
|
2020-02-11 22:41:46 +01:00
|
|
|
| otherwise = runError $ fst <$> evalRWST getPage content Map.empty
|
2019-11-29 11:51:35 +01:00
|
|
|
where
|
2020-02-11 08:29:08 +01:00
|
|
|
firstPage [] = fail "Page is out of bounds"
|
2019-11-29 11:51:35 +01:00
|
|
|
firstPage (p:_) = loadPage p
|
|
|
|
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|