Hufflepdf/src/PDF/Pages.hs

171 lines
5.4 KiB
Haskell
Executable File

{-# LANGUAGE NamedFieldPuns #-}
module PDF.Pages (
Page(..)
, get
, getAll
) where
import Codec.Compression.Zlib (decompress)
import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, mapRWST, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import PDF.CMap (CMap, CMappers, cMap)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId(..))
import PDF.Text (pageContents)
import Prelude hiding (fail)
import Text.Printf (printf)
type CachedCMaps = Map ObjectId CMap
type T = RWST Content () CachedCMaps (Either String)
data Page = Page {
contents :: [ByteString]
, source :: ObjectId
}
infixl 1 \\=
(\\=) :: T a -> (a -> Either String b) -> T b
x \\= f = mapRWST ((\(a, s, w) -> (\b -> (b, s, w)) <$> f a) =<<) x
infixl 1 //=
(//=) :: Either String a -> (a -> T b) -> T b
(//=) (Left e) _ = RWST (\_ _ -> Left e)
(//=) (Right a) f = f a
lift :: Either String a -> T a
lift x = x //= return
expected :: Show a => String -> a -> Either String b
expected name = Left . printf "Not a %s: %s" name . show
stream :: Object -> Either String ByteString
stream (Stream {header, streamContent}) = Right $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent
stream obj = expected "stream" obj
getResource :: DirectObject -> T Dictionary
getResource (Dictionary dictionary) = return dictionary
getResource (Reference (IndirectObjCoordinates {objectId})) =
getObject objectId \\= dict
getResource directObject =
lift $ expected "resource (dictionary or reference)" directObject
getFont :: Dictionary -> T Dictionary
getFont pageDict =
key "Resources" pageDict
//= getResource
\\= key "Font"
>>= getResource
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
where
load = do
value <- loader objectId
modify $ Map.insert objectId value
return value
loadFont :: ObjectId -> T CMap
loadFont objectId =
(getObject objectId
\\= dict
\\= key "ToUnicode"
>>= follow)
\\= stream
\\= cMap
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
where
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
--maybe output (flip (Map.insert name) output) <$> cache loadFont objectId
loadCMapper output _ = return output
getObject :: ObjectId -> T Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
key :: String -> Dictionary -> Either String DirectObject
key keyName dictionary =
maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary)
where
errorMessage =
printf "Key %s not found in dictionary %s" keyName (show dictionary)
target :: DirectObject -> Either String ObjectId
target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
target directObject = expected "reference" directObject
many :: DirectObject -> [DirectObject]
many (Array l) = l
many directObject = [directObject]
follow :: DirectObject -> T Object
follow directObject = target directObject //= getObject
dict :: Object -> Either String Dictionary
dict (Direct (Dictionary dictionary)) = Right dictionary
dict obj = expected "dictionary" obj
dictObject :: String -> Dictionary -> T Dictionary
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
pagesList :: T [ObjectId]
pagesList = do
root <- dictObject "Root" . trailer . docStructure =<< ask
pages <- dictObject "Pages" root
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids
_ -> return []
getReferences :: [DirectObject] -> [ObjectId]
getReferences objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractText :: Object -> T [ByteString]
extractText object = do
pageDict <- lift $ dict object
cMappers <- loadCMappers =<< getFont pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
concat <$> (objects //= (mapM $ loadContent cMappers))
where
loadContent :: CMappers -> DirectObject -> T [ByteString]
loadContent cMappers directObject =
follow directObject \\= stream \\= pageContents cMappers
loadPage :: ObjectId -> T Page
loadPage source =
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
getAll :: Content -> Either String (Map Int Page)
getAll content = fst <$> evalRWST getPages content Map.empty
where
numbered = Map.fromList . zip [1..]
getPages = numbered <$> (mapM loadPage =<< pagesList)
get :: Content -> Int -> Either String Page
get content pageNumber
| pageNumber < 1 = Left "Pages start at 1"
| otherwise = fst <$> evalRWST getPage content Map.empty
where
firstPage [] = lift $ Left "Page is out of bounds"
firstPage (p:_) = loadPage p
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage