Hufflepdf/src/PDF/Pages.hs

174 lines
5.7 KiB
Haskell
Executable File

{-# LANGUAGE NamedFieldPuns #-}
module PDF.Pages (
Page(..)
, get
, getAll
) where
import Codec.Compression.Zlib (decompress)
import Control.Applicative ((<|>))
import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, lift, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import Data.Text (Text)
import PDF.CMap (cMap)
import PDF.Encoding (encoding)
import PDF.Font (Font, FontSet)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId(..))
import PDF.Text (pageContents)
import Text.Printf (printf)
type CachedFonts = Map ObjectId Font
type T = RWST Content () CachedFonts (Either String)
data Page = Page {
contents :: [Text]
, source :: ObjectId
}
infixl 1 \\=
(\\=) :: T a -> (a -> Either String b) -> T b
x \\= f = x >>= lift . f
infixl 1 //=
(//=) :: Either String a -> (a -> T b) -> T b
x //= f = lift x >>= f
expected :: Show a => String -> a -> Either String b
expected name = Left . printf "Not a %s: %s" name . show
stream :: Object -> Either String ByteString
stream (Stream {header, streamContent}) = Right $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent
stream obj = expected "stream" obj
getResource :: DirectObject -> T Dictionary
getResource (Dictionary dictionary) = return dictionary
getResource (Reference (IndirectObjCoordinates {objectId})) =
getObject objectId \\= dict
getResource directObject =
lift $ expected "resource (dictionary or reference)" directObject
getFontDictionary :: Dictionary -> T Dictionary
getFontDictionary pageDict =
key "Resources" pageDict
//= getResource
\\= key "Font"
>>= getResource
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
where
load = do
value <- loader objectId
modify $ Map.insert objectId value
return value
loadFont :: ObjectId -> T Font
loadFont objectId = getObject objectId \\= dict >>= tryMappings
where
tryMappings dictionary =
loadCMap dictionary
<|> lift (key "Encoding" dictionary >>= loadEncoding)
<|> lift (Left $ unknownFormat (show objectId) (show dictionary))
unknownFormat = printf "Unknown font format for object #%s : %s"
loadCMap dictionary =
key "ToUnicode" dictionary //= follow \\= stream \\= cMap
loadEncoding (NameObject (Name name)) = encoding name
loadEncoding directObject =
Left . printf "Encoding must be a name, not that : %s" $ show directObject
loadFonts :: Dictionary -> T FontSet
loadFonts = foldM addFont Map.empty . Map.toList
where
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
addFont output _ = return output
getObject :: ObjectId -> T Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
key :: String -> Dictionary -> Either String DirectObject
key keyName dictionary =
maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary)
where
errorMessage =
printf "Key %s not found in dictionary %s" keyName (show dictionary)
target :: DirectObject -> Either String ObjectId
target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
target directObject = expected "reference" directObject
many :: DirectObject -> [DirectObject]
many (Array l) = l
many directObject = [directObject]
follow :: DirectObject -> T Object
follow directObject = target directObject //= getObject
dict :: Object -> Either String Dictionary
dict (Direct (Dictionary dictionary)) = Right dictionary
dict obj = expected "dictionary" obj
dictObject :: String -> Dictionary -> T Dictionary
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
pagesList :: T [ObjectId]
pagesList = do
root <- dictObject "Root" . trailer . docStructure =<< ask
pages <- dictObject "Pages" root
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids
_ -> return []
getReferences :: [DirectObject] -> [ObjectId]
getReferences objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractText :: Object -> T [Text]
extractText object = do
pageDict <- lift $ dict object
fonts <- loadFonts =<< getFontDictionary pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
concat <$> (objects //= (mapM $ loadContent fonts))
where
loadContent :: FontSet -> DirectObject -> T [Text]
loadContent fonts directObject =
follow directObject \\= stream \\= pageContents fonts
loadPage :: ObjectId -> T Page
loadPage source =
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
getAll :: Content -> Either String (Map Int Page)
getAll content = fst <$> evalRWST getPages content Map.empty
where
numbered = Map.fromList . zip [1..]
getPages = numbered <$> (mapM loadPage =<< pagesList)
get :: Content -> Int -> Either String Page
get content pageNumber
| pageNumber < 1 = Left "Pages start at 1"
| otherwise = fst <$> evalRWST getPage content Map.empty
where
firstPage [] = lift $ Left "Page is out of bounds"
firstPage (p:_) = loadPage p
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage