2019-11-29 11:51:35 +01:00
|
|
|
{-# LANGUAGE NamedFieldPuns #-}
|
|
|
|
module PDF.Pages (
|
|
|
|
Page(..)
|
|
|
|
, get
|
|
|
|
, getAll
|
|
|
|
) where
|
|
|
|
|
|
|
|
import Codec.Compression.Zlib (decompress)
|
|
|
|
import Control.Monad (foldM)
|
|
|
|
import Control.Monad.RWS (RWST(..), ask, evalRWST, mapRWST, modify)
|
|
|
|
import qualified Control.Monad.RWS as RWS (get)
|
|
|
|
import Data.ByteString (ByteString)
|
|
|
|
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
|
|
|
import Data.Map (Map, (!))
|
|
|
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
2020-02-05 17:42:17 +01:00
|
|
|
import PDF.CMap (cMap)
|
2020-02-05 18:03:44 +01:00
|
|
|
import PDF.Encoding.MacRoman (macRomanEncoding)
|
2020-02-05 17:42:17 +01:00
|
|
|
import PDF.Font (Font, FontSet)
|
2019-11-29 11:51:35 +01:00
|
|
|
import PDF.Object (
|
|
|
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
|
|
|
, Object(..), Name(..), Structure(..)
|
|
|
|
,)
|
|
|
|
import PDF.Output (ObjectId(..))
|
|
|
|
import PDF.Text (pageContents)
|
|
|
|
import Text.Printf (printf)
|
|
|
|
|
2020-02-05 17:42:17 +01:00
|
|
|
type CachedFonts = Map ObjectId Font
|
|
|
|
type T = RWST Content () CachedFonts (Either String)
|
2019-11-29 11:51:35 +01:00
|
|
|
data Page = Page {
|
|
|
|
contents :: [ByteString]
|
|
|
|
, source :: ObjectId
|
|
|
|
}
|
|
|
|
|
|
|
|
infixl 1 \\=
|
|
|
|
(\\=) :: T a -> (a -> Either String b) -> T b
|
|
|
|
x \\= f = mapRWST ((\(a, s, w) -> (\b -> (b, s, w)) <$> f a) =<<) x
|
|
|
|
|
|
|
|
infixl 1 //=
|
|
|
|
(//=) :: Either String a -> (a -> T b) -> T b
|
|
|
|
(//=) (Left e) _ = RWST (\_ _ -> Left e)
|
|
|
|
(//=) (Right a) f = f a
|
|
|
|
|
|
|
|
lift :: Either String a -> T a
|
|
|
|
lift x = x //= return
|
|
|
|
|
|
|
|
expected :: Show a => String -> a -> Either String b
|
|
|
|
expected name = Left . printf "Not a %s: %s" name . show
|
|
|
|
|
|
|
|
stream :: Object -> Either String ByteString
|
|
|
|
stream (Stream {header, streamContent}) = Right $
|
|
|
|
case Map.lookup (Name "Filter") header of
|
|
|
|
Just (NameObject (Name "FlateDecode")) ->
|
|
|
|
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
|
|
|
_ -> streamContent
|
|
|
|
stream obj = expected "stream" obj
|
|
|
|
|
|
|
|
getResource :: DirectObject -> T Dictionary
|
|
|
|
getResource (Dictionary dictionary) = return dictionary
|
|
|
|
getResource (Reference (IndirectObjCoordinates {objectId})) =
|
|
|
|
getObject objectId \\= dict
|
|
|
|
getResource directObject =
|
|
|
|
lift $ expected "resource (dictionary or reference)" directObject
|
|
|
|
|
2020-02-05 17:42:17 +01:00
|
|
|
getFontDictionary :: Dictionary -> T Dictionary
|
|
|
|
getFontDictionary pageDict =
|
2019-11-29 11:51:35 +01:00
|
|
|
key "Resources" pageDict
|
|
|
|
//= getResource
|
|
|
|
\\= key "Font"
|
2020-02-04 17:04:42 +01:00
|
|
|
>>= getResource
|
2019-11-29 11:51:35 +01:00
|
|
|
|
2020-02-05 17:42:17 +01:00
|
|
|
cache :: (ObjectId -> T Font) -> ObjectId -> T Font
|
2019-11-29 11:51:35 +01:00
|
|
|
cache loader objectId =
|
|
|
|
(maybe load return . Map.lookup objectId) =<< RWS.get
|
|
|
|
where
|
|
|
|
load = do
|
|
|
|
value <- loader objectId
|
|
|
|
modify $ Map.insert objectId value
|
|
|
|
return value
|
|
|
|
|
2020-02-05 17:42:17 +01:00
|
|
|
loadFont :: ObjectId -> T Font
|
2019-11-29 11:51:35 +01:00
|
|
|
loadFont objectId =
|
2020-02-04 17:05:15 +01:00
|
|
|
getObject objectId
|
2019-11-29 11:51:35 +01:00
|
|
|
\\= dict
|
|
|
|
\\= key "ToUnicode"
|
2020-02-04 17:05:15 +01:00
|
|
|
>>= follow
|
2019-11-29 11:51:35 +01:00
|
|
|
\\= stream
|
|
|
|
\\= cMap
|
|
|
|
|
2020-02-05 17:42:17 +01:00
|
|
|
loadFonts :: Dictionary -> T FontSet
|
|
|
|
loadFonts = foldM addFont Map.empty . Map.toList
|
2019-11-29 11:51:35 +01:00
|
|
|
where
|
2020-02-05 17:42:17 +01:00
|
|
|
addFont :: FontSet -> (Name, DirectObject) -> T FontSet
|
|
|
|
addFont output (name, Reference (IndirectObjCoordinates {objectId})) =
|
2019-11-29 11:51:35 +01:00
|
|
|
flip (Map.insert name) output <$> cache loadFont objectId
|
2020-02-05 17:42:17 +01:00
|
|
|
addFont output _ = return output
|
2019-11-29 11:51:35 +01:00
|
|
|
|
|
|
|
getObject :: ObjectId -> T Object
|
|
|
|
getObject objectId = do
|
|
|
|
content <- ask
|
|
|
|
return (objects content ! objectId)
|
|
|
|
|
|
|
|
key :: String -> Dictionary -> Either String DirectObject
|
|
|
|
key keyName dictionary =
|
|
|
|
maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary)
|
|
|
|
where
|
|
|
|
errorMessage =
|
|
|
|
printf "Key %s not found in dictionary %s" keyName (show dictionary)
|
|
|
|
|
|
|
|
target :: DirectObject -> Either String ObjectId
|
|
|
|
target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
|
|
|
|
target directObject = expected "reference" directObject
|
|
|
|
|
|
|
|
many :: DirectObject -> [DirectObject]
|
|
|
|
many (Array l) = l
|
|
|
|
many directObject = [directObject]
|
|
|
|
|
|
|
|
follow :: DirectObject -> T Object
|
|
|
|
follow directObject = target directObject //= getObject
|
|
|
|
|
|
|
|
dict :: Object -> Either String Dictionary
|
|
|
|
dict (Direct (Dictionary dictionary)) = Right dictionary
|
|
|
|
dict obj = expected "dictionary" obj
|
|
|
|
|
|
|
|
dictObject :: String -> Dictionary -> T Dictionary
|
|
|
|
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
|
|
|
|
|
|
|
|
pagesList :: T [ObjectId]
|
|
|
|
pagesList = do
|
|
|
|
root <- dictObject "Root" . trailer . docStructure =<< ask
|
|
|
|
pages <- dictObject "Pages" root
|
|
|
|
case Map.lookup (Name "Kids") pages of
|
|
|
|
Just (Array kids) -> return $ getReferences kids
|
|
|
|
_ -> return []
|
|
|
|
|
|
|
|
getReferences :: [DirectObject] -> [ObjectId]
|
|
|
|
getReferences objects = do
|
|
|
|
object <- objects
|
|
|
|
case object of
|
|
|
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
|
|
|
_ -> []
|
|
|
|
|
|
|
|
extractText :: Object -> T [ByteString]
|
|
|
|
extractText object = do
|
|
|
|
pageDict <- lift $ dict object
|
2020-02-05 17:42:17 +01:00
|
|
|
fonts <- loadFonts =<< getFontDictionary pageDict
|
2019-11-29 11:51:35 +01:00
|
|
|
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
2020-02-05 17:42:17 +01:00
|
|
|
concat <$> (objects //= (mapM $ loadContent fonts))
|
2019-11-29 11:51:35 +01:00
|
|
|
where
|
2020-02-05 17:42:17 +01:00
|
|
|
loadContent :: FontSet -> DirectObject -> T [ByteString]
|
|
|
|
loadContent fonts directObject =
|
|
|
|
follow directObject \\= stream \\= pageContents fonts
|
2019-11-29 11:51:35 +01:00
|
|
|
|
|
|
|
loadPage :: ObjectId -> T Page
|
|
|
|
loadPage source =
|
|
|
|
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
|
|
|
|
|
|
|
|
getAll :: Content -> Either String (Map Int Page)
|
|
|
|
getAll content = fst <$> evalRWST getPages content Map.empty
|
|
|
|
where
|
|
|
|
numbered = Map.fromList . zip [1..]
|
|
|
|
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
|
|
|
|
|
|
|
get :: Content -> Int -> Either String Page
|
|
|
|
get content pageNumber
|
|
|
|
| pageNumber < 1 = Left "Pages start at 1"
|
|
|
|
| otherwise = fst <$> evalRWST getPage content Map.empty
|
|
|
|
where
|
|
|
|
firstPage [] = lift $ Left "Page is out of bounds"
|
|
|
|
firstPage (p:_) = loadPage p
|
|
|
|
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|