Hufflepdf/examples/getText.hs

138 lines
4.1 KiB
Haskell
Executable File

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
import Codec.Compression.Zlib (decompress)
import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
import Control.Monad.Writer (tell)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile, putStrLn)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, insert, lookup, toList)
import PDF (Document(..), parseDocument)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
type CachedCMaps = Map ObjectId CMap
type T a = RWST Content [ByteString] CachedCMaps [] a
list :: [a] -> T a
list l = RWST (\_ s -> fillContext s <$> l)
where
fillContext s a = (a, s, [])
extractText :: Object -> T ()
extractText object = do
pageDict <- dict object
contents <- follow =<< key "Contents" pageDict
case contents of
(Stream {header, streamContent}) -> do
font <- getFont pageDict
cMappers <- loadCMappers font
storeDecodedText cMappers $ clear header streamContent
_ -> return ()
clear :: Dictionary -> ByteString -> ByteString
clear header streamContent =
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent
storeDecodedText :: CMappers -> ByteString -> T ()
storeDecodedText font page =
case pageContents font page of
Left _ -> return ()
Right (PageContents {chunks}) -> tell chunks
getFont :: Dictionary -> T Dictionary
getFont pageDict =
key "Resources" pageDict
>>= dict . Direct
>>= key "Font"
>>= follow
>>= dict
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
cache loader objectId = do
loaded <- get
case Map.lookup objectId loaded of
Just value -> return value
Nothing -> do
value <- loader objectId
modify (Map.insert objectId value) >> return value
loadFont :: ObjectId -> T CMap
loadFont objectId =
getObject objectId
>>= dict
>>= key "ToUnicode"
>>= follow
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
where
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do
flip (Map.insert name) output <$> cache loadFont objectId
loadCMapper output _ = return output
getObject :: ObjectId -> T Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
key :: String -> Dictionary -> T DirectObject
key keyName dictionary =
case Map.lookup (Name keyName) dictionary of
Just obj -> return obj
_ -> list []
follow :: DirectObject -> T Object
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
follow _ = list []
dict :: Object -> T Dictionary
dict (Direct (Dictionary dictionary)) = return dictionary
dict _ = list []
pagesList :: T ObjectId
pagesList = do
root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask
pages <- dict =<< follow =<< key "Pages" root
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> list $ filterObjectIds kids
_ -> list []
filterObjectIds :: [DirectObject] -> [ObjectId]
filterObjectIds objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
listTextObjects :: Document -> [ByteString]
listTextObjects (Document {updates}) =
snd =<< evalRWST rwsMain (unify updates) Map.empty
where
rwsMain =
--Lazy.pack . show <$> (getObject =<< pagesList)
pagesList >>= getObject >>= extractText
main :: IO ()
main = do
[inputFile] <- getArgs
result <- parseDocument <$> BS.readFile inputFile
case result of
Left parseError -> hPutStrLn stderr $ show parseError
Right doc -> mapM_ BS.putStrLn $ listTextObjects doc