Hufflepdf/examples/getText.hs

141 lines
4.3 KiB
Haskell
Executable file

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
import Codec.Compression.Zlib (decompress)
import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
import Control.Monad.Writer (tell)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, insert, lookup, toList)
import PDF (Document(..), parseDocument)
import PDF.CMap (CMap, CMappers, cMap, emptyCMap)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId(..))
import PDF.Text (PageContents(..), pageContents)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
import Text.Printf (printf)
type CachedCMaps = Map ObjectId CMap
type T a = RWST Content [ByteString] CachedCMaps [] a
list :: [a] -> T a
list l = RWST (\_ s -> fillContext s <$> l)
where
fillContext s a = (a, s, [])
handleError :: ObjectId -> a -> String -> T a
handleError objectId defaultValue s =
(tell . replicate 1 $ BS.pack message) >> return defaultValue
where
message = printf "Object #%d : %s" (getObjectId objectId) s
extractText :: Object -> T ()
extractText object = do
pageDict <- dict object
cMappers <- loadCMappers =<< getFont pageDict
contentsId <- target =<< key "Contents" pageDict
contents <- stream =<< getObject contentsId
either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents)
stream :: Object -> T ByteString
stream (Stream {header, streamContent}) = return $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent
stream _ = list []
getFont :: Dictionary -> T Dictionary
getFont pageDict =
key "Resources" pageDict
>>= dict . Direct
>>= key "Font"
>>= follow
>>= dict
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
cache loader objectId = do
loaded <- get
case Map.lookup objectId loaded of
Just value -> return value
Nothing -> do
value <- loader objectId
modify (Map.insert objectId value) >> return value
loadFont :: ObjectId -> T CMap
loadFont objectId =
getObject objectId
>>= dict
>>= key "ToUnicode"
>>= follow
>>= stream
>>= either (handleError objectId emptyCMap) return . cMap
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
where
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do
flip (Map.insert name) output <$> cache loadFont objectId
loadCMapper output _ = return output
getObject :: ObjectId -> T Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
key :: String -> Dictionary -> T DirectObject
key keyName dictionary =
case Map.lookup (Name keyName) dictionary of
Just obj -> return obj
_ -> list []
target :: DirectObject -> T ObjectId
target (Reference (IndirectObjCoordinates {objectId})) = return objectId
target _ = list []
follow :: DirectObject -> T Object
follow directObject = target directObject >>= getObject
dict :: Object -> T Dictionary
dict (Direct (Dictionary dictionary)) = return dictionary
dict _ = list []
pagesList :: T ObjectId
pagesList = do
root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask
pages <- dict =<< follow =<< key "Pages" root
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> list $ filterObjectIds kids
_ -> list []
filterObjectIds :: [DirectObject] -> [ObjectId]
filterObjectIds objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
listTextObjects :: Document -> [ByteString]
listTextObjects (Document {updates}) =
snd =<< evalRWST rwsMain (unify updates) Map.empty
where
rwsMain =
pagesList >>= getObject >>= extractText
main :: IO ()
main = do
[inputFile] <- getArgs
result <- parseDocument <$> BS.readFile inputFile
case result of
Left parseError -> hPutStrLn stderr $ show parseError
Right doc -> mapM_ BS.putStrLn $ listTextObjects doc