Hufflepdf/examples/getText.hs

125 lines
3.6 KiB
Haskell
Raw Normal View History

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
import Codec.Compression.Zlib (decompress)
import Control.Monad.Reader (ask)
import Control.Monad.RWS (RWST(..), evalRWST)
import Control.Monad.Writer (tell)
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict)
import Data.Map ((!))
import qualified Data.Map as Map (empty, lookup)
import PDF (Document(..), parseDocument)
import qualified PDF.EOL as EOL (Style)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId(..))
import qualified PDF.Output as Output (render)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
--type T a = ReaderT Content [] a
type T a = RWST Content [ByteString] CMappers [] a
list :: [a] -> T a
list l = RWST (\_ s -> fillContext s <$> l)
where
fillContext s a = (a, s, [])
extractText :: Object -> T ()
extractText object = do
pageDict <- dict object
pageContents <- follow =<< get "Contents" pageDict
case pageContents of
(Stream {header, streamContent}) -> do
font <- getFont pageDict
storeDecodedText font . clear header $ Lazy.fromStrict streamContent
_ -> return ()
clear :: Dictionary -> ByteString -> ByteString
clear header streamContent =
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) -> decompress streamContent
_ -> streamContent
storeDecodedText :: Dictionary -> ByteString -> T ()
storeDecodedText font page =
tell . chunks =<< pageContents font page
getFont :: Dictionary -> T Dictionary
getFont pageDict =
get "Resources" pageDict
>>= dict . Direct
>>= get "Font"
>>= follow
>>= dict
getObject :: ObjectId -> T Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
get :: String -> Dictionary -> T DirectObject
get key dictionary =
case Map.lookup (Name key) dictionary of
Just obj -> return obj
_ -> list []
follow :: DirectObject -> T Object
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
follow _ = list []
{-
obj <- getObject objectId
case obj of
Direct directObj -> return directObj
_ -> list []
-}
{-
key dictionary =
case Map.lookup (Name key) dictionary of
Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId
_ -> list []
-}
dict :: Object -> T Dictionary
dict (Direct (Dictionary dictionary)) = return dictionary
dict _ = list []
pagesList :: T ObjectId
pagesList = do
root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask
pages <- dict =<< follow =<< get "Pages" root
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> list $ filterObjectIds kids
_ -> list []
filterObjectIds :: [DirectObject] -> [ObjectId]
filterObjectIds objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
listTextObjects :: Document -> [ByteString]
listTextObjects (Document {updates}) =
snd =<< evalRWST rwsMain (unify updates) Map.empty
where
rwsMain =
--Lazy.pack . show <$> (getObject =<< pagesList)
pagesList >>= getObject >>= extractText
main :: IO ()
main = do
[inputFile] <- getArgs
result <- parseDocument <$> BS.readFile inputFile
case result of
Left parseError -> hPutStrLn stderr $ show parseError
Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc