125 lines
3.6 KiB
Haskell
125 lines
3.6 KiB
Haskell
|
{-# LANGUAGE NamedFieldPuns #-}
|
||
|
{-# LANGUAGE OverloadedStrings #-}
|
||
|
|
||
|
import Codec.Compression.Zlib (decompress)
|
||
|
import Control.Monad.Reader (ask)
|
||
|
import Control.Monad.RWS (RWST(..), evalRWST)
|
||
|
import Control.Monad.Writer (tell)
|
||
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||
|
import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict)
|
||
|
import Data.Map ((!))
|
||
|
import qualified Data.Map as Map (empty, lookup)
|
||
|
import PDF (Document(..), parseDocument)
|
||
|
import qualified PDF.EOL as EOL (Style)
|
||
|
import PDF.Object (
|
||
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||
|
, Object(..), Name(..), Structure(..)
|
||
|
,)
|
||
|
import PDF.Output (ObjectId(..))
|
||
|
import qualified PDF.Output as Output (render)
|
||
|
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
|
||
|
import PDF.Update (unify)
|
||
|
import System.Environment (getArgs)
|
||
|
import System.IO (hPutStrLn, stderr)
|
||
|
|
||
|
--type T a = ReaderT Content [] a
|
||
|
type T a = RWST Content [ByteString] CMappers [] a
|
||
|
|
||
|
list :: [a] -> T a
|
||
|
list l = RWST (\_ s -> fillContext s <$> l)
|
||
|
where
|
||
|
fillContext s a = (a, s, [])
|
||
|
|
||
|
extractText :: Object -> T ()
|
||
|
extractText object = do
|
||
|
pageDict <- dict object
|
||
|
pageContents <- follow =<< get "Contents" pageDict
|
||
|
case pageContents of
|
||
|
(Stream {header, streamContent}) -> do
|
||
|
font <- getFont pageDict
|
||
|
storeDecodedText font . clear header $ Lazy.fromStrict streamContent
|
||
|
_ -> return ()
|
||
|
|
||
|
clear :: Dictionary -> ByteString -> ByteString
|
||
|
clear header streamContent =
|
||
|
case Map.lookup (Name "Filter") header of
|
||
|
Just (NameObject (Name "FlateDecode")) -> decompress streamContent
|
||
|
_ -> streamContent
|
||
|
|
||
|
storeDecodedText :: Dictionary -> ByteString -> T ()
|
||
|
storeDecodedText font page =
|
||
|
tell . chunks =<< pageContents font page
|
||
|
|
||
|
getFont :: Dictionary -> T Dictionary
|
||
|
getFont pageDict =
|
||
|
get "Resources" pageDict
|
||
|
>>= dict . Direct
|
||
|
>>= get "Font"
|
||
|
>>= follow
|
||
|
>>= dict
|
||
|
|
||
|
getObject :: ObjectId -> T Object
|
||
|
getObject objectId = do
|
||
|
content <- ask
|
||
|
return (objects content ! objectId)
|
||
|
|
||
|
get :: String -> Dictionary -> T DirectObject
|
||
|
get key dictionary =
|
||
|
case Map.lookup (Name key) dictionary of
|
||
|
Just obj -> return obj
|
||
|
_ -> list []
|
||
|
|
||
|
follow :: DirectObject -> T Object
|
||
|
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
|
||
|
follow _ = list []
|
||
|
{-
|
||
|
obj <- getObject objectId
|
||
|
case obj of
|
||
|
Direct directObj -> return directObj
|
||
|
_ -> list []
|
||
|
-}
|
||
|
|
||
|
{-
|
||
|
key dictionary =
|
||
|
case Map.lookup (Name key) dictionary of
|
||
|
Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId
|
||
|
_ -> list []
|
||
|
-}
|
||
|
|
||
|
dict :: Object -> T Dictionary
|
||
|
dict (Direct (Dictionary dictionary)) = return dictionary
|
||
|
dict _ = list []
|
||
|
|
||
|
pagesList :: T ObjectId
|
||
|
pagesList = do
|
||
|
root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask
|
||
|
pages <- dict =<< follow =<< get "Pages" root
|
||
|
case Map.lookup (Name "Kids") pages of
|
||
|
Just (Array kids) -> list $ filterObjectIds kids
|
||
|
_ -> list []
|
||
|
|
||
|
filterObjectIds :: [DirectObject] -> [ObjectId]
|
||
|
filterObjectIds objects = do
|
||
|
object <- objects
|
||
|
case object of
|
||
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||
|
_ -> []
|
||
|
|
||
|
listTextObjects :: Document -> [ByteString]
|
||
|
listTextObjects (Document {updates}) =
|
||
|
snd =<< evalRWST rwsMain (unify updates) Map.empty
|
||
|
where
|
||
|
rwsMain =
|
||
|
--Lazy.pack . show <$> (getObject =<< pagesList)
|
||
|
pagesList >>= getObject >>= extractText
|
||
|
|
||
|
|
||
|
main :: IO ()
|
||
|
main = do
|
||
|
[inputFile] <- getArgs
|
||
|
result <- parseDocument <$> BS.readFile inputFile
|
||
|
case result of
|
||
|
Left parseError -> hPutStrLn stderr $ show parseError
|
||
|
Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc
|