2019-09-23 18:00:47 +02:00
|
|
|
{-# LANGUAGE NamedFieldPuns #-}
|
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
|
|
|
|
import Codec.Compression.Zlib (decompress)
|
2019-09-24 18:38:12 +02:00
|
|
|
import Control.Monad (foldM)
|
|
|
|
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
|
2019-09-23 18:00:47 +02:00
|
|
|
import Control.Monad.Writer (tell)
|
2019-09-24 18:38:12 +02:00
|
|
|
import Data.ByteString (ByteString)
|
|
|
|
import qualified Data.ByteString.Char8 as BS (readFile, putStrLn)
|
|
|
|
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict)
|
|
|
|
import Data.Map (Map, (!))
|
|
|
|
import qualified Data.Map as Map (empty, insert, lookup, toList)
|
2019-09-23 18:00:47 +02:00
|
|
|
import PDF (Document(..), parseDocument)
|
|
|
|
import PDF.Object (
|
|
|
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
|
|
|
, Object(..), Name(..), Structure(..)
|
|
|
|
,)
|
2019-09-24 18:38:12 +02:00
|
|
|
import PDF.Output (ObjectId)
|
2019-09-25 18:42:34 +02:00
|
|
|
import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents)
|
2019-09-23 18:00:47 +02:00
|
|
|
import PDF.Update (unify)
|
|
|
|
import System.Environment (getArgs)
|
|
|
|
import System.IO (hPutStrLn, stderr)
|
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
-- | The 'CMap's loaded so far, indexed by the 'ObjectId' they were loaded for.
-- This is the state carried around by 'T' and maintained by 'cache'.
type CachedCMaps = Map ObjectId CMap
|
|
|
|
-- | The monad used throughout this file: an 'RWST' stacked on the list monad,
-- with the 'Content' as read-only environment, a list of 'ByteString' as
-- accumulated output and the 'CachedCMaps' as state.
type T a = RWST Content [ByteString] CachedCMaps [] a
|
2019-09-23 18:00:47 +02:00
|
|
|
|
|
|
|
-- | Lift a plain list of alternatives into 'T'.
--
-- Each element becomes one branch of the computation; the state is handed
-- over unchanged and nothing is written to the output. As a consequence,
-- @list []@ abandons the current branch.
list :: [a] -> T a
list alternatives = RWST $ \_ cMaps ->
  [(alternative, cMaps, []) | alternative <- alternatives]
|
|
|
|
|
|
|
|
-- | Write the text chunks of one page object to the output.
--
-- The page's fonts are turned into 'CMappers' first, then the stream behind
-- the page's "Contents" entry is decoded with them. When 'pageContents'
-- reports an error the page contributes nothing to the output.
extractText :: Object -> T ()
extractText page = do
  pageDict <- dict page
  fonts <- getFont pageDict
  cMappers <- loadCMappers fonts
  contentsRef <- key "Contents" pageDict
  contents <- follow contentsRef >>= stream
  case pageContents cMappers contents of
    Left _ -> return ()
    Right parsed -> tell (chunks parsed)
|
2019-09-23 18:00:47 +02:00
|
|
|
|
2019-09-25 18:42:34 +02:00
|
|
|
-- | Get the payload of a stream object.
--
-- When the stream header's "Filter" entry is exactly the name "FlateDecode"
-- the payload is run through 'decompress'; with any other filter value, or
-- none at all, it is returned as stored. Objects that are not streams abandon
-- the current branch.
stream :: Object -> T ByteString
stream (Stream {header, streamContent}) = return payload
  where
    payload
      | deflated = Lazy.toStrict . decompress $ Lazy.fromStrict streamContent
      | otherwise = streamContent
    deflated =
      case Map.lookup (Name "Filter") header of
        Just (NameObject (Name "FlateDecode")) -> True
        _ -> False
stream _ = list []
|
2019-09-23 18:00:47 +02:00
|
|
|
|
|
|
|
-- | Find the font dictionary of a page: the "Font" entry of the page's
-- "Resources" dictionary.
--
-- Both "Resources" and "Font" are accepted either as an inline dictionary or
-- as an indirect reference to one. Earlier, "Resources" had to be inline and
-- "Font" had to be a reference, so pages storing them the other way round
-- yielded no fonts. The current branch is abandoned when either entry is
-- missing or does not resolve to a dictionary.
getFont :: Dictionary -> T Dictionary
getFont pageDict =
  key "Resources" pageDict
    >>= resolve
    >>= dict
    >>= key "Font"
    >>= resolve
    >>= dict
  where
    -- Follow a reference; wrap anything else as an already-resolved object.
    resolve :: DirectObject -> T Object
    resolve reference@(Reference _) = follow reference
    resolve direct = return (Direct direct)
|
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
-- | Memoise a 'CMap' loader in the 'CachedCMaps' state.
--
-- If a 'CMap' is already stored for the given 'ObjectId' it is returned
-- without calling the loader; otherwise the loader is run and its result is
-- stored under that 'ObjectId' before being returned.
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
cache loader objectId =
  maybe loadAndStore return . Map.lookup objectId =<< get
  where
    loadAndStore = do
      value <- loader objectId
      modify (Map.insert objectId value)
      return value
|
|
|
|
|
|
|
|
-- | Load the 'CMap' of the font object with the given 'ObjectId'.
--
-- The font's "ToUnicode" entry is followed to a stream whose payload is
-- handed to 'cMap'. If 'cMap' reports an error, 'emptyCMap' is used instead.
-- A missing or malformed object, key, reference or stream abandons the
-- current branch.
loadFont :: ObjectId -> T CMap
loadFont objectId = do
  fontDict <- getObject objectId >>= dict
  toUnicode <- key "ToUnicode" fontDict >>= follow >>= stream
  return $ case cMap toUnicode of
    Left _ -> emptyCMap
    Right parsed -> parsed
|
2019-09-24 18:38:12 +02:00
|
|
|
|
|
|
|
-- | Build the 'CMappers' for a font dictionary.
--
-- Each entry whose value is an indirect reference gets the 'CMap' of the
-- referenced font, loaded through 'cache' so that every font object is
-- loaded at most once. Entries with any other kind of value are skipped.
loadCMappers :: Dictionary -> T CMappers
loadCMappers fonts = foldM addFont Map.empty (Map.toList fonts)
  where
    addFont :: CMappers -> (Name, DirectObject) -> T CMappers
    addFont cMappers (name, Reference (IndirectObjCoordinates {objectId})) = do
      fontCMap <- cache loadFont objectId
      return (Map.insert name fontCMap cMappers)
    addFont cMappers _ = return cMappers
|
|
|
|
|
2019-09-23 18:00:47 +02:00
|
|
|
-- | Look up an object of the document by its 'ObjectId'.
--
-- An 'ObjectId' that is absent from the document's objects abandons the
-- current branch, the same way 'key', 'follow' and 'dict' report failure.
-- The earlier partial 'Data.Map.!' crashed the whole program instead.
getObject :: ObjectId -> T Object
getObject objectId = do
  content <- ask
  maybe (list []) return $ Map.lookup objectId (objects content)
|
|
|
|
|
2019-09-24 18:38:12 +02:00
|
|
|
-- | Fetch the value stored under the given key name in a dictionary.
-- A missing key abandons the current branch.
key :: String -> Dictionary -> T DirectObject
key keyName dictionary =
  maybe (list []) return $ Map.lookup (Name keyName) dictionary
|
|
|
|
|
|
|
|
-- | Resolve an indirect reference to the object it points to.
-- Any value that is not a reference abandons the current branch.
follow :: DirectObject -> T Object
follow directObject =
  case directObject of
    Reference (IndirectObjCoordinates {objectId}) -> getObject objectId
    _ -> list []
|
|
|
|
|
|
|
|
-- | View an object as a dictionary.
-- Any object that is not a direct dictionary abandons the current branch.
dict :: Object -> T Dictionary
dict object =
  case object of
    Direct (Dictionary dictionary) -> return dictionary
    _ -> list []
|
|
|
|
|
|
|
|
-- | Branch over the 'ObjectId's of the document's pages.
--
-- Starting from the trailer, "Root" and then "Pages" are followed, and one
-- branch is produced per reference found in the "Kids" array of that
-- dictionary. Only this single "Kids" array is read. A missing or non-array
-- "Kids" entry produces no branch at all.
pagesList :: T ObjectId
pagesList = do
  content <- ask
  rootRef <- key "Root" (trailer (docStructure content))
  root <- follow rootRef >>= dict
  pages <- key "Pages" root >>= follow >>= dict
  list $ case Map.lookup (Name "Kids") pages of
    Just (Array kids) -> filterObjectIds kids
    _ -> []
|
|
|
|
|
|
|
|
-- | Keep the 'ObjectId's of the indirect references in a list of values, in
-- their original order; every other kind of value is dropped.
filterObjectIds :: [DirectObject] -> [ObjectId]
filterObjectIds directObjects =
  [objectId | Reference (IndirectObjCoordinates {objectId}) <- directObjects]
|
|
|
|
|
|
|
|
-- | Collect the text chunks of every page of a document.
--
-- The document's updates are merged with 'unify' and used as the environment
-- of a 'T' computation that starts with an empty CMap cache. The outputs of
-- all branches are concatenated in branch order.
listTextObjects :: Document -> [ByteString]
listTextObjects (Document {updates}) = concatMap snd branches
  where
    branches = evalRWST extractAllPages (unify updates) Map.empty
    -- Leftover alternative kept from the original (presumably for debugging):
    --Lazy.pack . show <$> (getObject =<< pagesList)
    extractAllPages = do
      pageId <- pagesList
      page <- getObject pageId
      extractText page
|
|
|
|
|
|
|
|
|
|
|
|
-- | Entry point: print the text chunks of the PDF file named by the single
-- command-line argument, one chunk per line on standard output.
--
-- A parse error is printed on standard error. Any other number of arguments
-- raises an 'IOError' carrying a usage message, replacing the opaque
-- "Pattern match failure" of the earlier partial pattern on 'getArgs'; the
-- program still terminates with a failure status in that case.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [inputFile] -> do
      result <- parseDocument <$> BS.readFile inputFile
      case result of
        Left parseError -> hPutStrLn stderr $ show parseError
        Right doc -> mapM_ BS.putStrLn $ listTextObjects doc
    _ -> ioError $ userError "Usage: expected exactly one argument, the input PDF file"
|