Adding a module to implement text reading and a demo program to go with it

This commit is contained in:
Tissevert 2019-09-23 18:00:47 +02:00
parent d6994f0813
commit 6f3c159ea7
4 changed files with 165 additions and 0 deletions

View File

@ -20,6 +20,7 @@ library
, PDF.EOL
, PDF.Object
, PDF.Output
, PDF.Text
, PDF.Update
other-modules: Data.ByteString.Char8.Util
, PDF.Body
@ -51,3 +52,14 @@ executable getObj
, zlib
ghc-options: -Wall
default-language: Haskell2010
executable getText
main-is: examples/getText.hs
build-depends: base
, bytestring
, containers
, Hufflepdf
, mtl
, zlib
ghc-options: -Wall
default-language: Haskell2010

124
examples/getText.hs Executable file
View File

@ -0,0 +1,124 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
import Codec.Compression.Zlib (decompress)
import Control.Monad.Reader (ask)
import Control.Monad.RWS (RWST(..), evalRWST)
import Control.Monad.Writer (tell)
import Data.ByteString.Lazy.Char8 (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict)
import Data.Map ((!))
import qualified Data.Map as Map (empty, lookup)
import PDF (Document(..), parseDocument)
import qualified PDF.EOL as EOL (Style)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId(..))
import qualified PDF.Output as Output (render)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
--type T a = ReaderT Content [] a
type T a = RWST Content [ByteString] CMappers [] a
list :: [a] -> T a
list l = RWST (\_ s -> fillContext s <$> l)
where
fillContext s a = (a, s, [])
extractText :: Object -> T ()
extractText object = do
pageDict <- dict object
pageContents <- follow =<< get "Contents" pageDict
case pageContents of
(Stream {header, streamContent}) -> do
font <- getFont pageDict
storeDecodedText font . clear header $ Lazy.fromStrict streamContent
_ -> return ()
clear :: Dictionary -> ByteString -> ByteString
clear header streamContent =
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) -> decompress streamContent
_ -> streamContent
storeDecodedText :: Dictionary -> ByteString -> T ()
storeDecodedText font page =
tell . chunks =<< pageContents font page
getFont :: Dictionary -> T Dictionary
getFont pageDict =
get "Resources" pageDict
>>= dict . Direct
>>= get "Font"
>>= follow
>>= dict
getObject :: ObjectId -> T Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
get :: String -> Dictionary -> T DirectObject
get key dictionary =
case Map.lookup (Name key) dictionary of
Just obj -> return obj
_ -> list []
follow :: DirectObject -> T Object
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
follow _ = list []
{-
obj <- getObject objectId
case obj of
Direct directObj -> return directObj
_ -> list []
-}
{-
key dictionary =
case Map.lookup (Name key) dictionary of
Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId
_ -> list []
-}
dict :: Object -> T Dictionary
dict (Direct (Dictionary dictionary)) = return dictionary
dict _ = list []
pagesList :: T ObjectId
pagesList = do
root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask
pages <- dict =<< follow =<< get "Pages" root
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> list $ filterObjectIds kids
_ -> list []
filterObjectIds :: [DirectObject] -> [ObjectId]
filterObjectIds objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
listTextObjects :: Document -> [ByteString]
listTextObjects (Document {updates}) =
snd =<< evalRWST rwsMain (unify updates) Map.empty
where
rwsMain =
--Lazy.pack . show <$> (getObject =<< pagesList)
pagesList >>= getObject >>= extractText
main :: IO ()
main = do
[inputFile] <- getArgs
result <- parseDocument <$> BS.readFile inputFile
case result of
Left parseError -> hPutStrLn stderr $ show parseError
Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc

View File

@ -3,6 +3,7 @@
{-# LANGUAGE FlexibleInstances #-}
module PDF.Object (
Content(..)
, Dictionary
, DirectObject(..)
, Flow(..)
, IndexedObjects

28
src/PDF/Text.hs Normal file
View File

@ -0,0 +1,28 @@
{-# LANGUAGE FlexibleContexts #-}
module PDF.Text (
CMap
, CMappers
, PageContents(..)
, cMap
, pageContents
) where
import Control.Monad.State (MonadState)
import Data.ByteString.Lazy.Char8 (ByteString)
import Data.Map (Map)
import PDF.Object (Dictionary)
import PDF.Output (ObjectId)
import PDF.Parser (Parser)
type CMappers = Map ObjectId CMap
type CMap = Map Int ByteString
cMap :: ByteString -> CMap
cMap = undefined
data PageContents = PageContents {
chunks :: [ByteString]
}
pageContents :: MonadState CMappers m => Dictionary -> ByteString -> m PageContents
pageContents font page = undefined