Adding a module to implement text reading and a demo program to go with it
This commit is contained in:
parent
d6994f0813
commit
6f3c159ea7
4 changed files with 165 additions and 0 deletions
|
@ -20,6 +20,7 @@ library
|
|||
, PDF.EOL
|
||||
, PDF.Object
|
||||
, PDF.Output
|
||||
, PDF.Text
|
||||
, PDF.Update
|
||||
other-modules: Data.ByteString.Char8.Util
|
||||
, PDF.Body
|
||||
|
@ -51,3 +52,14 @@ executable getObj
|
|||
, zlib
|
||||
ghc-options: -Wall
|
||||
default-language: Haskell2010
|
||||
|
||||
executable getText
|
||||
main-is: examples/getText.hs
|
||||
build-depends: base
|
||||
, bytestring
|
||||
, containers
|
||||
, Hufflepdf
|
||||
, mtl
|
||||
, zlib
|
||||
ghc-options: -Wall
|
||||
default-language: Haskell2010
|
||||
|
|
124
examples/getText.hs
Executable file
124
examples/getText.hs
Executable file
|
@ -0,0 +1,124 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
import Codec.Compression.Zlib (decompress)
|
||||
import Control.Monad.Reader (ask)
|
||||
import Control.Monad.RWS (RWST(..), evalRWST)
|
||||
import Control.Monad.Writer (tell)
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||
import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict)
|
||||
import Data.Map ((!))
|
||||
import qualified Data.Map as Map (empty, lookup)
|
||||
import PDF (Document(..), parseDocument)
|
||||
import qualified PDF.EOL as EOL (Style)
|
||||
import PDF.Object (
|
||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Object(..), Name(..), Structure(..)
|
||||
,)
|
||||
import PDF.Output (ObjectId(..))
|
||||
import qualified PDF.Output as Output (render)
|
||||
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
|
||||
import PDF.Update (unify)
|
||||
import System.Environment (getArgs)
|
||||
import System.IO (hPutStrLn, stderr)
|
||||
|
||||
--type T a = ReaderT Content [] a
|
||||
type T a = RWST Content [ByteString] CMappers [] a
|
||||
|
||||
list :: [a] -> T a
|
||||
list l = RWST (\_ s -> fillContext s <$> l)
|
||||
where
|
||||
fillContext s a = (a, s, [])
|
||||
|
||||
extractText :: Object -> T ()
|
||||
extractText object = do
|
||||
pageDict <- dict object
|
||||
pageContents <- follow =<< get "Contents" pageDict
|
||||
case pageContents of
|
||||
(Stream {header, streamContent}) -> do
|
||||
font <- getFont pageDict
|
||||
storeDecodedText font . clear header $ Lazy.fromStrict streamContent
|
||||
_ -> return ()
|
||||
|
||||
clear :: Dictionary -> ByteString -> ByteString
|
||||
clear header streamContent =
|
||||
case Map.lookup (Name "Filter") header of
|
||||
Just (NameObject (Name "FlateDecode")) -> decompress streamContent
|
||||
_ -> streamContent
|
||||
|
||||
storeDecodedText :: Dictionary -> ByteString -> T ()
|
||||
storeDecodedText font page =
|
||||
tell . chunks =<< pageContents font page
|
||||
|
||||
getFont :: Dictionary -> T Dictionary
|
||||
getFont pageDict =
|
||||
get "Resources" pageDict
|
||||
>>= dict . Direct
|
||||
>>= get "Font"
|
||||
>>= follow
|
||||
>>= dict
|
||||
|
||||
getObject :: ObjectId -> T Object
|
||||
getObject objectId = do
|
||||
content <- ask
|
||||
return (objects content ! objectId)
|
||||
|
||||
get :: String -> Dictionary -> T DirectObject
|
||||
get key dictionary =
|
||||
case Map.lookup (Name key) dictionary of
|
||||
Just obj -> return obj
|
||||
_ -> list []
|
||||
|
||||
follow :: DirectObject -> T Object
|
||||
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
|
||||
follow _ = list []
|
||||
{-
|
||||
obj <- getObject objectId
|
||||
case obj of
|
||||
Direct directObj -> return directObj
|
||||
_ -> list []
|
||||
-}
|
||||
|
||||
{-
|
||||
key dictionary =
|
||||
case Map.lookup (Name key) dictionary of
|
||||
Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId
|
||||
_ -> list []
|
||||
-}
|
||||
|
||||
dict :: Object -> T Dictionary
|
||||
dict (Direct (Dictionary dictionary)) = return dictionary
|
||||
dict _ = list []
|
||||
|
||||
pagesList :: T ObjectId
|
||||
pagesList = do
|
||||
root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask
|
||||
pages <- dict =<< follow =<< get "Pages" root
|
||||
case Map.lookup (Name "Kids") pages of
|
||||
Just (Array kids) -> list $ filterObjectIds kids
|
||||
_ -> list []
|
||||
|
||||
filterObjectIds :: [DirectObject] -> [ObjectId]
|
||||
filterObjectIds objects = do
|
||||
object <- objects
|
||||
case object of
|
||||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||
_ -> []
|
||||
|
||||
listTextObjects :: Document -> [ByteString]
|
||||
listTextObjects (Document {updates}) =
|
||||
snd =<< evalRWST rwsMain (unify updates) Map.empty
|
||||
where
|
||||
rwsMain =
|
||||
--Lazy.pack . show <$> (getObject =<< pagesList)
|
||||
pagesList >>= getObject >>= extractText
|
||||
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
[inputFile] <- getArgs
|
||||
result <- parseDocument <$> BS.readFile inputFile
|
||||
case result of
|
||||
Left parseError -> hPutStrLn stderr $ show parseError
|
||||
Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc
|
|
@ -3,6 +3,7 @@
|
|||
{-# LANGUAGE FlexibleInstances #-}
|
||||
module PDF.Object (
|
||||
Content(..)
|
||||
, Dictionary
|
||||
, DirectObject(..)
|
||||
, Flow(..)
|
||||
, IndexedObjects
|
||||
|
|
28
src/PDF/Text.hs
Normal file
28
src/PDF/Text.hs
Normal file
|
@ -0,0 +1,28 @@
|
|||
{-# LANGUAGE FlexibleContexts #-}
|
||||
module PDF.Text (
|
||||
CMap
|
||||
, CMappers
|
||||
, PageContents(..)
|
||||
, cMap
|
||||
, pageContents
|
||||
) where
|
||||
|
||||
import Control.Monad.State (MonadState)
|
||||
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||
import Data.Map (Map)
|
||||
import PDF.Object (Dictionary)
|
||||
import PDF.Output (ObjectId)
|
||||
import PDF.Parser (Parser)
|
||||
|
||||
type CMappers = Map ObjectId CMap
|
||||
type CMap = Map Int ByteString
|
||||
|
||||
cMap :: ByteString -> CMap
|
||||
cMap = undefined
|
||||
|
||||
data PageContents = PageContents {
|
||||
chunks :: [ByteString]
|
||||
}
|
||||
|
||||
pageContents :: MonadState CMappers m => Dictionary -> ByteString -> m PageContents
|
||||
pageContents font page = undefined
|
Loading…
Reference in a new issue