Adding a module to implement text reading and a demo program to go with it
This commit is contained in:
parent
d6994f0813
commit
6f3c159ea7
4 changed files with 165 additions and 0 deletions
|
@ -20,6 +20,7 @@ library
|
||||||
, PDF.EOL
|
, PDF.EOL
|
||||||
, PDF.Object
|
, PDF.Object
|
||||||
, PDF.Output
|
, PDF.Output
|
||||||
|
, PDF.Text
|
||||||
, PDF.Update
|
, PDF.Update
|
||||||
other-modules: Data.ByteString.Char8.Util
|
other-modules: Data.ByteString.Char8.Util
|
||||||
, PDF.Body
|
, PDF.Body
|
||||||
|
@ -51,3 +52,14 @@ executable getObj
|
||||||
, zlib
|
, zlib
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
||||||
|
executable getText
|
||||||
|
main-is: examples/getText.hs
|
||||||
|
build-depends: base
|
||||||
|
, bytestring
|
||||||
|
, containers
|
||||||
|
, Hufflepdf
|
||||||
|
, mtl
|
||||||
|
, zlib
|
||||||
|
ghc-options: -Wall
|
||||||
|
default-language: Haskell2010
|
||||||
|
|
124
examples/getText.hs
Executable file
124
examples/getText.hs
Executable file
|
@ -0,0 +1,124 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
|
||||||
|
import Codec.Compression.Zlib (decompress)
|
||||||
|
import Control.Monad.Reader (ask)
|
||||||
|
import Control.Monad.RWS (RWST(..), evalRWST)
|
||||||
|
import Control.Monad.Writer (tell)
|
||||||
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
|
import qualified Data.ByteString.Char8 as BS (readFile)
|
||||||
|
import qualified Data.ByteString.Lazy.Char8 as Lazy (concat, fromStrict, pack, putStrLn, toStrict)
|
||||||
|
import Data.Map ((!))
|
||||||
|
import qualified Data.Map as Map (empty, lookup)
|
||||||
|
import PDF (Document(..), parseDocument)
|
||||||
|
import qualified PDF.EOL as EOL (Style)
|
||||||
|
import PDF.Object (
|
||||||
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
|
, Object(..), Name(..), Structure(..)
|
||||||
|
,)
|
||||||
|
import PDF.Output (ObjectId(..))
|
||||||
|
import qualified PDF.Output as Output (render)
|
||||||
|
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
|
||||||
|
import PDF.Update (unify)
|
||||||
|
import System.Environment (getArgs)
|
||||||
|
import System.IO (hPutStrLn, stderr)
|
||||||
|
|
||||||
|
--type T a = ReaderT Content [] a
|
||||||
|
type T a = RWST Content [ByteString] CMappers [] a
|
||||||
|
|
||||||
|
list :: [a] -> T a
|
||||||
|
list l = RWST (\_ s -> fillContext s <$> l)
|
||||||
|
where
|
||||||
|
fillContext s a = (a, s, [])
|
||||||
|
|
||||||
|
extractText :: Object -> T ()
|
||||||
|
extractText object = do
|
||||||
|
pageDict <- dict object
|
||||||
|
pageContents <- follow =<< get "Contents" pageDict
|
||||||
|
case pageContents of
|
||||||
|
(Stream {header, streamContent}) -> do
|
||||||
|
font <- getFont pageDict
|
||||||
|
storeDecodedText font . clear header $ Lazy.fromStrict streamContent
|
||||||
|
_ -> return ()
|
||||||
|
|
||||||
|
clear :: Dictionary -> ByteString -> ByteString
|
||||||
|
clear header streamContent =
|
||||||
|
case Map.lookup (Name "Filter") header of
|
||||||
|
Just (NameObject (Name "FlateDecode")) -> decompress streamContent
|
||||||
|
_ -> streamContent
|
||||||
|
|
||||||
|
storeDecodedText :: Dictionary -> ByteString -> T ()
|
||||||
|
storeDecodedText font page =
|
||||||
|
tell . chunks =<< pageContents font page
|
||||||
|
|
||||||
|
getFont :: Dictionary -> T Dictionary
|
||||||
|
getFont pageDict =
|
||||||
|
get "Resources" pageDict
|
||||||
|
>>= dict . Direct
|
||||||
|
>>= get "Font"
|
||||||
|
>>= follow
|
||||||
|
>>= dict
|
||||||
|
|
||||||
|
getObject :: ObjectId -> T Object
|
||||||
|
getObject objectId = do
|
||||||
|
content <- ask
|
||||||
|
return (objects content ! objectId)
|
||||||
|
|
||||||
|
get :: String -> Dictionary -> T DirectObject
|
||||||
|
get key dictionary =
|
||||||
|
case Map.lookup (Name key) dictionary of
|
||||||
|
Just obj -> return obj
|
||||||
|
_ -> list []
|
||||||
|
|
||||||
|
follow :: DirectObject -> T Object
|
||||||
|
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
|
||||||
|
follow _ = list []
|
||||||
|
{-
|
||||||
|
obj <- getObject objectId
|
||||||
|
case obj of
|
||||||
|
Direct directObj -> return directObj
|
||||||
|
_ -> list []
|
||||||
|
-}
|
||||||
|
|
||||||
|
{-
|
||||||
|
key dictionary =
|
||||||
|
case Map.lookup (Name key) dictionary of
|
||||||
|
Just (Reference (IndirectObjCoordinates {objectId})) -> getObject objectId
|
||||||
|
_ -> list []
|
||||||
|
-}
|
||||||
|
|
||||||
|
dict :: Object -> T Dictionary
|
||||||
|
dict (Direct (Dictionary dictionary)) = return dictionary
|
||||||
|
dict _ = list []
|
||||||
|
|
||||||
|
pagesList :: T ObjectId
|
||||||
|
pagesList = do
|
||||||
|
root <- dict =<< follow =<< get "Root" . trailer . docStructure =<< ask
|
||||||
|
pages <- dict =<< follow =<< get "Pages" root
|
||||||
|
case Map.lookup (Name "Kids") pages of
|
||||||
|
Just (Array kids) -> list $ filterObjectIds kids
|
||||||
|
_ -> list []
|
||||||
|
|
||||||
|
filterObjectIds :: [DirectObject] -> [ObjectId]
|
||||||
|
filterObjectIds objects = do
|
||||||
|
object <- objects
|
||||||
|
case object of
|
||||||
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||||
|
_ -> []
|
||||||
|
|
||||||
|
listTextObjects :: Document -> [ByteString]
|
||||||
|
listTextObjects (Document {updates}) =
|
||||||
|
snd =<< evalRWST rwsMain (unify updates) Map.empty
|
||||||
|
where
|
||||||
|
rwsMain =
|
||||||
|
--Lazy.pack . show <$> (getObject =<< pagesList)
|
||||||
|
pagesList >>= getObject >>= extractText
|
||||||
|
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
[inputFile] <- getArgs
|
||||||
|
result <- parseDocument <$> BS.readFile inputFile
|
||||||
|
case result of
|
||||||
|
Left parseError -> hPutStrLn stderr $ show parseError
|
||||||
|
Right doc -> mapM_ Lazy.putStrLn $ listTextObjects doc
|
|
@ -3,6 +3,7 @@
|
||||||
{-# LANGUAGE FlexibleInstances #-}
|
{-# LANGUAGE FlexibleInstances #-}
|
||||||
module PDF.Object (
|
module PDF.Object (
|
||||||
Content(..)
|
Content(..)
|
||||||
|
, Dictionary
|
||||||
, DirectObject(..)
|
, DirectObject(..)
|
||||||
, Flow(..)
|
, Flow(..)
|
||||||
, IndexedObjects
|
, IndexedObjects
|
||||||
|
|
28
src/PDF/Text.hs
Normal file
28
src/PDF/Text.hs
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
{-# LANGUAGE FlexibleContexts #-}
|
||||||
|
module PDF.Text (
|
||||||
|
CMap
|
||||||
|
, CMappers
|
||||||
|
, PageContents(..)
|
||||||
|
, cMap
|
||||||
|
, pageContents
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Monad.State (MonadState)
|
||||||
|
import Data.ByteString.Lazy.Char8 (ByteString)
|
||||||
|
import Data.Map (Map)
|
||||||
|
import PDF.Object (Dictionary)
|
||||||
|
import PDF.Output (ObjectId)
|
||||||
|
import PDF.Parser (Parser)
|
||||||
|
|
||||||
|
type CMappers = Map ObjectId CMap
|
||||||
|
type CMap = Map Int ByteString
|
||||||
|
|
||||||
|
cMap :: ByteString -> CMap
|
||||||
|
cMap = undefined
|
||||||
|
|
||||||
|
data PageContents = PageContents {
|
||||||
|
chunks :: [ByteString]
|
||||||
|
}
|
||||||
|
|
||||||
|
pageContents :: MonadState CMappers m => Dictionary -> ByteString -> m PageContents
|
||||||
|
pageContents font page = undefined
|
Loading…
Reference in a new issue