Take the dirty code of «getText» and turn it into a relatively clean module exposing pages, that can be retrieved all at once or by page number (numbered human-style, starting from 1)
This commit is contained in:
parent
08a9717b3a
commit
f9f799c59b
3 changed files with 201 additions and 132 deletions
|
@ -21,9 +21,10 @@ library
|
|||
, PDF.EOL
|
||||
, PDF.Object
|
||||
, PDF.Output
|
||||
, PDF.Pages
|
||||
, PDF.Parser
|
||||
, PDF.Text
|
||||
, PDF.Update
|
||||
, PDF.Parser
|
||||
other-modules: Data.ByteString.Char8.Util
|
||||
, PDF.Body
|
||||
-- other-extensions:
|
||||
|
@ -34,6 +35,7 @@ library
|
|||
, mtl
|
||||
, text
|
||||
, utf8-string
|
||||
, zlib
|
||||
hs-source-dirs: src
|
||||
ghc-options: -Wall
|
||||
default-language: Haskell2010
|
||||
|
|
|
@ -1,141 +1,37 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
import Codec.Compression.Zlib (decompress)
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
|
||||
import Control.Monad.Writer (tell)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile)
|
||||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, insert, lookup, toList)
|
||||
import qualified Data.ByteString.Char8 as BS (putStrLn, readFile)
|
||||
import qualified Data.Map as Map (toList)
|
||||
import PDF (Document(..), parseDocument)
|
||||
import PDF.CMap (CMap, CMappers, cMap, emptyCMap)
|
||||
import PDF.Object (
|
||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Object(..), Name(..), Structure(..)
|
||||
,)
|
||||
import PDF.Output (ObjectId(..))
|
||||
import PDF.Text (PageContents(..), pageContents)
|
||||
import PDF.Object (Content)
|
||||
import PDF.Pages (Page(..), get, getAll)
|
||||
import PDF.Update (unify)
|
||||
import System.Environment (getArgs)
|
||||
import System.IO (hPutStrLn, stderr)
|
||||
import Text.Printf (printf)
|
||||
import System.Exit (die)
|
||||
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||
|
||||
type CachedCMaps = Map ObjectId CMap
|
||||
type T a = RWST Content [ByteString] CachedCMaps [] a
|
||||
onDoc :: FilePath -> (Content -> Either String a) -> IO a
|
||||
onDoc inputFile f = do
|
||||
content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
|
||||
case content >>= f of
|
||||
Left someError -> die someError
|
||||
Right value -> return value
|
||||
|
||||
list :: [a] -> T a
|
||||
list l = RWST (\_ s -> fillContext s <$> l)
|
||||
where
|
||||
fillContext s a = (a, s, [])
|
||||
displayPage :: Page -> IO ()
|
||||
displayPage = mapM_ BS.putStrLn . contents
|
||||
|
||||
handleError :: ObjectId -> a -> String -> T a
|
||||
handleError objectId defaultValue s =
|
||||
(tell . replicate 1 $ BS.pack message) >> return defaultValue
|
||||
where
|
||||
message = printf "Object #%d : %s" (getObjectId objectId) s
|
||||
wholeDoc :: FilePath -> IO ()
|
||||
wholeDoc inputFile = do
|
||||
pages <- onDoc inputFile getAll
|
||||
mapM_ (displayPage . snd) $ Map.toList pages
|
||||
|
||||
extractText :: Object -> T ()
|
||||
extractText object = do
|
||||
pageDict <- dict object
|
||||
cMappers <- loadCMappers =<< getFont pageDict
|
||||
contentsId <- target =<< key "Contents" pageDict
|
||||
contents <- stream =<< getObject contentsId
|
||||
either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents)
|
||||
|
||||
stream :: Object -> T ByteString
|
||||
stream (Stream {header, streamContent}) = return $
|
||||
case Map.lookup (Name "Filter") header of
|
||||
Just (NameObject (Name "FlateDecode")) ->
|
||||
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
||||
_ -> streamContent
|
||||
stream _ = list []
|
||||
|
||||
getFont :: Dictionary -> T Dictionary
|
||||
getFont pageDict =
|
||||
key "Resources" pageDict
|
||||
>>= dict . Direct
|
||||
>>= key "Font"
|
||||
>>= follow
|
||||
>>= dict
|
||||
|
||||
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
|
||||
cache loader objectId = do
|
||||
loaded <- get
|
||||
case Map.lookup objectId loaded of
|
||||
Just value -> return value
|
||||
Nothing -> do
|
||||
value <- loader objectId
|
||||
modify (Map.insert objectId value) >> return value
|
||||
|
||||
loadFont :: ObjectId -> T CMap
|
||||
loadFont objectId =
|
||||
getObject objectId
|
||||
>>= dict
|
||||
>>= key "ToUnicode"
|
||||
>>= follow
|
||||
>>= stream
|
||||
>>= either (handleError objectId emptyCMap) return . cMap
|
||||
|
||||
loadCMappers :: Dictionary -> T CMappers
|
||||
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
||||
where
|
||||
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
|
||||
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do
|
||||
flip (Map.insert name) output <$> cache loadFont objectId
|
||||
loadCMapper output _ = return output
|
||||
|
||||
getObject :: ObjectId -> T Object
|
||||
getObject objectId = do
|
||||
content <- ask
|
||||
return (objects content ! objectId)
|
||||
|
||||
key :: String -> Dictionary -> T DirectObject
|
||||
key keyName dictionary =
|
||||
case Map.lookup (Name keyName) dictionary of
|
||||
Just obj -> return obj
|
||||
_ -> list []
|
||||
|
||||
target :: DirectObject -> T ObjectId
|
||||
target (Reference (IndirectObjCoordinates {objectId})) = return objectId
|
||||
target _ = list []
|
||||
|
||||
follow :: DirectObject -> T Object
|
||||
follow directObject = target directObject >>= getObject
|
||||
|
||||
dict :: Object -> T Dictionary
|
||||
dict (Direct (Dictionary dictionary)) = return dictionary
|
||||
dict _ = list []
|
||||
|
||||
pagesList :: T ObjectId
|
||||
pagesList = do
|
||||
root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask
|
||||
pages <- dict =<< follow =<< key "Pages" root
|
||||
case Map.lookup (Name "Kids") pages of
|
||||
Just (Array kids) -> list $ filterObjectIds kids
|
||||
_ -> list []
|
||||
|
||||
filterObjectIds :: [DirectObject] -> [ObjectId]
|
||||
filterObjectIds objects = do
|
||||
object <- objects
|
||||
case object of
|
||||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||
_ -> []
|
||||
|
||||
listTextObjects :: Document -> [ByteString]
|
||||
listTextObjects (Document {updates}) =
|
||||
snd =<< evalRWST rwsMain (unify updates) Map.empty
|
||||
where
|
||||
rwsMain =
|
||||
pagesList >>= getObject >>= extractText
|
||||
|
||||
singlePage :: FilePath -> Int -> IO ()
|
||||
singlePage inputFile pageNumber =
|
||||
onDoc inputFile (`get` pageNumber) >>= displayPage
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
[inputFile] <- getArgs
|
||||
result <- parseDocument <$> BS.readFile inputFile
|
||||
case result of
|
||||
Left parseError -> hPutStrLn stderr $ show parseError
|
||||
Right doc -> mapM_ BS.putStrLn $ listTextObjects doc
|
||||
hSetBuffering stdout LineBuffering
|
||||
args <- getArgs
|
||||
case args of
|
||||
[inputFile] -> wholeDoc inputFile
|
||||
[inputFile, pageNumber] -> singlePage inputFile (read pageNumber)
|
||||
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
|
||||
|
|
171
src/PDF/Pages.hs
Executable file
171
src/PDF/Pages.hs
Executable file
|
@ -0,0 +1,171 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module PDF.Pages (
|
||||
Page(..)
|
||||
, get
|
||||
, getAll
|
||||
) where
|
||||
|
||||
import Codec.Compression.Zlib (decompress)
|
||||
import Control.Monad (foldM)
|
||||
import Control.Monad.RWS (RWST(..), ask, evalRWST, mapRWST, modify)
|
||||
import qualified Control.Monad.RWS as RWS (get)
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||
import Data.Map (Map, (!))
|
||||
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||
import PDF.CMap (CMap, CMappers, cMap)
|
||||
import PDF.Object (
|
||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||
, Object(..), Name(..), Structure(..)
|
||||
,)
|
||||
import PDF.Output (ObjectId(..))
|
||||
import PDF.Text (pageContents)
|
||||
import Prelude hiding (fail)
|
||||
import Text.Printf (printf)
|
||||
|
||||
type CachedCMaps = Map ObjectId CMap
|
||||
type T = RWST Content () CachedCMaps (Either String)
|
||||
data Page = Page {
|
||||
contents :: [ByteString]
|
||||
, source :: ObjectId
|
||||
}
|
||||
|
||||
infixl 1 \\=
|
||||
(\\=) :: T a -> (a -> Either String b) -> T b
|
||||
x \\= f = mapRWST ((\(a, s, w) -> (\b -> (b, s, w)) <$> f a) =<<) x
|
||||
|
||||
infixl 1 //=
|
||||
(//=) :: Either String a -> (a -> T b) -> T b
|
||||
(//=) (Left e) _ = RWST (\_ _ -> Left e)
|
||||
(//=) (Right a) f = f a
|
||||
|
||||
lift :: Either String a -> T a
|
||||
lift x = x //= return
|
||||
|
||||
expected :: Show a => String -> a -> Either String b
|
||||
expected name = Left . printf "Not a %s: %s" name . show
|
||||
|
||||
stream :: Object -> Either String ByteString
|
||||
stream (Stream {header, streamContent}) = Right $
|
||||
case Map.lookup (Name "Filter") header of
|
||||
Just (NameObject (Name "FlateDecode")) ->
|
||||
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
||||
_ -> streamContent
|
||||
stream obj = expected "stream" obj
|
||||
|
||||
getResource :: DirectObject -> T Dictionary
|
||||
getResource (Dictionary dictionary) = return dictionary
|
||||
getResource (Reference (IndirectObjCoordinates {objectId})) =
|
||||
getObject objectId \\= dict
|
||||
getResource directObject =
|
||||
lift $ expected "resource (dictionary or reference)" directObject
|
||||
|
||||
getFont :: Dictionary -> T Dictionary
|
||||
getFont pageDict =
|
||||
key "Resources" pageDict
|
||||
//= getResource
|
||||
\\= key "Font"
|
||||
>>= follow
|
||||
\\= dict
|
||||
|
||||
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
|
||||
cache loader objectId =
|
||||
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||
where
|
||||
load = do
|
||||
value <- loader objectId
|
||||
modify $ Map.insert objectId value
|
||||
return value
|
||||
|
||||
loadFont :: ObjectId -> T CMap
|
||||
loadFont objectId =
|
||||
(getObject objectId
|
||||
\\= dict
|
||||
\\= key "ToUnicode"
|
||||
>>= follow)
|
||||
\\= stream
|
||||
\\= cMap
|
||||
|
||||
loadCMappers :: Dictionary -> T CMappers
|
||||
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
||||
where
|
||||
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
|
||||
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||
flip (Map.insert name) output <$> cache loadFont objectId
|
||||
--maybe output (flip (Map.insert name) output) <$> cache loadFont objectId
|
||||
loadCMapper output _ = return output
|
||||
|
||||
getObject :: ObjectId -> T Object
|
||||
getObject objectId = do
|
||||
content <- ask
|
||||
return (objects content ! objectId)
|
||||
|
||||
key :: String -> Dictionary -> Either String DirectObject
|
||||
key keyName dictionary =
|
||||
maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary)
|
||||
where
|
||||
errorMessage =
|
||||
printf "Key %s not found in dictionary %s" keyName (show dictionary)
|
||||
|
||||
target :: DirectObject -> Either String ObjectId
|
||||
target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
|
||||
target directObject = expected "reference" directObject
|
||||
|
||||
many :: DirectObject -> [DirectObject]
|
||||
many (Array l) = l
|
||||
many directObject = [directObject]
|
||||
|
||||
follow :: DirectObject -> T Object
|
||||
follow directObject = target directObject //= getObject
|
||||
|
||||
dict :: Object -> Either String Dictionary
|
||||
dict (Direct (Dictionary dictionary)) = Right dictionary
|
||||
dict obj = expected "dictionary" obj
|
||||
|
||||
dictObject :: String -> Dictionary -> T Dictionary
|
||||
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
|
||||
|
||||
pagesList :: T [ObjectId]
|
||||
pagesList = do
|
||||
root <- dictObject "Root" . trailer . docStructure =<< ask
|
||||
pages <- dictObject "Pages" root
|
||||
case Map.lookup (Name "Kids") pages of
|
||||
Just (Array kids) -> return $ getReferences kids
|
||||
_ -> return []
|
||||
|
||||
getReferences :: [DirectObject] -> [ObjectId]
|
||||
getReferences objects = do
|
||||
object <- objects
|
||||
case object of
|
||||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||
_ -> []
|
||||
|
||||
extractText :: Object -> T [ByteString]
|
||||
extractText object = do
|
||||
pageDict <- lift $ dict object
|
||||
cMappers <- loadCMappers =<< getFont pageDict
|
||||
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
||||
concat <$> (objects //= (mapM $ loadContent cMappers))
|
||||
where
|
||||
loadContent :: CMappers -> DirectObject -> T [ByteString]
|
||||
loadContent cMappers directObject =
|
||||
follow directObject \\= stream \\= pageContents cMappers
|
||||
|
||||
loadPage :: ObjectId -> T Page
|
||||
loadPage source =
|
||||
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
|
||||
|
||||
getAll :: Content -> Either String (Map Int Page)
|
||||
getAll content = fst <$> evalRWST getPages content Map.empty
|
||||
where
|
||||
numbered = Map.fromList . zip [1..]
|
||||
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
||||
|
||||
get :: Content -> Int -> Either String Page
|
||||
get content pageNumber
|
||||
| pageNumber < 1 = Left "Pages start at 1"
|
||||
| otherwise = fst <$> evalRWST getPage content Map.empty
|
||||
where
|
||||
firstPage [] = lift $ Left "Page is out of bounds"
|
||||
firstPage (p:_) = loadPage p
|
||||
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|
Loading…
Reference in a new issue