Take the dirty code of «getText» and turn it into a relatively clean module exposing pages, that can be retrieved all at once or by page number (numbered human-style, starting from 1)

This commit is contained in:
Tissevert 2019-11-29 11:51:35 +01:00
parent 08a9717b3a
commit f9f799c59b
3 changed files with 201 additions and 132 deletions

View file

@ -21,9 +21,10 @@ library
, PDF.EOL
, PDF.Object
, PDF.Output
, PDF.Pages
, PDF.Parser
, PDF.Text
, PDF.Update
, PDF.Parser
other-modules: Data.ByteString.Char8.Util
, PDF.Body
-- other-extensions:
@ -34,6 +35,7 @@ library
, mtl
, text
, utf8-string
, zlib
hs-source-dirs: src
ghc-options: -Wall
default-language: Haskell2010

View file

@ -1,141 +1,37 @@
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}
import Codec.Compression.Zlib (decompress)
import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
import Control.Monad.Writer (tell)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, insert, lookup, toList)
import qualified Data.ByteString.Char8 as BS (putStrLn, readFile)
import qualified Data.Map as Map (toList)
import PDF (Document(..), parseDocument)
import PDF.CMap (CMap, CMappers, cMap, emptyCMap)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId(..))
import PDF.Text (PageContents(..), pageContents)
import PDF.Object (Content)
import PDF.Pages (Page(..), get, getAll)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)
import Text.Printf (printf)
import System.Exit (die)
import System.IO (BufferMode(..), hSetBuffering, stdout)
type CachedCMaps = Map ObjectId CMap
type T a = RWST Content [ByteString] CachedCMaps [] a
onDoc :: FilePath -> (Content -> Either String a) -> IO a
onDoc inputFile f = do
content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
case content >>= f of
Left someError -> die someError
Right value -> return value
list :: [a] -> T a
list l = RWST (\_ s -> fillContext s <$> l)
where
fillContext s a = (a, s, [])
displayPage :: Page -> IO ()
displayPage = mapM_ BS.putStrLn . contents
handleError :: ObjectId -> a -> String -> T a
handleError objectId defaultValue s =
(tell . replicate 1 $ BS.pack message) >> return defaultValue
where
message = printf "Object #%d : %s" (getObjectId objectId) s
wholeDoc :: FilePath -> IO ()
wholeDoc inputFile = do
pages <- onDoc inputFile getAll
mapM_ (displayPage . snd) $ Map.toList pages
extractText :: Object -> T ()
extractText object = do
pageDict <- dict object
cMappers <- loadCMappers =<< getFont pageDict
contentsId <- target =<< key "Contents" pageDict
contents <- stream =<< getObject contentsId
either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents)
stream :: Object -> T ByteString
stream (Stream {header, streamContent}) = return $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent
stream _ = list []
getFont :: Dictionary -> T Dictionary
getFont pageDict =
key "Resources" pageDict
>>= dict . Direct
>>= key "Font"
>>= follow
>>= dict
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
cache loader objectId = do
loaded <- get
case Map.lookup objectId loaded of
Just value -> return value
Nothing -> do
value <- loader objectId
modify (Map.insert objectId value) >> return value
loadFont :: ObjectId -> T CMap
loadFont objectId =
getObject objectId
>>= dict
>>= key "ToUnicode"
>>= follow
>>= stream
>>= either (handleError objectId emptyCMap) return . cMap
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
where
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do
flip (Map.insert name) output <$> cache loadFont objectId
loadCMapper output _ = return output
getObject :: ObjectId -> T Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
key :: String -> Dictionary -> T DirectObject
key keyName dictionary =
case Map.lookup (Name keyName) dictionary of
Just obj -> return obj
_ -> list []
target :: DirectObject -> T ObjectId
target (Reference (IndirectObjCoordinates {objectId})) = return objectId
target _ = list []
follow :: DirectObject -> T Object
follow directObject = target directObject >>= getObject
dict :: Object -> T Dictionary
dict (Direct (Dictionary dictionary)) = return dictionary
dict _ = list []
pagesList :: T ObjectId
pagesList = do
root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask
pages <- dict =<< follow =<< key "Pages" root
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> list $ filterObjectIds kids
_ -> list []
filterObjectIds :: [DirectObject] -> [ObjectId]
filterObjectIds objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
listTextObjects :: Document -> [ByteString]
listTextObjects (Document {updates}) =
snd =<< evalRWST rwsMain (unify updates) Map.empty
where
rwsMain =
pagesList >>= getObject >>= extractText
singlePage :: FilePath -> Int -> IO ()
singlePage inputFile pageNumber =
onDoc inputFile (`get` pageNumber) >>= displayPage
main :: IO ()
main = do
[inputFile] <- getArgs
result <- parseDocument <$> BS.readFile inputFile
case result of
Left parseError -> hPutStrLn stderr $ show parseError
Right doc -> mapM_ BS.putStrLn $ listTextObjects doc
hSetBuffering stdout LineBuffering
args <- getArgs
case args of
[inputFile] -> wholeDoc inputFile
[inputFile, pageNumber] -> singlePage inputFile (read pageNumber)
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"

171
src/PDF/Pages.hs Executable file
View file

@ -0,0 +1,171 @@
{-# LANGUAGE NamedFieldPuns #-}
module PDF.Pages (
Page(..)
, get
, getAll
) where
import Codec.Compression.Zlib (decompress)
import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, mapRWST, modify)
import qualified Control.Monad.RWS as RWS (get)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
import PDF.CMap (CMap, CMappers, cMap)
import PDF.Object (
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
, Object(..), Name(..), Structure(..)
,)
import PDF.Output (ObjectId(..))
import PDF.Text (pageContents)
import Prelude hiding (fail)
import Text.Printf (printf)
type CachedCMaps = Map ObjectId CMap
type T = RWST Content () CachedCMaps (Either String)
data Page = Page {
contents :: [ByteString]
, source :: ObjectId
}
infixl 1 \\=
(\\=) :: T a -> (a -> Either String b) -> T b
x \\= f = mapRWST ((\(a, s, w) -> (\b -> (b, s, w)) <$> f a) =<<) x
infixl 1 //=
(//=) :: Either String a -> (a -> T b) -> T b
(//=) (Left e) _ = RWST (\_ _ -> Left e)
(//=) (Right a) f = f a
lift :: Either String a -> T a
lift x = x //= return
expected :: Show a => String -> a -> Either String b
expected name = Left . printf "Not a %s: %s" name . show
stream :: Object -> Either String ByteString
stream (Stream {header, streamContent}) = Right $
case Map.lookup (Name "Filter") header of
Just (NameObject (Name "FlateDecode")) ->
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
_ -> streamContent
stream obj = expected "stream" obj
getResource :: DirectObject -> T Dictionary
getResource (Dictionary dictionary) = return dictionary
getResource (Reference (IndirectObjCoordinates {objectId})) =
getObject objectId \\= dict
getResource directObject =
lift $ expected "resource (dictionary or reference)" directObject
getFont :: Dictionary -> T Dictionary
getFont pageDict =
key "Resources" pageDict
//= getResource
\\= key "Font"
>>= follow
\\= dict
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
cache loader objectId =
(maybe load return . Map.lookup objectId) =<< RWS.get
where
load = do
value <- loader objectId
modify $ Map.insert objectId value
return value
loadFont :: ObjectId -> T CMap
loadFont objectId =
(getObject objectId
\\= dict
\\= key "ToUnicode"
>>= follow)
\\= stream
\\= cMap
loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
where
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) =
flip (Map.insert name) output <$> cache loadFont objectId
--maybe output (flip (Map.insert name) output) <$> cache loadFont objectId
loadCMapper output _ = return output
getObject :: ObjectId -> T Object
getObject objectId = do
content <- ask
return (objects content ! objectId)
key :: String -> Dictionary -> Either String DirectObject
key keyName dictionary =
maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary)
where
errorMessage =
printf "Key %s not found in dictionary %s" keyName (show dictionary)
target :: DirectObject -> Either String ObjectId
target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
target directObject = expected "reference" directObject
many :: DirectObject -> [DirectObject]
many (Array l) = l
many directObject = [directObject]
follow :: DirectObject -> T Object
follow directObject = target directObject //= getObject
dict :: Object -> Either String Dictionary
dict (Direct (Dictionary dictionary)) = Right dictionary
dict obj = expected "dictionary" obj
dictObject :: String -> Dictionary -> T Dictionary
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
pagesList :: T [ObjectId]
pagesList = do
root <- dictObject "Root" . trailer . docStructure =<< ask
pages <- dictObject "Pages" root
case Map.lookup (Name "Kids") pages of
Just (Array kids) -> return $ getReferences kids
_ -> return []
getReferences :: [DirectObject] -> [ObjectId]
getReferences objects = do
object <- objects
case object of
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
_ -> []
extractText :: Object -> T [ByteString]
extractText object = do
pageDict <- lift $ dict object
cMappers <- loadCMappers =<< getFont pageDict
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
concat <$> (objects //= (mapM $ loadContent cMappers))
where
loadContent :: CMappers -> DirectObject -> T [ByteString]
loadContent cMappers directObject =
follow directObject \\= stream \\= pageContents cMappers
loadPage :: ObjectId -> T Page
loadPage source =
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
getAll :: Content -> Either String (Map Int Page)
getAll content = fst <$> evalRWST getPages content Map.empty
where
numbered = Map.fromList . zip [1..]
getPages = numbered <$> (mapM loadPage =<< pagesList)
get :: Content -> Int -> Either String Page
get content pageNumber
| pageNumber < 1 = Left "Pages start at 1"
| otherwise = fst <$> evalRWST getPage content Map.empty
where
firstPage [] = lift $ Left "Page is out of bounds"
firstPage (p:_) = loadPage p
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage