Take the dirty code of «getText» and turn it into a relatively clean module exposing pages, that can be retrieved all at once or by page number (numbered human-style, starting from 1)
This commit is contained in:
parent
08a9717b3a
commit
f9f799c59b
3 changed files with 201 additions and 132 deletions
|
@ -21,9 +21,10 @@ library
|
||||||
, PDF.EOL
|
, PDF.EOL
|
||||||
, PDF.Object
|
, PDF.Object
|
||||||
, PDF.Output
|
, PDF.Output
|
||||||
|
, PDF.Pages
|
||||||
|
, PDF.Parser
|
||||||
, PDF.Text
|
, PDF.Text
|
||||||
, PDF.Update
|
, PDF.Update
|
||||||
, PDF.Parser
|
|
||||||
other-modules: Data.ByteString.Char8.Util
|
other-modules: Data.ByteString.Char8.Util
|
||||||
, PDF.Body
|
, PDF.Body
|
||||||
-- other-extensions:
|
-- other-extensions:
|
||||||
|
@ -34,6 +35,7 @@ library
|
||||||
, mtl
|
, mtl
|
||||||
, text
|
, text
|
||||||
, utf8-string
|
, utf8-string
|
||||||
|
, zlib
|
||||||
hs-source-dirs: src
|
hs-source-dirs: src
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
default-language: Haskell2010
|
default-language: Haskell2010
|
||||||
|
|
|
@ -1,141 +1,37 @@
|
||||||
{-# LANGUAGE NamedFieldPuns #-}
|
import qualified Data.ByteString.Char8 as BS (putStrLn, readFile)
|
||||||
{-# LANGUAGE OverloadedStrings #-}
|
import qualified Data.Map as Map (toList)
|
||||||
|
|
||||||
import Codec.Compression.Zlib (decompress)
|
|
||||||
import Control.Monad (foldM)
|
|
||||||
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
|
|
||||||
import Control.Monad.Writer (tell)
|
|
||||||
import Data.ByteString (ByteString)
|
|
||||||
import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile)
|
|
||||||
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
|
||||||
import Data.Map (Map, (!))
|
|
||||||
import qualified Data.Map as Map (empty, insert, lookup, toList)
|
|
||||||
import PDF (Document(..), parseDocument)
|
import PDF (Document(..), parseDocument)
|
||||||
import PDF.CMap (CMap, CMappers, cMap, emptyCMap)
|
import PDF.Object (Content)
|
||||||
import PDF.Object (
|
import PDF.Pages (Page(..), get, getAll)
|
||||||
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
|
||||||
, Object(..), Name(..), Structure(..)
|
|
||||||
,)
|
|
||||||
import PDF.Output (ObjectId(..))
|
|
||||||
import PDF.Text (PageContents(..), pageContents)
|
|
||||||
import PDF.Update (unify)
|
import PDF.Update (unify)
|
||||||
import System.Environment (getArgs)
|
import System.Environment (getArgs)
|
||||||
import System.IO (hPutStrLn, stderr)
|
import System.Exit (die)
|
||||||
import Text.Printf (printf)
|
import System.IO (BufferMode(..), hSetBuffering, stdout)
|
||||||
|
|
||||||
type CachedCMaps = Map ObjectId CMap
|
onDoc :: FilePath -> (Content -> Either String a) -> IO a
|
||||||
type T a = RWST Content [ByteString] CachedCMaps [] a
|
onDoc inputFile f = do
|
||||||
|
content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile
|
||||||
|
case content >>= f of
|
||||||
|
Left someError -> die someError
|
||||||
|
Right value -> return value
|
||||||
|
|
||||||
list :: [a] -> T a
|
displayPage :: Page -> IO ()
|
||||||
list l = RWST (\_ s -> fillContext s <$> l)
|
displayPage = mapM_ BS.putStrLn . contents
|
||||||
where
|
|
||||||
fillContext s a = (a, s, [])
|
|
||||||
|
|
||||||
handleError :: ObjectId -> a -> String -> T a
|
wholeDoc :: FilePath -> IO ()
|
||||||
handleError objectId defaultValue s =
|
wholeDoc inputFile = do
|
||||||
(tell . replicate 1 $ BS.pack message) >> return defaultValue
|
pages <- onDoc inputFile getAll
|
||||||
where
|
mapM_ (displayPage . snd) $ Map.toList pages
|
||||||
message = printf "Object #%d : %s" (getObjectId objectId) s
|
|
||||||
|
|
||||||
extractText :: Object -> T ()
|
|
||||||
extractText object = do
|
|
||||||
pageDict <- dict object
|
|
||||||
cMappers <- loadCMappers =<< getFont pageDict
|
|
||||||
contentsId <- target =<< key "Contents" pageDict
|
|
||||||
contents <- stream =<< getObject contentsId
|
|
||||||
either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents)
|
|
||||||
|
|
||||||
stream :: Object -> T ByteString
|
|
||||||
stream (Stream {header, streamContent}) = return $
|
|
||||||
case Map.lookup (Name "Filter") header of
|
|
||||||
Just (NameObject (Name "FlateDecode")) ->
|
|
||||||
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
|
||||||
_ -> streamContent
|
|
||||||
stream _ = list []
|
|
||||||
|
|
||||||
getFont :: Dictionary -> T Dictionary
|
|
||||||
getFont pageDict =
|
|
||||||
key "Resources" pageDict
|
|
||||||
>>= dict . Direct
|
|
||||||
>>= key "Font"
|
|
||||||
>>= follow
|
|
||||||
>>= dict
|
|
||||||
|
|
||||||
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
|
|
||||||
cache loader objectId = do
|
|
||||||
loaded <- get
|
|
||||||
case Map.lookup objectId loaded of
|
|
||||||
Just value -> return value
|
|
||||||
Nothing -> do
|
|
||||||
value <- loader objectId
|
|
||||||
modify (Map.insert objectId value) >> return value
|
|
||||||
|
|
||||||
loadFont :: ObjectId -> T CMap
|
|
||||||
loadFont objectId =
|
|
||||||
getObject objectId
|
|
||||||
>>= dict
|
|
||||||
>>= key "ToUnicode"
|
|
||||||
>>= follow
|
|
||||||
>>= stream
|
|
||||||
>>= either (handleError objectId emptyCMap) return . cMap
|
|
||||||
|
|
||||||
loadCMappers :: Dictionary -> T CMappers
|
|
||||||
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
|
||||||
where
|
|
||||||
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
|
|
||||||
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do
|
|
||||||
flip (Map.insert name) output <$> cache loadFont objectId
|
|
||||||
loadCMapper output _ = return output
|
|
||||||
|
|
||||||
getObject :: ObjectId -> T Object
|
|
||||||
getObject objectId = do
|
|
||||||
content <- ask
|
|
||||||
return (objects content ! objectId)
|
|
||||||
|
|
||||||
key :: String -> Dictionary -> T DirectObject
|
|
||||||
key keyName dictionary =
|
|
||||||
case Map.lookup (Name keyName) dictionary of
|
|
||||||
Just obj -> return obj
|
|
||||||
_ -> list []
|
|
||||||
|
|
||||||
target :: DirectObject -> T ObjectId
|
|
||||||
target (Reference (IndirectObjCoordinates {objectId})) = return objectId
|
|
||||||
target _ = list []
|
|
||||||
|
|
||||||
follow :: DirectObject -> T Object
|
|
||||||
follow directObject = target directObject >>= getObject
|
|
||||||
|
|
||||||
dict :: Object -> T Dictionary
|
|
||||||
dict (Direct (Dictionary dictionary)) = return dictionary
|
|
||||||
dict _ = list []
|
|
||||||
|
|
||||||
pagesList :: T ObjectId
|
|
||||||
pagesList = do
|
|
||||||
root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask
|
|
||||||
pages <- dict =<< follow =<< key "Pages" root
|
|
||||||
case Map.lookup (Name "Kids") pages of
|
|
||||||
Just (Array kids) -> list $ filterObjectIds kids
|
|
||||||
_ -> list []
|
|
||||||
|
|
||||||
filterObjectIds :: [DirectObject] -> [ObjectId]
|
|
||||||
filterObjectIds objects = do
|
|
||||||
object <- objects
|
|
||||||
case object of
|
|
||||||
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
|
||||||
_ -> []
|
|
||||||
|
|
||||||
listTextObjects :: Document -> [ByteString]
|
|
||||||
listTextObjects (Document {updates}) =
|
|
||||||
snd =<< evalRWST rwsMain (unify updates) Map.empty
|
|
||||||
where
|
|
||||||
rwsMain =
|
|
||||||
pagesList >>= getObject >>= extractText
|
|
||||||
|
|
||||||
|
singlePage :: FilePath -> Int -> IO ()
|
||||||
|
singlePage inputFile pageNumber =
|
||||||
|
onDoc inputFile (`get` pageNumber) >>= displayPage
|
||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
main = do
|
main = do
|
||||||
[inputFile] <- getArgs
|
hSetBuffering stdout LineBuffering
|
||||||
result <- parseDocument <$> BS.readFile inputFile
|
args <- getArgs
|
||||||
case result of
|
case args of
|
||||||
Left parseError -> hPutStrLn stderr $ show parseError
|
[inputFile] -> wholeDoc inputFile
|
||||||
Right doc -> mapM_ BS.putStrLn $ listTextObjects doc
|
[inputFile, pageNumber] -> singlePage inputFile (read pageNumber)
|
||||||
|
_ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]"
|
||||||
|
|
171
src/PDF/Pages.hs
Executable file
171
src/PDF/Pages.hs
Executable file
|
@ -0,0 +1,171 @@
|
||||||
|
{-# LANGUAGE NamedFieldPuns #-}
|
||||||
|
module PDF.Pages (
|
||||||
|
Page(..)
|
||||||
|
, get
|
||||||
|
, getAll
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Codec.Compression.Zlib (decompress)
|
||||||
|
import Control.Monad (foldM)
|
||||||
|
import Control.Monad.RWS (RWST(..), ask, evalRWST, mapRWST, modify)
|
||||||
|
import qualified Control.Monad.RWS as RWS (get)
|
||||||
|
import Data.ByteString (ByteString)
|
||||||
|
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
|
||||||
|
import Data.Map (Map, (!))
|
||||||
|
import qualified Data.Map as Map (empty, fromList, insert, lookup, toList)
|
||||||
|
import PDF.CMap (CMap, CMappers, cMap)
|
||||||
|
import PDF.Object (
|
||||||
|
Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
|
||||||
|
, Object(..), Name(..), Structure(..)
|
||||||
|
,)
|
||||||
|
import PDF.Output (ObjectId(..))
|
||||||
|
import PDF.Text (pageContents)
|
||||||
|
import Prelude hiding (fail)
|
||||||
|
import Text.Printf (printf)
|
||||||
|
|
||||||
|
type CachedCMaps = Map ObjectId CMap
|
||||||
|
type T = RWST Content () CachedCMaps (Either String)
|
||||||
|
data Page = Page {
|
||||||
|
contents :: [ByteString]
|
||||||
|
, source :: ObjectId
|
||||||
|
}
|
||||||
|
|
||||||
|
infixl 1 \\=
|
||||||
|
(\\=) :: T a -> (a -> Either String b) -> T b
|
||||||
|
x \\= f = mapRWST ((\(a, s, w) -> (\b -> (b, s, w)) <$> f a) =<<) x
|
||||||
|
|
||||||
|
infixl 1 //=
|
||||||
|
(//=) :: Either String a -> (a -> T b) -> T b
|
||||||
|
(//=) (Left e) _ = RWST (\_ _ -> Left e)
|
||||||
|
(//=) (Right a) f = f a
|
||||||
|
|
||||||
|
lift :: Either String a -> T a
|
||||||
|
lift x = x //= return
|
||||||
|
|
||||||
|
expected :: Show a => String -> a -> Either String b
|
||||||
|
expected name = Left . printf "Not a %s: %s" name . show
|
||||||
|
|
||||||
|
stream :: Object -> Either String ByteString
|
||||||
|
stream (Stream {header, streamContent}) = Right $
|
||||||
|
case Map.lookup (Name "Filter") header of
|
||||||
|
Just (NameObject (Name "FlateDecode")) ->
|
||||||
|
Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
|
||||||
|
_ -> streamContent
|
||||||
|
stream obj = expected "stream" obj
|
||||||
|
|
||||||
|
getResource :: DirectObject -> T Dictionary
|
||||||
|
getResource (Dictionary dictionary) = return dictionary
|
||||||
|
getResource (Reference (IndirectObjCoordinates {objectId})) =
|
||||||
|
getObject objectId \\= dict
|
||||||
|
getResource directObject =
|
||||||
|
lift $ expected "resource (dictionary or reference)" directObject
|
||||||
|
|
||||||
|
getFont :: Dictionary -> T Dictionary
|
||||||
|
getFont pageDict =
|
||||||
|
key "Resources" pageDict
|
||||||
|
//= getResource
|
||||||
|
\\= key "Font"
|
||||||
|
>>= follow
|
||||||
|
\\= dict
|
||||||
|
|
||||||
|
cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
|
||||||
|
cache loader objectId =
|
||||||
|
(maybe load return . Map.lookup objectId) =<< RWS.get
|
||||||
|
where
|
||||||
|
load = do
|
||||||
|
value <- loader objectId
|
||||||
|
modify $ Map.insert objectId value
|
||||||
|
return value
|
||||||
|
|
||||||
|
loadFont :: ObjectId -> T CMap
|
||||||
|
loadFont objectId =
|
||||||
|
(getObject objectId
|
||||||
|
\\= dict
|
||||||
|
\\= key "ToUnicode"
|
||||||
|
>>= follow)
|
||||||
|
\\= stream
|
||||||
|
\\= cMap
|
||||||
|
|
||||||
|
loadCMappers :: Dictionary -> T CMappers
|
||||||
|
loadCMappers = foldM loadCMapper Map.empty . Map.toList
|
||||||
|
where
|
||||||
|
loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
|
||||||
|
loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) =
|
||||||
|
flip (Map.insert name) output <$> cache loadFont objectId
|
||||||
|
--maybe output (flip (Map.insert name) output) <$> cache loadFont objectId
|
||||||
|
loadCMapper output _ = return output
|
||||||
|
|
||||||
|
getObject :: ObjectId -> T Object
|
||||||
|
getObject objectId = do
|
||||||
|
content <- ask
|
||||||
|
return (objects content ! objectId)
|
||||||
|
|
||||||
|
key :: String -> Dictionary -> Either String DirectObject
|
||||||
|
key keyName dictionary =
|
||||||
|
maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary)
|
||||||
|
where
|
||||||
|
errorMessage =
|
||||||
|
printf "Key %s not found in dictionary %s" keyName (show dictionary)
|
||||||
|
|
||||||
|
target :: DirectObject -> Either String ObjectId
|
||||||
|
target (Reference (IndirectObjCoordinates {objectId})) = Right objectId
|
||||||
|
target directObject = expected "reference" directObject
|
||||||
|
|
||||||
|
many :: DirectObject -> [DirectObject]
|
||||||
|
many (Array l) = l
|
||||||
|
many directObject = [directObject]
|
||||||
|
|
||||||
|
follow :: DirectObject -> T Object
|
||||||
|
follow directObject = target directObject //= getObject
|
||||||
|
|
||||||
|
dict :: Object -> Either String Dictionary
|
||||||
|
dict (Direct (Dictionary dictionary)) = Right dictionary
|
||||||
|
dict obj = expected "dictionary" obj
|
||||||
|
|
||||||
|
dictObject :: String -> Dictionary -> T Dictionary
|
||||||
|
dictObject keyName dictionary = key keyName dictionary //= follow \\= dict
|
||||||
|
|
||||||
|
pagesList :: T [ObjectId]
|
||||||
|
pagesList = do
|
||||||
|
root <- dictObject "Root" . trailer . docStructure =<< ask
|
||||||
|
pages <- dictObject "Pages" root
|
||||||
|
case Map.lookup (Name "Kids") pages of
|
||||||
|
Just (Array kids) -> return $ getReferences kids
|
||||||
|
_ -> return []
|
||||||
|
|
||||||
|
getReferences :: [DirectObject] -> [ObjectId]
|
||||||
|
getReferences objects = do
|
||||||
|
object <- objects
|
||||||
|
case object of
|
||||||
|
Reference (IndirectObjCoordinates {objectId}) -> [objectId]
|
||||||
|
_ -> []
|
||||||
|
|
||||||
|
extractText :: Object -> T [ByteString]
|
||||||
|
extractText object = do
|
||||||
|
pageDict <- lift $ dict object
|
||||||
|
cMappers <- loadCMappers =<< getFont pageDict
|
||||||
|
let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject])
|
||||||
|
concat <$> (objects //= (mapM $ loadContent cMappers))
|
||||||
|
where
|
||||||
|
loadContent :: CMappers -> DirectObject -> T [ByteString]
|
||||||
|
loadContent cMappers directObject =
|
||||||
|
follow directObject \\= stream \\= pageContents cMappers
|
||||||
|
|
||||||
|
loadPage :: ObjectId -> T Page
|
||||||
|
loadPage source =
|
||||||
|
(\contents -> Page {contents, source}) <$> (extractText =<< getObject source)
|
||||||
|
|
||||||
|
getAll :: Content -> Either String (Map Int Page)
|
||||||
|
getAll content = fst <$> evalRWST getPages content Map.empty
|
||||||
|
where
|
||||||
|
numbered = Map.fromList . zip [1..]
|
||||||
|
getPages = numbered <$> (mapM loadPage =<< pagesList)
|
||||||
|
|
||||||
|
get :: Content -> Int -> Either String Page
|
||||||
|
get content pageNumber
|
||||||
|
| pageNumber < 1 = Left "Pages start at 1"
|
||||||
|
| otherwise = fst <$> evalRWST getPage content Map.empty
|
||||||
|
where
|
||||||
|
firstPage [] = lift $ Left "Page is out of bounds"
|
||||||
|
firstPage (p:_) = loadPage p
|
||||||
|
getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage
|
Loading…
Reference in a new issue