diff --git a/Hufflepdf.cabal b/Hufflepdf.cabal index dd40ca0..9186d7f 100644 --- a/Hufflepdf.cabal +++ b/Hufflepdf.cabal @@ -21,9 +21,10 @@ library , PDF.EOL , PDF.Object , PDF.Output + , PDF.Pages + , PDF.Parser , PDF.Text , PDF.Update - , PDF.Parser other-modules: Data.ByteString.Char8.Util , PDF.Body -- other-extensions: @@ -34,6 +35,7 @@ library , mtl , text , utf8-string + , zlib hs-source-dirs: src ghc-options: -Wall default-language: Haskell2010 diff --git a/examples/getText.hs b/examples/getText.hs index 8a47496..c93441e 100755 --- a/examples/getText.hs +++ b/examples/getText.hs @@ -1,141 +1,37 @@ -{-# LANGUAGE NamedFieldPuns #-} -{-# LANGUAGE OverloadedStrings #-} - -import Codec.Compression.Zlib (decompress) -import Control.Monad (foldM) -import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify) -import Control.Monad.Writer (tell) -import Data.ByteString (ByteString) -import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile) -import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) -import Data.Map (Map, (!)) -import qualified Data.Map as Map (empty, insert, lookup, toList) +import qualified Data.ByteString.Char8 as BS (putStrLn, readFile) +import qualified Data.Map as Map (toList) import PDF (Document(..), parseDocument) -import PDF.CMap (CMap, CMappers, cMap, emptyCMap) -import PDF.Object ( - Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) - , Object(..), Name(..), Structure(..) - ,) -import PDF.Output (ObjectId(..)) -import PDF.Text (PageContents(..), pageContents) +import PDF.Object (Content) +import PDF.Pages (Page(..), get, getAll) import PDF.Update (unify) import System.Environment (getArgs) -import System.IO (hPutStrLn, stderr) -import Text.Printf (printf) +import System.Exit (die) +import System.IO (BufferMode(..), hSetBuffering, stdout) -type CachedCMaps = Map ObjectId CMap -type T a = RWST Content [ByteString] CachedCMaps [] a +onDoc :: FilePath -> (Content -> Either String a) -> IO a +onDoc inputFile f = do + content <- fmap (unify . updates) . parseDocument <$> BS.readFile inputFile + case content >>= f of + Left someError -> die someError + Right value -> return value -list :: [a] -> T a -list l = RWST (\_ s -> fillContext s <$> l) - where - fillContext s a = (a, s, []) +displayPage :: Page -> IO () +displayPage = mapM_ BS.putStrLn . contents -handleError :: ObjectId -> a -> String -> T a -handleError objectId defaultValue s = - (tell . replicate 1 $ BS.pack message) >> return defaultValue - where - message = printf "Object #%d : %s" (getObjectId objectId) s +wholeDoc :: FilePath -> IO () +wholeDoc inputFile = do + pages <- onDoc inputFile getAll + mapM_ (displayPage . snd) $ Map.toList pages -extractText :: Object -> T () -extractText object = do - pageDict <- dict object - cMappers <- loadCMappers =<< getFont pageDict - contentsId <- target =<< key "Contents" pageDict - contents <- stream =<< getObject contentsId - either (handleError contentsId ()) (tell . chunks) (pageContents cMappers contents) - -stream :: Object -> T ByteString -stream (Stream {header, streamContent}) = return $ - case Map.lookup (Name "Filter") header of - Just (NameObject (Name "FlateDecode")) -> - Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent - _ -> streamContent -stream _ = list [] - -getFont :: Dictionary -> T Dictionary -getFont pageDict = - key "Resources" pageDict - >>= dict . Direct - >>= key "Font" - >>= follow - >>= dict - -cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap -cache loader objectId = do - loaded <- get - case Map.lookup objectId loaded of - Just value -> return value - Nothing -> do - value <- loader objectId - modify (Map.insert objectId value) >> return value - -loadFont :: ObjectId -> T CMap -loadFont objectId = - getObject objectId - >>= dict - >>= key "ToUnicode" - >>= follow - >>= stream - >>= either (handleError objectId emptyCMap) return . cMap - -loadCMappers :: Dictionary -> T CMappers -loadCMappers = foldM loadCMapper Map.empty . Map.toList - where - loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers - loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do - flip (Map.insert name) output <$> cache loadFont objectId - loadCMapper output _ = return output - -getObject :: ObjectId -> T Object -getObject objectId = do - content <- ask - return (objects content ! objectId) - -key :: String -> Dictionary -> T DirectObject -key keyName dictionary = - case Map.lookup (Name keyName) dictionary of - Just obj -> return obj - _ -> list [] - -target :: DirectObject -> T ObjectId -target (Reference (IndirectObjCoordinates {objectId})) = return objectId -target _ = list [] - -follow :: DirectObject -> T Object -follow directObject = target directObject >>= getObject - -dict :: Object -> T Dictionary -dict (Direct (Dictionary dictionary)) = return dictionary -dict _ = list [] - -pagesList :: T ObjectId -pagesList = do - root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask - pages <- dict =<< follow =<< key "Pages" root - case Map.lookup (Name "Kids") pages of - Just (Array kids) -> list $ filterObjectIds kids - _ -> list [] - -filterObjectIds :: [DirectObject] -> [ObjectId] -filterObjectIds objects = do - object <- objects - case object of - Reference (IndirectObjCoordinates {objectId}) -> [objectId] - _ -> [] - -listTextObjects :: Document -> [ByteString] -listTextObjects (Document {updates}) = - snd =<< evalRWST rwsMain (unify updates) Map.empty - where - rwsMain = - pagesList >>= getObject >>= extractText - +singlePage :: FilePath -> Int -> IO () +singlePage inputFile pageNumber = + onDoc inputFile (`get` pageNumber) >>= displayPage main :: IO () main = do - [inputFile] <- getArgs - result <- parseDocument <$> BS.readFile inputFile - case result of - Left parseError -> hPutStrLn stderr $ show parseError - Right doc -> mapM_ BS.putStrLn $ listTextObjects doc + hSetBuffering stdout LineBuffering + args <- getArgs + case args of + [inputFile] -> wholeDoc inputFile + [inputFile, pageNumber] -> singlePage inputFile (read pageNumber) + _ -> die "Syntax: getText INPUT_FILE [PAGE_NUMBER]" diff --git a/src/PDF/Pages.hs b/src/PDF/Pages.hs new file mode 100755 index 0000000..b882dbf --- /dev/null +++ b/src/PDF/Pages.hs @@ -0,0 +1,171 @@ +{-# LANGUAGE NamedFieldPuns #-} +module PDF.Pages ( + Page(..) + , get + , getAll + ) where + +import Codec.Compression.Zlib (decompress) +import Control.Monad (foldM) +import Control.Monad.RWS (RWST(..), ask, evalRWST, mapRWST, modify) +import qualified Control.Monad.RWS as RWS (get) +import Data.ByteString (ByteString) +import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict) +import Data.Map (Map, (!)) +import qualified Data.Map as Map (empty, fromList, insert, lookup, toList) +import PDF.CMap (CMap, CMappers, cMap) +import PDF.Object ( + Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..) + , Object(..), Name(..), Structure(..) + ,) +import PDF.Output (ObjectId(..)) +import PDF.Text (pageContents) +import Prelude hiding (fail) +import Text.Printf (printf) + +type CachedCMaps = Map ObjectId CMap +type T = RWST Content () CachedCMaps (Either String) +data Page = Page { + contents :: [ByteString] + , source :: ObjectId + } + +infixl 1 \\= +(\\=) :: T a -> (a -> Either String b) -> T b +x \\= f = mapRWST ((\(a, s, w) -> (\b -> (b, s, w)) <$> f a) =<<) x + +infixl 1 //= +(//=) :: Either String a -> (a -> T b) -> T b +(//=) (Left e) _ = RWST (\_ _ -> Left e) +(//=) (Right a) f = f a + +lift :: Either String a -> T a +lift x = x //= return + +expected :: Show a => String -> a -> Either String b +expected name = Left . printf "Not a %s: %s" name . show + +stream :: Object -> Either String ByteString +stream (Stream {header, streamContent}) = Right $ + case Map.lookup (Name "Filter") header of + Just (NameObject (Name "FlateDecode")) -> + Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent + _ -> streamContent +stream obj = expected "stream" obj + +getResource :: DirectObject -> T Dictionary +getResource (Dictionary dictionary) = return dictionary +getResource (Reference (IndirectObjCoordinates {objectId})) = + getObject objectId \\= dict +getResource directObject = + lift $ expected "resource (dictionary or reference)" directObject + +getFont :: Dictionary -> T Dictionary +getFont pageDict = + key "Resources" pageDict + //= getResource + \\= key "Font" + >>= follow + \\= dict + +cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap +cache loader objectId = + (maybe load return . Map.lookup objectId) =<< RWS.get + where + load = do + value <- loader objectId + modify $ Map.insert objectId value + return value + +loadFont :: ObjectId -> T CMap +loadFont objectId = + (getObject objectId + \\= dict + \\= key "ToUnicode" + >>= follow) + \\= stream + \\= cMap + +loadCMappers :: Dictionary -> T CMappers +loadCMappers = foldM loadCMapper Map.empty . Map.toList + where + loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers + loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = + flip (Map.insert name) output <$> cache loadFont objectId + --maybe output (flip (Map.insert name) output) <$> cache loadFont objectId + loadCMapper output _ = return output + +getObject :: ObjectId -> T Object +getObject objectId = do + content <- ask + return (objects content ! objectId) + +key :: String -> Dictionary -> Either String DirectObject +key keyName dictionary = + maybe (Left errorMessage) Right (Map.lookup (Name keyName) dictionary) + where + errorMessage = + printf "Key %s not found in dictionary %s" keyName (show dictionary) + +target :: DirectObject -> Either String ObjectId +target (Reference (IndirectObjCoordinates {objectId})) = Right objectId +target directObject = expected "reference" directObject + +many :: DirectObject -> [DirectObject] +many (Array l) = l +many directObject = [directObject] + +follow :: DirectObject -> T Object +follow directObject = target directObject //= getObject + +dict :: Object -> Either String Dictionary +dict (Direct (Dictionary dictionary)) = Right dictionary +dict obj = expected "dictionary" obj + +dictObject :: String -> Dictionary -> T Dictionary +dictObject keyName dictionary = key keyName dictionary //= follow \\= dict + +pagesList :: T [ObjectId] +pagesList = do + root <- dictObject "Root" . trailer . docStructure =<< ask + pages <- dictObject "Pages" root + case Map.lookup (Name "Kids") pages of + Just (Array kids) -> return $ getReferences kids + _ -> return [] + +getReferences :: [DirectObject] -> [ObjectId] +getReferences objects = do + object <- objects + case object of + Reference (IndirectObjCoordinates {objectId}) -> [objectId] + _ -> [] + +extractText :: Object -> T [ByteString] +extractText object = do + pageDict <- lift $ dict object + cMappers <- loadCMappers =<< getFont pageDict + let objects = ((many <$> (key "Contents" pageDict)) :: Either String [DirectObject]) + concat <$> (objects //= (mapM $ loadContent cMappers)) + where + loadContent :: CMappers -> DirectObject -> T [ByteString] + loadContent cMappers directObject = + follow directObject \\= stream \\= pageContents cMappers + +loadPage :: ObjectId -> T Page +loadPage source = + (\contents -> Page {contents, source}) <$> (extractText =<< getObject source) + +getAll :: Content -> Either String (Map Int Page) +getAll content = fst <$> evalRWST getPages content Map.empty + where + numbered = Map.fromList . zip [1..] + getPages = numbered <$> (mapM loadPage =<< pagesList) + +get :: Content -> Int -> Either String Page +get content pageNumber + | pageNumber < 1 = Left "Pages start at 1" + | otherwise = fst <$> evalRWST getPage content Map.empty + where + firstPage [] = lift $ Left "Page is out of bounds" + firstPage (p:_) = loadPage p + getPage = drop (pageNumber - 1) <$> pagesList >>= firstPage