Hufflepdf/examples/getText.hs

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}

import Codec.Compression.Zlib (decompress)
import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
import Control.Monad.Writer (tell)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS (readFile, putStrLn)
import qualified Data.ByteString.Lazy.Char8 as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, insert, lookup, toList)
import PDF (Document(..), parseDocument)
import PDF.Object (
      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
    , Object(..), Name(..), Structure(..)
  ,)
import PDF.Output (ObjectId)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, pageContents)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)

type CachedCMaps = Map ObjectId CMap
type T a = RWST Content [ByteString] CachedCMaps [] a

list :: [a] -> T a
list l = RWST (\_ s -> fillContext s <$> l)
  where
    fillContext s a = (a, s, [])

extractText :: Object -> T ()
extractText object = do
  pageDict <- dict object
  contents <- follow =<< key "Contents" pageDict
  case contents of
    (Stream {header, streamContent}) -> do
      font <- getFont pageDict
      cMappers <- loadCMappers font
      storeDecodedText cMappers $ clear header streamContent
    _ -> return ()

clear :: Dictionary -> ByteString -> ByteString
clear header streamContent =
  case Map.lookup (Name "Filter") header of
    Just (NameObject (Name "FlateDecode")) ->
      Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
    _ -> streamContent

storeDecodedText :: CMappers -> ByteString -> T ()
storeDecodedText font page =
  case pageContents font page of
    Left _ -> return ()
    Right (PageContents {chunks}) -> tell chunks

getFont :: Dictionary -> T Dictionary
getFont pageDict =
  key "Resources" pageDict
  >>= dict . Direct
  >>= key "Font"
  >>= follow
  >>= dict

cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
cache loader objectId = do
  loaded <- get
  case Map.lookup objectId loaded of
    Just value -> return value
    Nothing -> do
      value <- loader objectId
      modify (Map.insert objectId value) >> return value

loadFont :: ObjectId -> T CMap
loadFont objectId =
  getObject objectId
  >>= dict
  >>= key "ToUnicode"
  >>= follow

loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
  where
    loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
    loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do
      flip (Map.insert name) output <$> cache loadFont objectId
    loadCMapper output _ = return output

getObject :: ObjectId -> T Object
getObject objectId = do
  content <- ask
  return (objects content ! objectId)

key :: String -> Dictionary -> T DirectObject
key keyName dictionary =
  case Map.lookup (Name keyName) dictionary of
    Just obj -> return obj
    _ -> list []

follow :: DirectObject -> T Object
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
follow _ = list []

dict :: Object -> T Dictionary
dict (Direct (Dictionary dictionary)) = return dictionary
dict _ = list []

pagesList :: T ObjectId
pagesList = do
  root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask
  pages <- dict =<< follow =<< key "Pages" root
  case Map.lookup (Name "Kids") pages of
    Just (Array kids) -> list $ filterObjectIds kids
    _ -> list []

filterObjectIds :: [DirectObject] -> [ObjectId]
filterObjectIds objects = do
  object <- objects
  case object of
    Reference (IndirectObjCoordinates {objectId}) -> [objectId]
    _ -> []

listTextObjects :: Document -> [ByteString]
listTextObjects (Document {updates}) =
  snd =<< evalRWST rwsMain (unify updates) Map.empty
  where
    rwsMain =
      --Lazy.pack . show <$> (getObject =<< pagesList)
      pagesList >>= getObject >>= extractText


main :: IO ()
main = do
  [inputFile] <- getArgs
  result <- parseDocument <$> BS.readFile inputFile
  case result of
    Left parseError -> hPutStrLn stderr $ show parseError
    Right doc -> mapM_ BS.putStrLn $ listTextObjects doc