Hufflepdf/examples/getText.hs

{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE OverloadedStrings #-}

import Codec.Compression.Zlib (decompress)
import Control.Monad (foldM)
import Control.Monad.RWS (RWST(..), ask, evalRWST, get, modify)
import Control.Monad.Writer (tell)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS (pack, putStrLn, readFile)
import qualified Data.ByteString.Lazy as Lazy (fromStrict, toStrict)
import Data.Map (Map, (!))
import qualified Data.Map as Map (empty, insert, lookup, toList)
import PDF (Document(..), parseDocument)
import PDF.Object (
      Content(..), Dictionary, DirectObject(..), IndirectObjCoordinates(..)
    , Object(..), Name(..), Structure(..)
  ,)
import PDF.Output (ObjectId)
import PDF.Text (CMap, CMappers, PageContents(..), cMap, emptyCMap, pageContents)
import PDF.Update (unify)
import System.Environment (getArgs)
import System.IO (hPutStrLn, stderr)

type CachedCMaps = Map ObjectId CMap
type T a = RWST Content [ByteString] CachedCMaps [] a

list :: [a] -> T a
list l = RWST (\_ s -> fillContext s <$> l)
  where
    fillContext s a = (a, s, [])

handleError :: a -> String -> T a
handleError defaultValue s =
  (tell . replicate 1 . BS.pack $ "Warning: " ++ s) >> return defaultValue

extractText :: Object -> T ()
extractText object = do
  pageDict <- dict object
  cMappers <- loadCMappers =<< getFont pageDict
  contents <- stream =<< follow =<< key "Contents" pageDict
  either (handleError ()) (tell . chunks) (pageContents cMappers contents)

stream :: Object -> T ByteString
stream (Stream {header, streamContent}) = return $
  case Map.lookup (Name "Filter") header of
    Just (NameObject (Name "FlateDecode")) ->
      Lazy.toStrict . decompress . Lazy.fromStrict $ streamContent
    _ -> streamContent
stream _ = list []

getFont :: Dictionary -> T Dictionary
getFont pageDict =
  key "Resources" pageDict
  >>= dict . Direct
  >>= key "Font"
  >>= follow
  >>= dict

cache :: (ObjectId -> T CMap) -> ObjectId -> T CMap
cache loader objectId = do
  loaded <- get
  case Map.lookup objectId loaded of
    Just value -> return value
    Nothing -> do
      value <- loader objectId
      modify (Map.insert objectId value) >> return value

loadFont :: ObjectId -> T CMap
loadFont objectId =
  getObject objectId
  >>= dict
  >>= key "ToUnicode"
  >>= follow
  >>= stream
  >>= either (handleError emptyCMap) return . cMap

loadCMappers :: Dictionary -> T CMappers
loadCMappers = foldM loadCMapper Map.empty . Map.toList
  where
    loadCMapper :: CMappers -> (Name, DirectObject) -> T CMappers
    loadCMapper output (name, Reference (IndirectObjCoordinates {objectId})) = do
      flip (Map.insert name) output <$> cache loadFont objectId
    loadCMapper output _ = return output

getObject :: ObjectId -> T Object
getObject objectId = do
  content <- ask
  return (objects content ! objectId)

key :: String -> Dictionary -> T DirectObject
key keyName dictionary =
  case Map.lookup (Name keyName) dictionary of
    Just obj -> return obj
    _ -> list []

follow :: DirectObject -> T Object
follow (Reference (IndirectObjCoordinates {objectId})) = getObject objectId
follow _ = list []

dict :: Object -> T Dictionary
dict (Direct (Dictionary dictionary)) = return dictionary
dict _ = list []

pagesList :: T ObjectId
pagesList = do
  root <- dict =<< follow =<< key "Root" . trailer . docStructure =<< ask
  pages <- dict =<< follow =<< key "Pages" root
  case Map.lookup (Name "Kids") pages of
    Just (Array kids) -> list $ filterObjectIds kids
    _ -> list []

filterObjectIds :: [DirectObject] -> [ObjectId]
filterObjectIds objects = do
  object <- objects
  case object of
    Reference (IndirectObjCoordinates {objectId}) -> [objectId]
    _ -> []

listTextObjects :: Document -> [ByteString]
listTextObjects (Document {updates}) =
  snd =<< evalRWST rwsMain (unify updates) Map.empty
  where
    rwsMain =
      --Lazy.pack . show <$> (getObject =<< pagesList)
      pagesList >>= getObject >>= extractText


main :: IO ()
main = do
  [inputFile] <- getArgs
  result <- parseDocument <$> BS.readFile inputFile
  case result of
    Left parseError -> hPutStrLn stderr $ show parseError
    Right doc -> mapM_ BS.putStrLn $ listTextObjects doc