pdfcleaner/src/Main.hs

39 lines
1.3 KiB
Haskell

module Main where
import ALTO (Collection, collection)
import Control.Monad.Except.IOH (handle)
import Control.Monad.State (MonadState(..), execStateT)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy as Lazy (writeFile)
import qualified Data.Map as Map (keys)
import PDF (Document, UnifiedLayers(..), parseDocument, render)
import PDF.Box (Box(..), at)
import PDF.Pages (Pages(..), withFonts)
import Scoria (Scoriae)
import qualified Scoria (fromCSV)
import System.Environment (getArgs)
import System.Exit (die)
-- | Run the cleaning pass over a parsed PDF 'Document'.
--
-- The per-layer action is applied to the document through
-- @at UnifiedLayers@ and 'withFonts'; the resulting computation is handed to
-- 'handle' together with 'die'.
-- NOTE(review): presumably 'handle' reports a failure of that computation by
-- calling 'die' with its message — confirm against
-- "Control.Monad.Except.IOH".
--
-- The @alto@ and @scoriae@ arguments are accepted but are not referenced by
-- the body below.
clean :: Document -> Collection -> Scoriae -> IO Document
clean pdf alto scoriae =
  at UnifiedLayers (withFonts cleanLayer) pdf `handle` die
  where
    -- State action run on a layer: it reads the 'Pages' box out of the
    -- current state and does nothing with the result; 'execStateT' then
    -- yields the final state.
    cleanLayer = execStateT $ do
      currentLayer <- get
      _pages <- r Pages currentLayer
      return ()
-- | Read a PDF, clean it and write the rendered result.
--
-- Steps, in order:
--
-- 1. read @inputFile@ and parse it with 'parseDocument', exiting through
--    'die' with the parser's message when parsing fails;
-- 2. load the ALTO collection from @altoDir@;
-- 3. read standard input and turn it into 'Scoriae' with 'Scoria.fromCSV';
-- 4. run 'clean' and write the 'render'ed document to @outputFile@.
processFiles :: FilePath -> FilePath -> FilePath -> IO ()
processFiles inputFile altoDir outputFile = do
  rawInput <- BS.readFile inputFile
  pdf <- either die return (parseDocument rawInput)
  alto <- collection altoDir
  csvInput <- getContents
  cleaned <- clean pdf alto (Scoria.fromCSV csvInput)
  Lazy.writeFile outputFile (render cleaned)
-- | Program entry point.
--
-- Expects exactly three command-line arguments (input PDF, ALTO directory,
-- output file) and forwards them to 'processFiles'; any other argument count
-- exits through 'die' with the usage message.
main :: IO ()
main = getArgs >>= dispatch
  where
    dispatch [inputFile, altoDir, outputFile] =
      processFiles inputFile altoDir outputFile
    dispatch _ =
      die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR OUTPUT_FILE"