-- | Command-line entry point: reads a PDF, an ALTO directory and a CSV on
-- standard input, then writes the rendered result of 'clean' to a file.
module Main where

import ALTO (Collection, collection)
import Control.Monad.Except.IOH (handle)
import Control.Monad.State (MonadState(..), execStateT)
import qualified Data.ByteString.Char8 as BS (readFile)
import qualified Data.ByteString.Lazy as Lazy (writeFile)
import qualified Data.Map as Map (keys)
import PDF (Document, UnifiedLayers(..), parseDocument, render)
import PDF.Box (Box(..), at)
import PDF.Pages (Pages(..), withFonts)
import Scoria (Scoriae)
import qualified Scoria (fromCSV)
import System.Environment (getArgs)
import System.Exit (die)

-- | Apply the per-layer cleaning action to the 'UnifiedLayers' view of the
-- document (wrapped in 'withFonts'); any failure reported through 'handle'
-- is passed to 'die'.
--
-- NOTE(review): the 'Collection' and 'Scoriae' arguments are not used yet,
-- and the per-layer action only reads 'Pages' from the state and discards
-- the result, so the state it returns is the one it was given.
clean :: Document -> Collection -> Scoriae -> IO Document
clean pdf alto scoriae =
  at UnifiedLayers (withFonts cleanLayer) pdf `handle` die
  where
    -- State action run on each layer; 'execStateT' keeps only the final state.
    cleanLayer = execStateT $ do
      _pages <- get >>= r Pages
      return ()

-- | Read and parse the PDF at @inputFile@ (exiting via 'die' when parsing
-- fails), load the ALTO collection from @altoDir@, build the 'Scoriae' from
-- the CSV text on standard input, and write the rendered, cleaned document
-- to @outputFile@.
processFiles :: FilePath -> FilePath -> FilePath -> IO ()
processFiles inputFile altoDir outputFile = do
  rawInput <- BS.readFile inputFile
  pdf <- either die return (parseDocument rawInput)
  alto <- collection altoDir
  csvText <- getContents
  let scoriae = Scoria.fromCSV csvText
  cleaned <- clean pdf alto scoriae
  Lazy.writeFile outputFile (render cleaned)

-- | Expect exactly three command-line arguments; otherwise exit with the
-- usage message.
main :: IO ()
main = getArgs >>= dispatch
  where
    dispatch [inputFile, altoDir, outputFile] =
      processFiles inputFile altoDir outputFile
    dispatch _ =
      die "Syntax: pdfcleaner INPUT_PDF_FILE ALTO_DIR OUTPUT_FILE"