pdfcleaner/src/ALTO.hs

50 lines
1.5 KiB
Haskell

{-# LANGUAGE NamedFieldPuns #-}
module ALTO (
Collection
, ALTO_ID(..)
, Words
, collection
, getWords
) where
import Data.List (sortOn)
import Data.Map (Map, (!), fromList)
import System.Directory (listDirectory)
import System.Exit (die)
import System.FilePath ((</>))
import Text.Read (readMaybe)
import Text.XML.Light (parseXML)
import Text.XML.Light.Extra (getAttr)
import Text.XML.Light.XPath (t, xPath)
data Collection = Collection {
path :: FilePath
, pages :: Map Int FilePath
}
newtype ALTO_ID = ALTO_ID {
altoID :: String
}
type Words = [(String, ALTO_ID)]
getWords :: Collection -> Int -> IO Words
getWords (Collection {path, pages}) pageNumber = do
alto <- parseXML <$> readFile (path </> pages ! pageNumber)
mapM wordOfElem $ xPath stringPath alto
where
stringPath =
t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"]
wordOfElem element =
case (getAttr "CONTENT" element, getAttr "ID" element) of
(Just content, Just elemID) -> return (content, ALTO_ID elemID)
(Nothing, _) -> die $ "Missing CONTENT in word " ++ show element
_ -> die $ "Missing ID in word " ++ show element
collection :: FilePath -> IO Collection
collection directory = buildCollection <$> listDirectory directory
where
rankPages files =
[(rank, file) | file <- files, Just rank <- [pageNumber file]]
pageNumber ('p':s) = readMaybe $ takeWhile (/= '.') s
pageNumber _ = Nothing
buildCollection = Collection directory . fromList . sortOn snd . rankPages