{-# LANGUAGE NamedFieldPuns #-} module ALTO ( Collection , ALTO_ID(..) , Words , collection , getWords ) where import Data.List (sortOn) import Data.Map (Map, (!), fromList) import System.Directory (listDirectory) import System.Exit (die) import System.FilePath (()) import Text.Read (readMaybe) import Text.XML.Light (parseXML) import Text.XML.Light.Extra (getAttr) import Text.XML.Light.XPath (t, xPath) data Collection = Collection { path :: FilePath , pages :: Map Int FilePath } newtype ALTO_ID = ALTO_ID { altoID :: String } type Words = [(String, ALTO_ID)] getWords :: Collection -> Int -> IO Words getWords (Collection {path, pages}) pageNumber = do alto <- parseXML <$> readFile (path pages ! pageNumber) mapM wordOfElem $ xPath stringPath alto where stringPath = t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"] wordOfElem element = case (getAttr "CONTENT" element, getAttr "ID" element) of (Just content, Just elemID) -> return (content, ALTO_ID elemID) (Nothing, _) -> die $ "Missing CONTENT in word " ++ show element _ -> die $ "Missing ID in word " ++ show element collection :: FilePath -> IO Collection collection directory = buildCollection <$> listDirectory directory where rankPages files = [(rank, file) | file <- files, Just rank <- [pageNumber file]] pageNumber ('p':s) = readMaybe $ takeWhile (/= '.') s pageNumber _ = Nothing buildCollection = Collection directory . fromList . sortOn snd . rankPages