46 lines
1.5 KiB
Haskell
46 lines
1.5 KiB
Haskell
{-# LANGUAGE NamedFieldPuns #-}
|
|
module ALTO (
|
|
Collection
|
|
, Words
|
|
, collection
|
|
, getWords
|
|
) where
|
|
|
|
import Data.List (sortOn)
|
|
import Data.Map (Map, (!), fromList)
|
|
import System.Directory (listDirectory)
|
|
import System.Exit (die)
|
|
import System.FilePath ((</>))
|
|
import Text.Read (readMaybe)
|
|
import Text.XML.Light (parseXML)
|
|
import Text.XML.Light.Extra (getAttr)
|
|
import Text.XML.Light.XPath (t, xPath)
|
|
|
|
data Collection = Collection {
|
|
path :: FilePath
|
|
, pages :: Map Int FilePath
|
|
}
|
|
type Words = [(String, String)]
|
|
|
|
getWords :: Collection -> Int -> IO Words
|
|
getWords (Collection {path, pages}) pageNumber = do
|
|
alto <- parseXML <$> readFile (path </> pages ! pageNumber)
|
|
mapM wordOfElem $ xPath stringPath alto
|
|
where
|
|
stringPath =
|
|
t<$>["alto", "Layout", "Page", "PrintSpace", "TextBlock", "TextLine", "String"]
|
|
wordOfElem element =
|
|
case (getAttr "CONTENT" element, getAttr "ID" element) of
|
|
(Just content, Just elemID) -> return (content, elemID)
|
|
(Nothing, _) -> die $ "Missing CONTENT in word " ++ show element
|
|
_ -> die $ "Missing ID in word " ++ show element
|
|
|
|
collection :: FilePath -> IO Collection
|
|
collection directory = buildCollection <$> listDirectory directory
|
|
where
|
|
rankPages files =
|
|
[(rank, file) | file <- files, Just rank <- [pageNumber file]]
|
|
pageNumber ('p':s) = readMaybe $ takeWhile (/= '.') s
|
|
pageNumber _ = Nothing
|
|
buildCollection = Collection directory . fromList . sortOn snd . rankPages
|