Try and implement basic linguistic stuff for english
This commit is contained in:
parent
f44bf6259e
commit
b5432a4358
4 changed files with 213 additions and 27 deletions
|
@ -19,6 +19,8 @@ extra-source-files: CHANGELOG.md
|
|||
|
||||
library
|
||||
exposed-modules: Graph
|
||||
, Lang.En.Grapheme
|
||||
, Lang.En.Morpheme
|
||||
, Stream
|
||||
, Transducer
|
||||
, Tree
|
||||
|
|
100
src/Lang/En/Grapheme.hs
Normal file
100
src/Lang/En/Grapheme.hs
Normal file
|
@ -0,0 +1,100 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
{-# LANGUAGE FlexibleInstances #-}
|
||||
module Lang.En.Grapheme (
|
||||
Grapheme(..)
|
||||
, Output(..)
|
||||
, Regular(..)
|
||||
, graphemes
|
||||
, parse
|
||||
) where
|
||||
|
||||
import Data.Char (toLower, isLower)
|
||||
import Data.List (groupBy)
|
||||
import qualified Data.Map as Map (empty, insert, lookup)
|
||||
import Data.String (IsString(..))
|
||||
--import Graph (Graph(..))
|
||||
import Stream (Stream)
|
||||
import Transducer (Transducer(..), fromList, run)
|
||||
--import Transducer (Transducer(..), RunState(..), Stack(..), fromList)
|
||||
|
||||
data Regular =
|
||||
A
|
||||
| Ae
|
||||
| Ai
|
||||
| Ay
|
||||
| E
|
||||
| Ea
|
||||
| Ee
|
||||
| Ei
|
||||
| B
|
||||
| C
|
||||
| Ck
|
||||
| D
|
||||
| F
|
||||
| G
|
||||
| Gh
|
||||
| L
|
||||
| P
|
||||
| S
|
||||
| T
|
||||
| Th
|
||||
| W
|
||||
| Wh
|
||||
deriving (Bounded, Enum, Eq, Ord, Read, Show)
|
||||
|
||||
{-
|
||||
card :: Int
|
||||
card = fromEnum (maxBound :: Regular)
|
||||
-}
|
||||
|
||||
data Grapheme = Grapheme Regular | Punctuation String deriving (Eq, Ord, Show)
|
||||
|
||||
instance Read Grapheme where
|
||||
readsPrec _ next =
|
||||
[(maybe (Punctuation next) Grapheme $ Map.lookup next stringToGrapheme, "")]
|
||||
where
|
||||
stringToGrapheme = foldr insert Map.empty [minBound .. maxBound]
|
||||
insert regular = Map.insert (show regular) regular
|
||||
|
||||
|
||||
instance {-# OVERLAPS #-} IsString [Grapheme] where
|
||||
fromString = fmap read . groupBy (const isLower)
|
||||
|
||||
data Output = Output {
|
||||
rawText :: String
|
||||
, grapheme :: Grapheme
|
||||
} deriving (Eq, Show)
|
||||
|
||||
{-
|
||||
instance Enum Grapheme where
|
||||
fromEnum (Regular r) = fromEnum r
|
||||
fromEnum (Noise c) = card + fromEnum c
|
||||
|
||||
toEnum n
|
||||
| n < card = Regular $ toEnum n
|
||||
| otherwise = Noise $ toEnum (n - card)
|
||||
-}
|
||||
|
||||
auto :: Regular -> ([Char], Either [Char] Grapheme)
|
||||
auto r = (toLower <$> show r, Right (Grapheme r))
|
||||
|
||||
punctuation :: String -> ([Char], Either [Char] Grapheme)
|
||||
punctuation signs = (signs, Right (Punctuation signs))
|
||||
|
||||
{-
|
||||
groupSpaces :: RunState Char Char Grapheme -> Maybe (RunState Char Char Grapheme)
|
||||
groupSpaces state@(RunState {transducer, stack = Stack (' ':ls)})
|
||||
| focus (graph transducer) == 0 = Just $ state {stack = Stack ls}
|
||||
groupSpaces _ = Nothing
|
||||
-}
|
||||
|
||||
graphemes :: Transducer Char Char Grapheme
|
||||
graphemes = (fromList $ punctuations ++ regulars) {
|
||||
projector = toLower
|
||||
}
|
||||
where
|
||||
regulars = auto <$> [minBound .. maxBound]
|
||||
punctuations = punctuation <$> ["?", "!", ".", ",", ":", " "]
|
||||
|
||||
parse :: Stream Char -> Stream Output
|
||||
parse = fmap (uncurry Output) . run graphemes
|
53
src/Lang/En/Morpheme.hs
Normal file
53
src/Lang/En/Morpheme.hs
Normal file
|
@ -0,0 +1,53 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module Lang.En.Morpheme (
|
||||
Morpheme(..)
|
||||
, Output(..)
|
||||
, morphemes
|
||||
, parse
|
||||
) where
|
||||
|
||||
import Data.String (fromString)
|
||||
import Lang.En.Grapheme (Grapheme(..))
|
||||
import qualified Lang.En.Grapheme as Grapheme (Output(..))
|
||||
import Stream (Stream)
|
||||
import Transducer (Transducer(..), fromList, run)
|
||||
|
||||
data Regular =
|
||||
LY
|
||||
| ED
|
||||
| UN
|
||||
| CAT
|
||||
| DOG
|
||||
| EaT
|
||||
| SLEeP
|
||||
| A
|
||||
| ThE
|
||||
| GOoD
|
||||
| KINd
|
||||
| ANd
|
||||
| Space
|
||||
deriving (Bounded, Enum, Eq, Ord, Show)
|
||||
|
||||
data Morpheme = Morpheme Regular | Punctuation String deriving (Eq, Ord, Show)
|
||||
|
||||
auto :: Morpheme -> ([Grapheme], Either [Grapheme] Morpheme)
|
||||
auto morpheme = (fromString $ show morpheme, Right morpheme)
|
||||
|
||||
morphemes :: Transducer Grapheme.Output Grapheme Morpheme
|
||||
morphemes = (fromList $ auto <$> [minBound .. maxBound]) {
|
||||
projector = Grapheme.grapheme
|
||||
, rules = []
|
||||
}
|
||||
|
||||
data Output = Output {
|
||||
rawText :: String
|
||||
, morpheme :: Morpheme
|
||||
} deriving (Eq, Show)
|
||||
|
||||
parse :: Stream Grapheme.Output -> Stream Output
|
||||
parse = fmap fusionGraphemes . run morphemes
|
||||
where
|
||||
fusionGraphemes (graphemes, morpheme) = Output {
|
||||
rawText = concat $ Grapheme.rawText <$> graphemes
|
||||
, morpheme
|
||||
}
|
|
@ -1,34 +1,35 @@
|
|||
{-# LANGUAGE NamedFieldPuns #-}
|
||||
module Transducer (
|
||||
Transducer
|
||||
RunState(..)
|
||||
, Stack(..)
|
||||
, Transducer(..)
|
||||
, fromList
|
||||
, run
|
||||
) where
|
||||
|
||||
import Data.List (find)
|
||||
import Data.Map ((!), insert)
|
||||
import Data.Maybe (isJust)
|
||||
import Graph (Graph(..), Vertex(..), editVertex, follow, rewind, singleton, weave)
|
||||
import Stream (Stream(..), merge)
|
||||
import qualified Stream (empty)
|
||||
|
||||
type Transducer input output = Graph input [output]
|
||||
newtype Stack a = Stack [a]
|
||||
data Transducer input label output = Transducer {
|
||||
graph :: Graph label [output]
|
||||
, rules :: [RunState input label output -> Maybe (RunState input label output)]
|
||||
, projector :: input -> label
|
||||
}
|
||||
|
||||
push :: a -> Stack a -> Stack a
|
||||
push a (Stack s) = Stack (a:s)
|
||||
|
||||
content :: Stack a -> [a]
|
||||
content (Stack s) = reverse s
|
||||
|
||||
empty :: Transducer input output
|
||||
empty :: Graph edge [label]
|
||||
empty = singleton []
|
||||
|
||||
entry :: Ord input => Transducer input output -> ([input], output) -> Transducer input output
|
||||
entry :: Ord edge => Graph edge [label] -> ([edge], label) -> Graph edge [label]
|
||||
entry transducer (path, output) =
|
||||
rewind $ editVertex (weave transducer path) pushLabel
|
||||
where
|
||||
pushLabel vertex = vertex {label = output:(label vertex)}
|
||||
|
||||
loop :: Ord input => Transducer input output -> ([input], [input]) -> Transducer input output
|
||||
loop :: Ord edge => Graph edge [label] -> ([edge], [edge]) -> Graph edge [label]
|
||||
loop transducer ([], loopPath) =
|
||||
case splitAt (length loopPath - 1) loopPath of
|
||||
(_, []) -> transducer
|
||||
|
@ -41,23 +42,53 @@ loop transducer ([], loopPath) =
|
|||
vertex {edges = insert input (focus transducer) $ edges vertex}
|
||||
loop transducer (prefix, loopPath) = loop (weave transducer prefix) ([], loopPath)
|
||||
|
||||
fromList :: Ord input => [([input], Either [input] output)] -> Transducer input output
|
||||
fromList = foldl add empty
|
||||
fromList :: Ord input => [([input], Either [input] output)] -> Transducer input input output
|
||||
fromList l = Transducer {graph = foldl add empty l, projector = id, rules = []}
|
||||
where
|
||||
add transducer (path, Left loopPath) = loop transducer (path, loopPath)
|
||||
add transducer (path, Right output) = entry transducer (path, output)
|
||||
|
||||
run :: (Ord input, Eq output) => Transducer input output -> Stream input -> Stream ([input], output)
|
||||
run transducer = runWithStack initState
|
||||
where
|
||||
initState = (transducer, Stack [])
|
||||
emit stack stream output = ((content stack, output), runWithStack initState stream)
|
||||
runWithStack (state, stack) (Stream inputs) = foldl (\outputsStream (input, stream) ->
|
||||
case follow state input of
|
||||
data RunState input label output = RunState {
|
||||
transducer :: Transducer input label output
|
||||
, stack :: Stack input
|
||||
}
|
||||
|
||||
initState :: Transducer input label output -> RunState input label output
|
||||
initState transducer = RunState {
|
||||
transducer = transducer {graph = rewind $ graph transducer}
|
||||
, stack = Stack []
|
||||
}
|
||||
|
||||
newtype Stack a = Stack [a]
|
||||
|
||||
push :: a -> Stack a -> Stack a
|
||||
push a (Stack s) = Stack (a:s)
|
||||
|
||||
content :: Stack a -> [a]
|
||||
content (Stack s) = reverse s
|
||||
|
||||
tryInput :: (Eq input, Ord label, Eq output) => RunState input label output -> Stream ([input], output) -> (input, Stream input) -> Stream ([input], output)
|
||||
tryInput state@(RunState {transducer, stack}) outputsStream (input, stream) =
|
||||
case follow (graph transducer) (projector transducer $ input) of
|
||||
Nothing -> Stream.empty
|
||||
Just newState@(Graph {vertices, focus}) ->
|
||||
Just newGraph@(Graph {vertices, focus}) ->
|
||||
let newStack = push input stack in
|
||||
let emitted = Stream $ emit newStack stream <$> label (vertices ! focus) in
|
||||
let continue = runWithStack (newState, newStack) stream in
|
||||
let emitted = Stream $ emit newStack <$> label (vertices ! focus) in
|
||||
let continue = runState (state {
|
||||
transducer = transducer {graph = newGraph}
|
||||
, stack = newStack
|
||||
}) stream in
|
||||
emitted `merge` continue `merge` outputsStream
|
||||
) Stream.empty inputs
|
||||
where
|
||||
emit aStack output = (
|
||||
(content aStack, output), runState (initState transducer) stream
|
||||
)
|
||||
|
||||
runState :: (Eq input, Ord label, Eq output) => RunState input label output -> Stream input -> Stream ([input], output)
|
||||
runState state@(RunState {transducer}) (Stream inputs) =
|
||||
case find isJust $ ($ state) <$> rules transducer of
|
||||
Just (Just newState) -> runState newState (Stream inputs)
|
||||
_ -> foldl (tryInput state) Stream.empty inputs
|
||||
|
||||
run :: (Eq input, Ord label, Eq output) => Transducer input label output -> Stream input -> Stream ([input], output)
|
||||
run transducer = runState $ initState transducer
|
||||
|
|
Loading…
Reference in a new issue