From b5432a43586ed5b857f70930fe5118be782adadd Mon Sep 17 00:00:00 2001 From: Tissevert Date: Sun, 2 Jun 2019 22:06:57 +0200 Subject: [PATCH] Try and implement basic linguistic stuff for english --- Mainate.cabal | 2 + src/Lang/En/Grapheme.hs | 100 ++++++++++++++++++++++++++++++++++++++++ src/Lang/En/Morpheme.hs | 53 +++++++++++++++++++++ src/Transducer.hs | 85 +++++++++++++++++++++++----------- 4 files changed, 213 insertions(+), 27 deletions(-) create mode 100644 src/Lang/En/Grapheme.hs create mode 100644 src/Lang/En/Morpheme.hs diff --git a/Mainate.cabal b/Mainate.cabal index 2f65908..b289cf1 100644 --- a/Mainate.cabal +++ b/Mainate.cabal @@ -19,6 +19,8 @@ extra-source-files: CHANGELOG.md library exposed-modules: Graph + , Lang.En.Grapheme + , Lang.En.Morpheme , Stream , Transducer , Tree diff --git a/src/Lang/En/Grapheme.hs b/src/Lang/En/Grapheme.hs new file mode 100644 index 0000000..ec75d7b --- /dev/null +++ b/src/Lang/En/Grapheme.hs @@ -0,0 +1,100 @@ +{-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE FlexibleInstances #-} +module Lang.En.Grapheme ( + Grapheme(..) + , Output(..) + , Regular(..) + , graphemes + , parse + ) where + +import Data.Char (toLower, isLower) +import Data.List (groupBy) +import qualified Data.Map as Map (empty, insert, lookup) +import Data.String (IsString(..)) +--import Graph (Graph(..)) +import Stream (Stream) +import Transducer (Transducer(..), fromList, run) +--import Transducer (Transducer(..), RunState(..), Stack(..), fromList) + +data Regular = + A + | Ae + | Ai + | Ay + | E + | Ea + | Ee + | Ei + | B + | C + | Ck + | D + | F + | G + | Gh + | L + | P + | S + | T + | Th + | W + | Wh + deriving (Bounded, Enum, Eq, Ord, Read, Show) + +{- +card :: Int +card = fromEnum (maxBound :: Regular) +-} + +data Grapheme = Grapheme Regular | Punctuation String deriving (Eq, Ord, Show) + +instance Read Grapheme where + readsPrec _ next = + [(maybe (Punctuation next) Grapheme $ Map.lookup next stringToGrapheme, "")] + where + stringToGrapheme = foldr insert Map.empty [minBound .. maxBound] + insert regular = Map.insert (show regular) regular + + +instance {-# OVERLAPS #-} IsString [Grapheme] where + fromString = fmap read . groupBy (const isLower) + +data Output = Output { + rawText :: String + , grapheme :: Grapheme + } deriving (Eq, Show) + +{- +instance Enum Grapheme where + fromEnum (Regular r) = fromEnum r + fromEnum (Noise c) = card + fromEnum c + + toEnum n + | n < card = Regular $ toEnum n + | otherwise = Noise $ toEnum (n - card) +-} + +auto :: Regular -> ([Char], Either [Char] Grapheme) +auto r = (toLower <$> show r, Right (Grapheme r)) + +punctuation :: String -> ([Char], Either [Char] Grapheme) +punctuation signs = (signs, Right (Punctuation signs)) + +{- +groupSpaces :: RunState Char Char Grapheme -> Maybe (RunState Char Char Grapheme) +groupSpaces state@(RunState {transducer, stack = Stack (' ':ls)}) + | focus (graph transducer) == 0 = Just $ state {stack = Stack ls} +groupSpaces _ = Nothing +-} + +graphemes :: Transducer Char Char Grapheme +graphemes = (fromList $ punctuations ++ regulars) { + projector = toLower + } + where + regulars = auto <$> [minBound .. maxBound] + punctuations = punctuation <$> ["?", "!", ".", ",", ":", " "] + +parse :: Stream Char -> Stream Output +parse = fmap (uncurry Output) . run graphemes diff --git a/src/Lang/En/Morpheme.hs b/src/Lang/En/Morpheme.hs new file mode 100644 index 0000000..ed374dc --- /dev/null +++ b/src/Lang/En/Morpheme.hs @@ -0,0 +1,53 @@ +{-# LANGUAGE NamedFieldPuns #-} +module Lang.En.Morpheme ( + Morpheme(..) + , Output(..) + , morphemes + , parse + ) where + +import Data.String (fromString) +import Lang.En.Grapheme (Grapheme(..)) +import qualified Lang.En.Grapheme as Grapheme (Output(..)) +import Stream (Stream) +import Transducer (Transducer(..), fromList, run) + +data Regular = + LY + | ED + | UN + | CAT + | DOG + | EaT + | SLEeP + | A + | ThE + | GOoD + | KINd + | ANd + | Space + deriving (Bounded, Enum, Eq, Ord, Show) + +data Morpheme = Morpheme Regular | Punctuation String deriving (Eq, Ord, Show) + +auto :: Morpheme -> ([Grapheme], Either [Grapheme] Morpheme) +auto morpheme = (fromString $ show morpheme, Right morpheme) + +morphemes :: Transducer Grapheme.Output Grapheme Morpheme +morphemes = (fromList $ auto <$> [minBound .. maxBound]) { + projector = Grapheme.grapheme + , rules = [] + } + +data Output = Output { + rawText :: String + , morpheme :: Morpheme + } deriving (Eq, Show) + +parse :: Stream Grapheme.Output -> Stream Output +parse = fmap fusionGraphemes . run morphemes + where + fusionGraphemes (graphemes, morpheme) = Output { + rawText = concat $ Grapheme.rawText <$> graphemes + , morpheme + } diff --git a/src/Transducer.hs b/src/Transducer.hs index 8645944..540947a 100644 --- a/src/Transducer.hs +++ b/src/Transducer.hs @@ -1,34 +1,35 @@ {-# LANGUAGE NamedFieldPuns #-} module Transducer ( - Transducer + RunState(..) + , Stack(..) + , Transducer(..) , fromList , run ) where +import Data.List (find) import Data.Map ((!), insert) +import Data.Maybe (isJust) import Graph (Graph(..), Vertex(..), editVertex, follow, rewind, singleton, weave) import Stream (Stream(..), merge) import qualified Stream (empty) -type Transducer input output = Graph input [output] -newtype Stack a = Stack [a] +data Transducer input label output = Transducer { + graph :: Graph label [output] + , rules :: [RunState input label output -> Maybe (RunState input label output)] + , projector :: input -> label + } -push :: a -> Stack a -> Stack a -push a (Stack s) = Stack (a:s) - -content :: Stack a -> [a] -content (Stack s) = reverse s - -empty :: Transducer input output +empty :: Graph edge [label] empty = singleton [] -entry :: Ord input => Transducer input output -> ([input], output) -> Transducer input output +entry :: Ord edge => Graph edge [label] -> ([edge], label) -> Graph edge [label] entry transducer (path, output) = rewind $ editVertex (weave transducer path) pushLabel where pushLabel vertex = vertex {label = output:(label vertex)} -loop :: Ord input => Transducer input output -> ([input], [input]) -> Transducer input output +loop :: Ord edge => Graph edge [label] -> ([edge], [edge]) -> Graph edge [label] loop transducer ([], loopPath) = case splitAt (length loopPath - 1) loopPath of (_, []) -> transducer @@ -41,23 +42,53 @@ loop transducer ([], loopPath) = vertex {edges = insert input (focus transducer) $ edges vertex} loop transducer (prefix, loopPath) = loop (weave transducer prefix) ([], loopPath) -fromList :: Ord input => [([input], Either [input] output)] -> Transducer input output -fromList = foldl add empty +fromList :: Ord input => [([input], Either [input] output)] -> Transducer input input output +fromList l = Transducer {graph = foldl add empty l, projector = id, rules = []} where add transducer (path, Left loopPath) = loop transducer (path, loopPath) add transducer (path, Right output) = entry transducer (path, output) -run :: (Ord input, Eq output) => Transducer input output -> Stream input -> Stream ([input], output) -run transducer = runWithStack initState +data RunState input label output = RunState { + transducer :: Transducer input label output + , stack :: Stack input + } + +initState :: Transducer input label output -> RunState input label output +initState transducer = RunState { + transducer = transducer {graph = rewind $ graph transducer} + , stack = Stack [] + } + +newtype Stack a = Stack [a] + +push :: a -> Stack a -> Stack a +push a (Stack s) = Stack (a:s) + +content :: Stack a -> [a] +content (Stack s) = reverse s + +tryInput :: (Eq input, Ord label, Eq output) => RunState input label output -> Stream ([input], output) -> (input, Stream input) -> Stream ([input], output) +tryInput state@(RunState {transducer, stack}) outputsStream (input, stream) = + case follow (graph transducer) (projector transducer $ input) of + Nothing -> Stream.empty + Just newGraph@(Graph {vertices, focus}) -> + let newStack = push input stack in + let emitted = Stream $ emit newStack <$> label (vertices ! focus) in + let continue = runState (state { + transducer = transducer {graph = newGraph} + , stack = newStack + }) stream in + emitted `merge` continue `merge` outputsStream where - initState = (transducer, Stack []) - emit stack stream output = ((content stack, output), runWithStack initState stream) - runWithStack (state, stack) (Stream inputs) = foldl (\outputsStream (input, stream) -> - case follow state input of - Nothing -> Stream.empty - Just newState@(Graph {vertices, focus}) -> - let newStack = push input stack in - let emitted = Stream $ emit newStack stream <$> label (vertices ! focus) in - let continue = runWithStack (newState, newStack) stream in - emitted `merge` continue `merge` outputsStream - ) Stream.empty inputs + emit aStack output = ( + (content aStack, output), runState (initState transducer) stream + ) + +runState :: (Eq input, Ord label, Eq output) => RunState input label output -> Stream input -> Stream ([input], output) +runState state@(RunState {transducer}) (Stream inputs) = + case find isJust $ ($ state) <$> rules transducer of + Just (Just newState) -> runState newState (Stream inputs) + _ -> foldl (tryInput state) Stream.empty inputs + +run :: (Eq input, Ord label, Eq output) => Transducer input label output -> Stream input -> Stream ([input], output) +run transducer = runState $ initState transducer