Improve db handling and indexing a wee bit

2024-01-27 17:33:43 +01:00 · 2024-01-27 17:33:43 +01:00 · dc1253bbb1
commit dc1253bbb1
parent 6e72c38153
1 changed files with 30 additions and 10 deletions
--- a/memory.py
+++ b/memory.py
@ -10,32 +10,52 @@ def build_db(inputCSV, outputJSON):
        data = False
        for row in csv_reader:
            if data:
-                db.append(row + (index(row[1]),))
+                insert(db, row)
            else:
                data = True
    with open(outputJSON, 'w') as file:
        json.dump(serialize(db), file)
    return db

+def insert(db, row):
+    db.append({'title': row[0], 'quote': row[1], 'index': index(row[1])})
+
+"""
+WARNING: this is broken. dict(**row, index=...) passes 'index' twice and raises
+TypeError; we need an elegant way to replace only the 'index' property of the
+dict, converting list <-> set between serialization and deserialization.
+"""
 def serialize(db):
-    return list(map(lambda row: (row[0], row[1], list(row[2])), db))
+    return list(map(lambda row: dict(**row, index=list(row['index'])), db))

 def unserialize(db):
-    return list(map(lambda row: (row[0], row[1], set(row[2])), db))
+    return list(map(lambda row: dict(**row, index=set(row['index'])), db))
+"""
+So everything is broken up to this point, but Cafou actually said it was not a big deal
+"""

 def open_db(filePath):
    with open(filePath, 'r') as file:
        return unserialize(json.load(file))

+def keepOnlyAlphaChars(word):
+    return ''.join([c for c in word if c.isalpha()])
+
+def index(text):
+    words = text.split(' ')
+    normalized_words = [keepOnlyAlphaChars(word) for word in words]
+    important_words = set([w for w in normalized_words if len(w) > WORD_THRESHOLD])
+    return important_words
+
+"""
+We define a similarity measure on sets which counts the number of elements
+they have in common
+"""
 def scalar(a, b):
    return len(a.intersection(b))

-def find_best_quote(db, indexed_input)
+def find_best_quote(db, user_input):
+    indexed_input = index(user_input)
    max_score = None
    for entry in db:
-
-
-def index(text):
-    words = map(lambda w: ''.join([c for c in w if c.isalpha()]), text.split(' '))
-    important_words = set([w for w in words if len(w) > WORD_THRESHOLD])
-    return important_words
+        score = scalar(indexed_input, entry