Rework serialization using proper JSONEncoder/JSONDecoder subclasses, separate building the DB from saving it, and use a regex to split on any whitespace character instead of just ' ' (0x20)

2024-02-03 12:47:30 +01:00 · 2024-02-03 12:47:30 +01:00 · db5a9dabf9
commit db5a9dabf9
parent 1445322013
1 changed files with 30 additions and 29 deletions
--- a/memory.py
+++ b/memory.py
@@ -1,10 +1,35 @@
 import csv
 import json
 from math import sqrt
+import re

 WORD_THRESHOLD = 4

-def build_db(inputCSV, outputJSON):
+class RowDecoder(json.JSONDecoder):  # JSON decoder that turns each row's 'index' list into a set
+    def decode(self, s):
+        rows = super().decode(s)  # plain JSON parse; the result is iterated as a list of dicts
+        return [dict(row, index=set(row['index'])) for row in rows]
+
+class RowEncoder(json.JSONEncoder):  # JSON encoder that serializes sets as lists
+    def default(self, obj):  # json calls this only for objects it cannot serialize natively
+        if isinstance(obj, set):
+            return list(obj)  # JSON has no set type, so emit the elements as an array
+        else:
+            return super().default(obj)  # bound call: the base class raises its usual TypeError
+
+def keepOnlyAlphaChars(word):  # drop every character for which str.isalpha() is false
+    return ''.join(filter(str.isalpha, word))
+
+def index(text):  # return the set of alphabetic-only words of text longer than WORD_THRESHOLD
+    words = re.split(r'\s+', text)  # raw string: '\s' is an invalid escape in a normal literal
+    normalized_words = [keepOnlyAlphaChars(word) for word in words]
+    important_words = {w for w in normalized_words if len(w) > WORD_THRESHOLD}  # also drops empty tokens
+    return important_words
+
+def insert(db, row):  # append a record to db: row[0] is stored as 'title', row[1] as 'quote'
+    db.append(dict(title=row[0], quote=row[1], index=index(row[1])))
+
+def build_db(inputCSV):
    db = []
    with open(inputCSV, 'r') as file:
        csv_reader = csv.reader(file, delimiter=',')
@@ -14,39 +39,15 @@ def build_db(inputCSV, outputJSON):
                insert(db, row)
            else:
                data = True
-    with open(outputJSON, 'w') as file:
-        json.dump(serialize(db), file)
    return db

-def insert(db, row):
-    db.append({'title': row[0], 'quote': row[1], 'index': index(row[1])})
-
-"""
-/!\ C'est cassé, il faut trouver comment remplacer élégament juste la
-propriété 'index' du dictionnaire en faisant list <-> set entre sérialisation et
-désérialisation
-"""
-def serialize(db):
-    return list(map(lambda row: dict(**row, index=list(row['index'])), db))
-
-def unserialize(db):
-    return list(map(lambda row: dict(**row, index=set(row['index'])), db))
-"""
-Du coup c'est cassé jusqu'ici mais en vrai Cafou il a dit c'était pas trop grave
-"""
+def save_db(db, outputJSON):  # write db to the file at outputJSON as JSON, encoding with RowEncoder
+    with open(outputJSON, mode='w') as out:
+        json.dump(db, fp=out, cls=RowEncoder)

 def open_db(filePath):
    with open(filePath, 'r') as file:
-        return unserialize(json.load(file))
-
-def keepOnlyAlphaChars(word):
-    return ''.join([c for c in word if c.isalpha()])
-
-def index(text):
-    words = text.split(' ')
-    normalized_words = [keepOnlyAlphaChars(word) for word in words]
-    important_words = set([w for w in normalized_words if len(w) > WORD_THRESHOLD])
-    return important_words
+        return json.load(file, cls=RowDecoder)

 """
 We define a similarity measure on sets which counts the number of elements