import csv
import json
from math import sqrt
import re

# Words must be strictly longer than this many letters to be indexed.
WORD_THRESHOLD = 4

# Raw string: '\s' in a plain literal is an invalid escape sequence.
_WHITESPACE = re.compile(r'\s')


class RowDecoder(json.JSONDecoder):
    """JSON decoder that restores each row's 'index' list to a set."""

    def decode(self, s):
        """Decode *s* and convert every row's 'index' value to a set.

        The document is expected to be a JSON array of objects that each
        carry an 'index' key; a row without that key raises KeyError.
        """
        db = super().decode(s)
        return [{**obj, 'index': set(obj['index'])} for obj in db]


class RowEncoder(json.JSONEncoder):
    """JSON encoder that serializes sets as lists."""

    def default(self, obj):
        """Return a list for a set; defer to the base class otherwise.

        Raises:
            TypeError: if *obj* is not a set and is not JSON serializable.
        """
        if isinstance(obj, set):
            return list(obj)
        # Go through super() so `self` is passed; calling
        # json.JSONEncoder.default(obj) unbound fails with a misleading
        # "missing argument" error instead of "not JSON serializable".
        return super().default(obj)


def keepOnlyAlphaChars(word):
    """Return *word* with every non-alphabetic character removed."""
    return ''.join(c for c in word if c.isalpha())


def index(text):
    """Return the set of words in *text* longer than WORD_THRESHOLD letters.

    The text is split on single whitespace characters and each piece is
    stripped of non-alphabetic characters before its length is checked.
    """
    words = _WHITESPACE.split(text)
    normalized_words = (keepOnlyAlphaChars(word) for word in words)
    return {w for w in normalized_words if len(w) > WORD_THRESHOLD}


def insert(db, row):
    """Append to *db* a record built from *row*.

    row[0] is stored as 'title', row[1] as 'quote', and the indexed words
    of the quote as 'index'. A row with fewer than two fields raises
    IndexError.
    """
    db.append({'title': row[0], 'quote': row[1], 'index': index(row[1])})


def build_db(inputCSV):
    """Read the comma-separated file *inputCSV* and return a list of rows.

    Each returned row is a dict as produced by insert().
    """
    db = []
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(inputCSV, 'r', newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        # NOTE(review): the loop header is not visible in the reviewed diff
        # (it sits between two hunks). It is reconstructed from the visible
        # `insert(db, row) / else: data = True` tail as "skip the first row,
        # insert every later row" - confirm against the real file.
        data = False
        for row in csv_reader:
            if data:
                insert(db, row)
            else:
                data = True
    return db


def save_db(db, outputJSON):
    """Write *db* to the file *outputJSON* as JSON, using RowEncoder."""
    with open(outputJSON, 'w') as file:
        json.dump(db, file, cls=RowEncoder)


def open_db(filePath):
    """Load the JSON file *filePath* with RowDecoder and return the result."""
    with open(filePath, 'r') as file:
        return json.load(file, cls=RowDecoder)


# We define a similarity measure on sets which counts the number of elements
# (this sentence continues beyond the reviewed excerpt)