diff --git a/memory.py b/memory.py index 796411d..3a283b8 100644 --- a/memory.py +++ b/memory.py @@ -10,32 +10,52 @@ def build_db(inputCSV, outputJSON): data = False for row in csv_reader: if data: - db.append(row + (index(row[1]),)) + insert(db, row) else: data = True with open(outputJSON, 'w') as file: json.dump(serialize(db), file) return db +def insert(db, row): + db.append({'title': row[0], 'quote': row[1], 'index': index(row[1])}) + +""" +/!\ C'est cassé, il faut trouver comment remplacer élégament juste la +propriété 'index' du dictionnaire en faisant list <-> set entre sérialisation et +désérialisation +""" def serialize(db): - return list(map(lambda row: (row[0], row[1], list(row[2])), db)) + return list(map(lambda row: dict(**row, index=list(row['index'])), db)) def unserialize(db): - return list(map(lambda row: (row[0], row[1], set(row[2])), db)) + return list(map(lambda row: dict(**row, index=set(row['index'])), db)) +""" +Du coup c'est cassé jusqu'ici mais en vrai Cafou il a dit c'était pas trop grave +""" def open_db(filePath): with open(filePath, 'r') as file: return unserialize(json.load(file)) +def keepOnlyAlphaChars(word): + return ''.join([c for c in word if c.isalpha()]) + +def index(text): + words = text.split(' ') + normalized_words = [keepOnlyAlphaChars(word) for word in words] + important_words = set([w for w in normalized_words if len(w) > WORD_THRESHOLD]) + return important_words + +""" +We define a similarity measure on sets which counts the number of elements +they have in common +""" def scalar(a, b): return len(a.intersection(b)) -def find_best_quote(db, indexed_input) +def find_best_quote(db, user_input): + indexed_input = index(user_input) max_score = None for entry in db: - - -def index(text): - words = map(lambda w: ''.join([c for c in w if c.isalpha()]), text.split(' ')) - important_words = set([w for w in words if len(w) > WORD_THRESHOLD]) - return important_words + score = scalar(indexed_input, entry