LearnedBot/memory.py

63 lines
1.7 KiB
Python
Raw Normal View History

import csv
import json
from math import sqrt
# Length filter used when indexing text: only words strictly longer than this
# many letters are kept (see the len(w) > WORD_THRESHOLD test in index()).
WORD_THRESHOLD = 4
def build_db(inputCSV, outputJSON):
    """Build the quote database from a CSV file and save it as JSON.

    The first CSV row is treated as a header and skipped. Every following
    non-empty row is handed to ``insert``, which reads the title from
    column 0 and the quote from column 1.

    Args:
        inputCSV: Path of the comma-separated source file.
        outputJSON: Path of the JSON file to write (opened in 'w' mode, so
            any existing content is replaced).

    Returns:
        The in-memory database: a list of entry dictionaries.
    """
    db = []
    # The csv module asks for newline='' so that line breaks embedded in
    # quoted fields reach the parser untouched.
    with open(inputCSV, 'r', newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        # Skip the header row; the default keeps an empty file from raising.
        next(csv_reader, None)
        for row in csv_reader:
            # csv.reader yields [] for blank lines; they carry no entry.
            if row:
                insert(db, row)
    with open(outputJSON, 'w') as file:
        json.dump(serialize(db), file)
    return db
def insert(db, row):
    """Append the entry described by one CSV row to ``db`` (in place).

    The entry stores ``row[0]`` under 'title', ``row[1]`` under 'quote' and
    the word index of the quote under 'index'.
    """
    title, quote = row[0], row[1]
    entry = {'title': title, 'quote': quote, 'index': index(quote)}
    db.append(entry)
"""
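WARNING: dict(**row, index=...) is broken -- 'index' is already a key of row,
so it is passed twice and dict() raises TypeError. We need an elegant way to
replace only the 'index' property of the dictionary, converting list <-> set
between serialization and deserialization.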
"""
def serialize(db):
    """Return a JSON-friendly copy of the database.

    ``json`` cannot encode sets, so every entry is copied with its 'index'
    set replaced by a list (sorted, so the output is deterministic). The
    entries of ``db`` themselves are not modified.

    The copy is built with a dict display on purpose:
    ``dict(**row, index=...)`` raises TypeError because 'index' is already a
    key of ``row`` and would be supplied twice.

    Args:
        db: List of entry dictionaries whose 'index' value is a set.

    Returns:
        A new list of dictionaries whose 'index' value is a sorted list.
    """
    return [{**row, 'index': sorted(row['index'])} for row in db]


def unserialize(db):
    """Rebuild the in-memory database from its JSON-friendly form.

    Inverse of ``serialize``: every entry is copied with its 'index' list
    turned back into a set; all other keys are copied unchanged.

    Args:
        db: List of entry dictionaries whose 'index' value is a list.

    Returns:
        A new list of dictionaries whose 'index' value is a set.
    """
    return [{**row, 'index': set(row['index'])} for row in db]
def open_db(filePath):
    """Load a database previously written as JSON.

    Reads the JSON document at ``filePath`` and passes it through
    ``unserialize`` before returning it.
    """
    with open(filePath, 'r') as handle:
        stored_entries = json.load(handle)
    return unserialize(stored_entries)
def keepOnlyAlphaChars(word):
    """Return ``word`` with every non-alphabetic character removed."""
    letters = filter(str.isalpha, word)
    return ''.join(letters)
def index(text):
    """Return the set of words of ``text`` that are kept for matching.

    The text is split on single spaces, each piece is reduced to its
    alphabetic characters, and only pieces strictly longer than
    WORD_THRESHOLD are retained.
    """
    cleaned = map(keepOnlyAlphaChars, text.split(' '))
    return {token for token in cleaned if len(token) > WORD_THRESHOLD}
"""
We define a similarity measure on sets: the number of elements they have in
common, divided by the geometric mean of the two set sizes,
sqrt(len(a) * len(b)).
"""
def scalar(a, b):
    """Return the normalized overlap between two sets.

    The score is ``len(a & b) / sqrt(len(a) * len(b))``: the number of shared
    elements divided by the geometric mean of the two sizes. Identical
    non-empty sets score 1.0 and disjoint sets score 0.0.

    Args:
        a: First set.
        b: Second set.

    Returns:
        The similarity as a float. When either set is empty there is nothing
        to share, so 0.0 is returned instead of dividing by zero.
    """
    # An empty set makes the denominator 0; treat it as "no similarity".
    if not a or not b:
        return 0.0
    return len(a.intersection(b)) / sqrt(len(a) * len(b))
def find_best_quote(db, user_input):
indexed_input = index(user_input)
max_score = None
for entry in db:
score = scalar(indexed_input, entry