# LearnedBot/memory.py

import csv
import json
from math import sqrt

# Length filter used by index(): a word is indexed only when it has strictly
# more than this many alphabetic characters.
WORD_THRESHOLD = 4

def build_db(inputCSV, outputJSON):
    """Build the quote database from a CSV file and save it as JSON.

    The first CSV row is treated as a header and skipped. Every following
    non-blank row is handed to insert(), which reads the title from column 0
    and the quote from column 1.

    Args:
        inputCSV: path of the comma-separated source file.
        outputJSON: path of the JSON file to write (overwritten).

    Returns:
        The in-memory database, i.e. the list of entries built by insert().
    """
    db = []
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(inputCSV, 'r', newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        # Skip the header row; the default keeps an empty file from raising.
        next(csv_reader, None)
        for row in csv_reader:
            # csv.reader yields [] for blank lines; they carry no quote.
            if row:
                insert(db, row)
    # Serialize before opening the output so that a failure cannot leave a
    # truncated (emptied) database file behind.
    payload = serialize(db)
    with open(outputJSON, 'w') as file:
        json.dump(payload, file)
    return db

def insert(db, row):
    """Append to db the entry built from one CSV row.

    Column 0 of the row is stored as 'title', column 1 as 'quote', and the
    word index computed from the quote as 'index'.
    """
    title, quote = row[0], row[1]
    entry = {'title': title, 'quote': quote, 'index': index(quote)}
    db.append(entry)

"""
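WARNING: known pitfall. JSON has no set type, so the 'index' property of every
entry has to be converted set -> list when serializing and list -> set when
deserializing, while the other properties are left untouched. Passing 'index'
both through **row and as a keyword to dict() raises TypeError, so that
spelling cannot be used to replace the property.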
"""
def serialize(db):
    """Return a JSON-serializable copy of the database.

    JSON cannot represent sets, so the 'index' set of each entry is replaced
    by a sorted list (sorting keeps the written file reproducible). All other
    keys are copied as they are, and the input entries are not modified.

    Args:
        db: list of entry dicts, each holding an 'index' set.

    Returns:
        A new list of new dicts whose 'index' value is a list.
    """
    # {**row, 'index': ...} overrides the key; dict(**row, index=...) would
    # raise TypeError because 'index' would be supplied twice.
    return [{**row, 'index': sorted(row['index'])} for row in db]

def unserialize(db):
    """Rebuild the in-memory database from its JSON-decoded form.

    The 'index' list of each entry is turned back into a set. All other keys
    are copied as they are, and the input entries are not modified.

    Args:
        db: list of entry dicts, each holding an 'index' list.

    Returns:
        A new list of new dicts whose 'index' value is a set.
    """
    # {**row, 'index': ...} overrides the key; dict(**row, index=...) would
    # raise TypeError because 'index' would be supplied twice.
    return [{**row, 'index': set(row['index'])} for row in db]
"""
End of the list <-> set conversion code flagged as broken; according to Cafou,
this was not considered too serious.
"""

def open_db(filePath):
    """Load the database stored as JSON at filePath.

    The decoded JSON content is passed through unserialize() and the result
    is returned.
    """
    with open(filePath, 'r') as handle:
        stored_rows = json.load(handle)
    return unserialize(stored_rows)

def keepOnlyAlphaChars(word):
    """Return word with every non-alphabetic character removed.

    Characters are kept in their original order; str.isalpha decides which
    ones are alphabetic, so accented letters are preserved.
    """
    letters = filter(str.isalpha, word)
    return ''.join(letters)

def index(text):
    """Return the set of important words found in text.

    The text is split on whitespace, each word is reduced to its alphabetic
    characters with keepOnlyAlphaChars(), and only the words strictly longer
    than WORD_THRESHOLD characters are kept.

    Args:
        text: the string to index.

    Returns:
        A set of the retained words (empty when no word is long enough).
    """
    # split() without argument cuts on any run of whitespace, so two words
    # separated by a newline or a tab are not fused into one bogus token.
    normalized_words = (keepOnlyAlphaChars(word) for word in text.split())
    return {w for w in normalized_words if len(w) > WORD_THRESHOLD}

"""
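We define a similarity measure on sets: the number of elements they have in
common, divided by the geometric mean of the two set sizes (the cosine
similarity of the sets seen as binary vectors).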
"""
def scalar(a, b):
    """Return the normalized overlap between two sets.

    The score is len(a & b) / sqrt(len(a) * len(b)): 0.0 when the sets share
    nothing and 1.0 when they are equal and non-empty.

    Args:
        a: first set.
        b: second set.

    Returns:
        The similarity as a float; 0.0 if either set is empty.
    """
    # An empty set has nothing in common with anything; returning 0.0 avoids
    # the ZeroDivisionError the formula would otherwise raise.
    if not a or not b:
        return 0.0
    return len(a.intersection(b)) / sqrt(len(a) * len(b))

def find_best_quote(db, user_input):
    indexed_input = index(user_input)
    max_score = None
    for entry in db:
        score = scalar(indexed_input, entry