LearnedBot/memory.py

import csv
import json
from math import sqrt

WORD_THRESHOLD = 4

def build_db(inputCSV, outputJSON):
    """Build the quote database from a CSV file and save it as JSON.

    The first CSV row is treated as a header and skipped. Every following
    non-empty row is handed to ``insert``, which reads the title from
    column 0 and the quote from column 1.

    Args:
        inputCSV: Path of the comma-separated source file.
        outputJSON: Path of the JSON file to write (overwritten if present).

    Returns:
        The in-memory database, i.e. the list filled by ``insert``.
    """
    db = []
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires, so quoted fields spanning lines are read intact.
    with open(inputCSV, 'r', newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        next(csv_reader, None)  # skip the header row; tolerate an empty file
        for row in csv_reader:
            # csv.reader yields [] for blank lines; they carry no quote.
            if row:
                insert(db, row)
    with open(outputJSON, 'w') as file:
        json.dump(serialize(db), file)
    return db

def insert(db, row):
    """Append to ``db`` the entry described by one CSV ``row``.

    Column 0 is stored as the title, column 1 as the quote, and the result of
    ``index`` applied to the quote is stored under the 'index' key.
    """
    title, quote = row[0], row[1]
    entry = {'title': title, 'quote': quote, 'index': index(quote)}
    db.append(entry)

"""
/!\ C'est cassé, il faut trouver comment remplacer élégament juste la
propriété 'index' du dictionnaire en faisant list <-> set entre sérialisation et
désérialisation
"""
def serialize(db):
    return list(map(lambda row: dict(**row, index=list(row['index'])), db))

def unserialize(db):
    return list(map(lambda row: dict(**row, index=set(row['index'])), db))
"""
Du coup c'est cassé jusqu'ici mais en vrai Cafou il a dit c'était pas trop grave
"""

def open_db(filePath):
    """Load the JSON database stored at ``filePath``.

    The parsed JSON content is passed through ``unserialize`` and the result
    is returned.
    """
    with open(filePath, 'r') as handle:
        raw_entries = json.load(handle)
    return unserialize(raw_entries)

def keepOnlyAlphaChars(word):
    """Return ``word`` stripped of every character that is not alphabetic."""
    return ''.join(filter(str.isalpha, word))

def index(text):
    """Return the set of "important" words found in ``text``.

    The text is split on any run of whitespace, each piece is reduced to its
    alphabetic characters, and only the words strictly longer than
    WORD_THRESHOLD characters are kept.
    """
    # split() with no argument also breaks on tabs and newlines; split(' ')
    # would leave e.g. "end\nstart" as one piece that then collapses into the
    # bogus word "endstart".
    normalized_words = (keepOnlyAlphaChars(word) for word in text.split())
    return {w for w in normalized_words if len(w) > WORD_THRESHOLD}

"""
We define a similarity measure on sets which counts the number of elements
they have in common
"""
def scalar(a, b):
    """Return the normalised overlap between the sets ``a`` and ``b``.

    The score is the number of elements the two sets have in common divided
    by sqrt(len(a) * len(b)): 1.0 for identical non-empty sets, 0.0 for
    disjoint ones. An empty set shares nothing with any other set, so the
    score is 0.0 in that case too rather than a division by zero.
    """
    if not a or not b:
        return 0.0
    return len(a.intersection(b)) / sqrt(len(a) * len(b))

def find_best_quote(db, user_input):
    indexed_input = index(user_input)
    max_score = None
    for entry in db:
        score = scalar(indexed_input, entry
Commit a draft from after the first working session 2022-11-27 22:24:29 +01:00			`import csv`
			`import json`
Normalise the scalar product to get comparable results unbiased by the particular length of a given (word) set 2024-02-03 11:45:39 +01:00			`from math import sqrt`
Commit a draft from after the first working session 2022-11-27 22:24:29 +01:00
			`WORD_THRESHOLD = 4`

			`def build_db(inputCSV, outputJSON):`
			`db = []`
			`with open(inputCSV, 'r') as file:`
			`csv_reader = csv.reader(file, delimiter=',')`
			`data = False`
			`for row in csv_reader:`
			`if data:`
Improve db handling and indexation a wee bit 2024-01-27 17:33:43 +01:00			`insert(db, row)`
Commit a draft from after the first working session 2022-11-27 22:24:29 +01:00			`else:`
			`data = True`
			`with open(outputJSON, 'w') as file:`
			`json.dump(serialize(db), file)`
			`return db`

Improve db handling and indexation a wee bit 2024-01-27 17:33:43 +01:00			`def insert(db, row):`
			`db.append({'title': row[0], 'quote': row[1], 'index': index(row[1])})`

			`"""`
			`/!\ C'est cassé, il faut trouver comment remplacer élégament juste la`
			`propriété 'index' du dictionnaire en faisant list <-> set entre sérialisation et`
			`désérialisation`
			`"""`
Commit a draft from after the first working session 2022-11-27 22:24:29 +01:00			`def serialize(db):`
Improve db handling and indexation a wee bit 2024-01-27 17:33:43 +01:00			`return list(map(lambda row: dict(**row, index=list(row['index'])), db))`
Commit a draft from after the first working session 2022-11-27 22:24:29 +01:00
			`def unserialize(db):`
Improve db handling and indexation a wee bit 2024-01-27 17:33:43 +01:00			`return list(map(lambda row: dict(**row, index=set(row['index'])), db))`
			`"""`
			`Du coup c'est cassé jusqu'ici mais en vrai Cafou il a dit c'était pas trop grave`
			`"""`
Commit a draft from after the first working session 2022-11-27 22:24:29 +01:00
			`def open_db(filePath):`
			`with open(filePath, 'r') as file:`
			`return unserialize(json.load(file))`

Improve db handling and indexation a wee bit 2024-01-27 17:33:43 +01:00			`def keepOnlyAlphaChars(word):`
			`return ''.join([c for c in word if c.isalpha()])`

			`def index(text):`
			`words = text.split(' ')`
			`normalized_words = [keepOnlyAlphaChars(word) for word in words]`
			`important_words = set([w for w in normalized_words if len(w) > WORD_THRESHOLD])`
			`return important_words`

			`"""`
			`We define a similarity measure on sets which counts the number of elements`
			`they have in common`
			`"""`
Commit a draft from after the first working session 2022-11-27 22:24:29 +01:00			`def scalar(a, b):`
Normalise the scalar product to get comparable results unbiased by the particular length of a given (word) set 2024-02-03 11:45:39 +01:00			`return len(a.intersection(b))/sqrt(len(a)*len(b))`
Commit a draft from after the first working session 2022-11-27 22:24:29 +01:00
Improve db handling and indexation a wee bit 2024-01-27 17:33:43 +01:00			`def find_best_quote(db, user_input):`
			`indexed_input = index(user_input)`
Commit a draft from after the first working session 2022-11-27 22:24:29 +01:00			`max_score = None`
			`for entry in db:`
Improve db handling and indexation a wee bit 2024-01-27 17:33:43 +01:00			`score = scalar(indexed_input, entry`