import csv
import json
from math import sqrt

# A word must be strictly longer than this many letters to be indexed.
WORD_THRESHOLD = 4


def build_db(inputCSV, outputJSON):
    """Build the quote database from a CSV file and save it as JSON.

    The first CSV row is treated as a header and skipped. Every following
    non-empty row is expected to hold the title in column 0 and the quote
    in column 1.

    Args:
        inputCSV: Path of the comma-separated source file.
        outputJSON: Path of the JSON file to (over)write.

    Returns:
        The in-memory database: a list of entries as created by `insert`.
    """
    db = []
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(inputCSV, 'r', newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        next(csv_reader, None)  # skip the header row (no-op on an empty file)
        for row in csv_reader:
            # csv.reader yields [] for blank lines; they carry no quote.
            if row:
                insert(db, row)
    with open(outputJSON, 'w') as file:
        json.dump(serialize(db), file)
    return db


def insert(db, row):
    """Append the entry described by a CSV row to `db` (mutated in place).

    Args:
        db: List of entries to extend.
        row: Sequence whose item 0 is the title and item 1 the quote.
    """
    db.append({'title': row[0], 'quote': row[1], 'index': index(row[1])})


# JSON has no set type, so the 'index' property of every entry is converted
# set -> list when serializing and list -> set when unserializing. The other
# properties of the entry are copied unchanged.

def serialize(db):
    """Return a JSON-compatible copy of `db` ('index' sets become lists).

    The words are sorted so that the generated file is deterministic.
    The entries of `db` are not modified.
    """
    # {**row, 'index': ...} overrides the key; dict(**row, index=...) would
    # raise TypeError because 'index' would be passed twice.
    return [{**row, 'index': sorted(row['index'])} for row in db]


def unserialize(db):
    """Return a copy of a serialized `db` with 'index' lists turned into sets."""
    return [{**row, 'index': set(row['index'])} for row in db]


def open_db(filePath):
    """Load a database previously written by `build_db` from a JSON file."""
    with open(filePath, 'r') as file:
        return unserialize(json.load(file))


def keepOnlyAlphaChars(word):
    """Return `word` stripped of every character that is not alphabetic."""
    return ''.join(c for c in word if c.isalpha())


def index(text):
    """Return the set of important words found in `text`.

    The text is split on single spaces, non-alphabetic characters are
    removed from each piece, and only the words strictly longer than
    WORD_THRESHOLD letters are kept. Case is preserved.
    """
    normalized_words = (keepOnlyAlphaChars(word) for word in text.split(' '))
    return {w for w in normalized_words if len(w) > WORD_THRESHOLD}


def scalar(a, b):
    """Measure the similarity of two sets.

    The score is the number of elements the sets have in common divided by
    the geometric mean of their sizes, so it ranges from 0.0 (disjoint) to
    1.0 (identical).

    Returns:
        The similarity as a float; 0.0 when either set is empty, since an
        empty set shares nothing (and the ratio would divide by zero).
    """
    if not a or not b:
        return 0.0
    return len(a.intersection(b)) / sqrt(len(a) * len(b))