Improve DB handling and indexing a bit
This commit is contained in:
parent
6e72c38153
commit
dc1253bbb1
1 changed files with 30 additions and 10 deletions
40
memory.py
40
memory.py
|
@ -10,32 +10,52 @@ def build_db(inputCSV, outputJSON):
|
||||||
data = False
|
data = False
|
||||||
for row in csv_reader:
|
for row in csv_reader:
|
||||||
if data:
|
if data:
|
||||||
db.append(row + (index(row[1]),))
|
insert(db, row)
|
||||||
else:
|
else:
|
||||||
data = True
|
data = True
|
||||||
with open(outputJSON, 'w') as file:
|
with open(outputJSON, 'w') as file:
|
||||||
json.dump(serialize(db), file)
|
json.dump(serialize(db), file)
|
||||||
return db
|
return db
|
||||||
|
|
||||||
|
def insert(db, row):
    """Append one CSV row to *db* as a record.

    The record keeps the row's title and quote, plus the quote's word
    index (a set of significant words) used later for similarity search.
    """
    title, quote = row[0], row[1]
    db.append({'title': title, 'quote': quote, 'index': index(quote)})
|
||||||
|
|
||||||
|
"""
|
||||||
|
/!\ C'est cassé, il faut trouver comment remplacer élégamment juste la
|
||||||
|
propriété 'index' du dictionnaire en faisant list <-> set entre sérialisation et
|
||||||
|
désérialisation
|
||||||
|
"""
|
||||||
def serialize(db):
    """Return a JSON-serializable copy of *db*.

    Each entry's 'index' field is a set of words, which JSON cannot
    encode; the copy converts it to a list. Entries are shallow-copied,
    all other fields are carried over unchanged.
    """
    # Bug fix: dict(**row, index=...) raises TypeError ("got multiple
    # values for keyword argument 'index'") because every row already
    # has an 'index' key. {**row, 'index': ...} overrides the key
    # instead of colliding with it.
    return [{**row, 'index': list(row['index'])} for row in db]
|
||||||
|
|
||||||
def unserialize(db):
    """Inverse of serialize(): restore each entry's 'index' list to a set.

    Entries are shallow-copied; all other fields are carried over
    unchanged.
    """
    # Bug fix: dict(**row, index=...) raises TypeError because rows
    # loaded from JSON already contain an 'index' key. The {**row, ...}
    # form overrides the key instead of raising.
    return [{**row, 'index': set(row['index'])} for row in db]
|
||||||
|
"""
|
||||||
|
Du coup c'est cassé jusqu'ici mais en vrai Cafou il a dit c'était pas trop grave
|
||||||
|
"""
|
||||||
|
|
||||||
def open_db(filePath):
    """Load a quote database from the JSON file at *filePath*.

    Reads the serialized form and converts it back to runtime records
    via unserialize().
    """
    with open(filePath, 'r') as handle:
        raw = json.load(handle)
    return unserialize(raw)
|
||||||
|
|
||||||
|
def keepOnlyAlphaChars(word):
    """Strip every non-alphabetic character from *word* and return the rest."""
    return ''.join(filter(str.isalpha, word))
|
||||||
|
|
||||||
|
def index(text):
    """Build the word index of *text*.

    Splits on single spaces, strips non-alphabetic characters from each
    word, and keeps only the words strictly longer than WORD_THRESHOLD.
    Returns them as a set.
    """
    cleaned = (keepOnlyAlphaChars(word) for word in text.split(' '))
    return {word for word in cleaned if len(word) > WORD_THRESHOLD}
|
||||||
|
|
||||||
|
"""
|
||||||
|
We define a similarity measure on sets which counts the number of elements
|
||||||
|
they have in common
|
||||||
|
"""
|
||||||
def scalar(a, b):
    """Similarity measure on two sets: the count of elements they share."""
    return len(a & b)
|
||||||
|
|
||||||
def find_best_quote(db, indexed_input)
|
def find_best_quote(db, user_input):
|
||||||
|
indexed_input = index(user_input)
|
||||||
max_score = None
|
max_score = None
|
||||||
for entry in db:
|
for entry in db:
|
||||||
|
score = scalar(indexed_input, entry
|
||||||
|
|
||||||
def index(text):
    """Build the word index of *text*.

    Each space-separated word is reduced to its alphabetic characters;
    the result keeps only those words strictly longer than
    WORD_THRESHOLD, returned as a set.
    """
    normalized = (''.join(c for c in word if c.isalpha()) for word in text.split(' '))
    return {word for word in normalized if len(word) > WORD_THRESHOLD}
|
|
||||||
|
|
Loading…
Reference in a new issue