Reformulate serialization with proper JSON(En|De)coders, separate building the DB from saving it + use regexes to split on whitespace characters instead of just ' ' (0x20)
This commit is contained in:
parent 1445322013
commit db5a9dabf9
1 changed file with 30 additions and 29 deletions
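In short, the commit splits the old build_db(inputCSV, outputJSON) into building, saving, and reopening the database. A minimal usage sketch of that flow, assuming the function names from the diff below and placeholder file paths ('quotes.csv', 'quotes.json'):

    # Usage sketch (not part of the commit); paths are placeholders.
    db = build_db('quotes.csv')   # parse the CSV and index each quote
    save_db(db, 'quotes.json')    # serialize; sets become lists via RowEncoder
    db = open_db('quotes.json')   # reload; lists become sets again via RowDecoder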
memory.py
@@ -1,10 +1,35 @@
 import csv
 import json
 from math import sqrt
+import re
 
 WORD_THRESHOLD = 4
 
-def build_db(inputCSV, outputJSON):
+class RowDecoder(json.JSONDecoder):
+    def decode(self, s):
+        db = json.JSONDecoder.decode(self, s)
+        return [{**obj, 'index': set(obj['index'])} for obj in db]
+
+class RowEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, set):
+            return list(obj)
+        else:
+            return json.JSONEncoder.default(obj)
+
+def keepOnlyAlphaChars(word):
+    return ''.join([c for c in word if c.isalpha()])
+
+def index(text):
+    words = re.split('\s', text)
+    normalized_words = [keepOnlyAlphaChars(word) for word in words]
+    important_words = set([w for w in normalized_words if len(w) > WORD_THRESHOLD])
+    return important_words
+
+def insert(db, row):
+    db.append({'title': row[0], 'quote': row[1], 'index': index(row[1])})
+
+def build_db(inputCSV):
     db = []
     with open(inputCSV, 'r') as file:
         csv_reader = csv.reader(file, delimiter=',')
@@ -14,39 +39,15 @@ def build_db(inputCSV, outputJSON):
             insert(db, row)
         else:
             data = True
-    with open(outputJSON, 'w') as file:
-        json.dump(serialize(db), file)
     return db
 
-def insert(db, row):
-    db.append({'title': row[0], 'quote': row[1], 'index': index(row[1])})
-
-"""
-/!\ This is broken, we need to figure out how to elegantly replace just the
-'index' property of the dictionary, converting list <-> set between serialization
-and deserialization
-"""
-def serialize(db):
-    return list(map(lambda row: dict(**row, index=list(row['index'])), db))
-
-def unserialize(db):
-    return list(map(lambda row: dict(**row, index=set(row['index'])), db))
-"""
-So it's broken up to this point, but honestly Cafou said it wasn't a big deal
-"""
+def save_db(db, outputJSON):
+    with open(outputJSON, 'w') as file:
+        json.dump(db, file, cls=RowEncoder)
 
 def open_db(filePath):
     with open(filePath, 'r') as file:
-        return unserialize(json.load(file))
-
-def keepOnlyAlphaChars(word):
-    return ''.join([c for c in word if c.isalpha()])
-
-def index(text):
-    words = text.split(' ')
-    normalized_words = [keepOnlyAlphaChars(word) for word in words]
-    important_words = set([w for w in normalized_words if len(w) > WORD_THRESHOLD])
-    return important_words
+        return json.load(file, cls=RowDecoder)
 
 """
 We define a similarity measure on sets which counts the number of elements
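For reference, a self-contained sketch of the encoder/decoder round-trip this commit relies on. Note that the fallback branch of the committed RowEncoder calls json.JSONEncoder.default(obj) without self, so it would raise a TypeError if it were ever reached for a value that is neither a set nor JSON-serializable; the sketch below passes self explicitly, which is an assumption about the intended behaviour, not what the commit contains. The class names SetEncoder/SetDecoder are hypothetical stand-ins for RowEncoder/RowDecoder.

    import json

    class SetEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, set):
                return list(obj)  # sets are not JSON-serializable; emit a list instead
            return json.JSONEncoder.default(self, obj)  # self must be passed explicitly

    class SetDecoder(json.JSONDecoder):
        def decode(self, s):
            db = json.JSONDecoder.decode(self, s)
            # turn the 'index' lists back into sets after parsing
            return [{**obj, 'index': set(obj['index'])} for obj in db]

    rows = [{'title': 'A', 'quote': 'some text', 'index': {'some', 'text'}}]
    dumped = json.dumps(rows, cls=SetEncoder)
    assert json.loads(dumped, cls=SetDecoder) == rows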
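And a small sketch of the whitespace-splitting change. The committed pattern '\s' splits on every single whitespace character, so runs of whitespace yield empty strings (harmless here, since the WORD_THRESHOLD length filter drops them), and the non-raw string triggers an invalid-escape warning on recent Pythons; r'\s+' is the usual spelling. The comparison below is illustrative, not part of the commit.

    import re

    text = "Some quote,\n  split across\twhitespace"
    print(text.split(' '))        # old behaviour: splits only on 0x20, keeps '\n' and '\t' inside words
    print(re.split('\s', text))   # committed behaviour: splits on each whitespace char, may yield '' entries
    print(re.split(r'\s+', text)) # suggested: collapses runs of whitespace into one split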