42 lines
1 KiB
Python
42 lines
1 KiB
Python
|
import csv
|
||
|
import json
|
||
|
|
||
|
# Length cutoff used by index(): a cleaned word is kept only when it is
# strictly longer than this many characters.
WORD_THRESHOLD = 4
|
||
|
|
||
|
def build_db(inputCSV, outputJSON):
    """Build the quote database from a CSV file and save it as JSON.

    The first CSV row is treated as a header and skipped. Every following
    non-empty row becomes a ``(row[0], row[1], index(row[1]))`` tuple, i.e.
    the first two columns plus the set of indexed words of the second column.

    Args:
        inputCSV: Path of the comma-delimited CSV file to read.
        outputJSON: Path of the JSON file the serialized database is
            written to.

    Returns:
        The list of ``(row[0], row[1], word_set)`` tuples that was built.
    """
    db = []
    # newline='' is what the csv module asks for so that quoted fields
    # containing line breaks are parsed correctly.
    with open(inputCSV, 'r', newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        next(csv_reader, None)  # skip the header row (no-op on an empty file)
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to index.
                continue
            # csv.reader yields lists, so the former `row + (..., )` raised
            # TypeError (list + tuple). Build the 3-tuple explicitly; this is
            # the layout serialize()/unserialize() read (items 0, 1 and 2).
            db.append((row[0], row[1], index(row[1])))
    with open(outputJSON, 'w') as file:
        json.dump(serialize(db), file)
    return db
|
||
|
|
||
|
def serialize(db):
    """Convert database entries into a JSON-friendly structure.

    Sets cannot be encoded by ``json``, so the word set stored as the third
    item of every entry is turned into a list. The words are sorted so that
    the output is reproducible: plain ``list(a_set)`` follows set iteration
    order, which can differ between interpreter runs for strings.

    Args:
        db: Iterable of entries whose items 0 and 1 are JSON-encodable values
            and whose item 2 is a set of words.

    Returns:
        A list of ``(entry[0], entry[1], sorted_word_list)`` tuples.
    """
    return [(entry[0], entry[1], sorted(entry[2])) for entry in db]
|
||
|
|
||
|
def unserialize(db):
    """Rebuild in-memory database entries from their serialized form.

    Inverse of the list conversion done when serializing: the third item of
    every record is converted to a set, the first two are kept unchanged.

    Args:
        db: Iterable of records indexable at positions 0, 1 and 2, where
            position 2 holds an iterable of words.

    Returns:
        A list of ``(record[0], record[1], set_of_words)`` tuples.
    """
    return [(record[0], record[1], set(record[2])) for record in db]
|
||
|
|
||
|
def open_db(filePath):
    """Load a database from a JSON file.

    Args:
        filePath: Path of the JSON file to read.

    Returns:
        The parsed JSON content passed through unserialize(), i.e. a list of
        tuples whose third item is a set.
    """
    with open(filePath, 'r') as handle:
        raw_entries = json.load(handle)
        return unserialize(raw_entries)
|
||
|
|
||
|
def scalar(a, b):
    """Return how many elements ``a`` and ``b`` have in common.

    Args:
        a: A set (anything providing ``intersection``).
        b: The collection to intersect with ``a``.

    Returns:
        The size of the intersection as an int.
    """
    shared = a.intersection(b)
    return len(shared)
|
||
|
|
||
|
def find_best_quote(db, indexed_input):
    """Return the database entry whose word set best matches the input.

    NOTE(review): the original definition was unfinished (the ``def`` line
    had no colon and the ``for`` loop had no body), which made the whole
    module unparseable. The scoring and the return value below are a
    reconstruction and should be confirmed against the intended behavior.

    Args:
        db: Iterable of entries whose third item (``entry[2]``) is a set of
            indexed words.
        indexed_input: Set of words to compare against every entry,
            presumably produced by index().

    Returns:
        The first entry with the highest scalar() score against
        ``indexed_input``, or ``None`` when ``db`` is empty.
    """
    best_entry = None
    max_score = None
    for entry in db:
        score = scalar(entry[2], indexed_input)
        # Strict comparison keeps the earliest entry when scores tie.
        if max_score is None or score > max_score:
            max_score = score
            best_entry = entry
    return best_entry
|
||
|
|
||
|
|
||
|
def index(text):
    """Extract the set of "important" words from a piece of text.

    The text is split on single space characters, every non-alphabetic
    character is removed from each piece, and only the pieces that are
    strictly longer than WORD_THRESHOLD characters are kept.

    Args:
        text: The string to index.

    Returns:
        A set of the cleaned words that passed the length cutoff.
    """
    keywords = set()
    for token in text.split(' '):
        # Drop punctuation, digits and any other non-letter characters.
        letters = ''.join(ch for ch in token if ch.isalpha())
        if len(letters) > WORD_THRESHOLD:
            keywords.add(letters)
    return keywords
|