Compare commits

...

11 Commits

2 changed files with 50 additions and 21 deletions

View File

@ -1,41 +1,63 @@
import csv
import json
from math import sqrt
import re
# Word-length cut-off; appears to be the earlier name of the constant below
# (same value) -- TODO confirm whether anything still needs it.
WORD_THRESHOLD = 4
# Minimum number of letters (inclusive) a normalized word must have to be
# kept when a quote is indexed.
WORD_LENGTH_THRESHOLD = 4
class RowDecoder(json.JSONDecoder):
    """JSON decoder that turns each row's ``'index'`` array back into a set.

    JSON has no set type, so the word index of a row is stored as an array;
    decoding restores it to a ``set`` so set operations work on it again.
    """

    def decode(self, s):
        """Decode the JSON text *s* and return the list of rows.

        The document is expected to be an array of objects that each carry an
        ``'index'`` array.  A row without that key raises ``KeyError``.
        """
        rows = super().decode(s)
        return [{**row, 'index': set(row['index'])} for row in rows]
class RowEncoder(json.JSONEncoder):
    """JSON encoder that writes ``set`` values as JSON arrays."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        Sets become lists (in whatever order iteration yields).  Any other
        unsupported object is passed to the base class, which raises
        ``TypeError``.
        """
        if isinstance(obj, set):
            return list(obj)
        # Delegate through the bound base method: calling
        # ``json.JSONEncoder.default(obj)`` without ``self`` fails with an
        # unrelated "missing argument" error instead of the real message.
        return super().default(obj)
def keepOnlyAlphaChars(word):
    """Return *word* with every non-alphabetic character removed."""
    return ''.join(filter(str.isalpha, word))
def index(text):
    """Return the set of significant words found in *text*.

    The text is split on whitespace; each piece is reduced to its letters and
    lower-cased, and only pieces at least ``WORD_LENGTH_THRESHOLD`` letters
    long are kept.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    words = re.split(r'\s', text)
    normalized_words = (keepOnlyAlphaChars(word).lower() for word in words)
    return {w for w in normalized_words if len(w) >= WORD_LENGTH_THRESHOLD}
def insert(db, row):
    """Append a record for *row* to *db*.

    Column 0 of *row* is stored as the title, column 1 as the quote, and the
    quote's word index is stored alongside them.
    """
    title, quote = row[0], row[1]
    record = {'title': title, 'quote': quote, 'index': index(quote)}
    db.append(record)
def build_db(inputCSV):
    """Build the in-memory quote database from a CSV file.

    The first row of *inputCSV* is treated as a header and skipped.  Every
    other non-empty row is passed to ``insert``, which reads the title from
    column 0 and the quote from column 1.

    Returns the list of records that was built.
    """
    db = []
    # newline='' lets the csv module deal with line breaks that occur inside
    # quoted fields, as recommended by the csv documentation.
    with open(inputCSV, 'r', newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        next(csv_reader, None)  # skip the header row (if any)
        for row in csv_reader:
            if row:  # csv.reader yields [] for blank lines
                insert(db, row)
    return db
def serialize(db):
    """Return the rows of *db* as ``(row[0], row[1], list(row[2]))`` tuples."""
    return [(row[0], row[1], list(row[2])) for row in db]
def unserialize(db):
    """Return the rows of *db* as ``(row[0], row[1], set(row[2]))`` tuples."""
    return [(row[0], row[1], set(row[2])) for row in db]
def save_db(db, outputJSON):
    """Write *db* as JSON to the file *outputJSON*, encoding with ``RowEncoder``."""
    with open(outputJSON, 'w') as destination:
        json.dump(db, destination, cls=RowEncoder)
def open_db(filePath):
    """Load the quote database stored as JSON in *filePath*.

    The file is decoded with ``RowDecoder``, so every row comes back as a
    dict whose ``'index'`` value is a set.
    """
    with open(filePath, 'r') as file:
        return json.load(file, cls=RowDecoder)
"""
We define a similarity measure on sets based on the number of elements
they have in common.
"""
def scalar(a, b):
    """Return the normalized overlap between the sets *a* and *b*.

    The score is the number of shared elements divided by
    ``sqrt(len(a) * len(b))``: 0 when the sets share nothing, 1 when they are
    equal.  If either set is empty the score is 0.
    """
    sizeProduct = len(a) * len(b)
    if sizeProduct == 0:
        # Avoid dividing by zero when one of the sets is empty.
        return 0
    return len(a.intersection(b)) / sqrt(sizeProduct)
def find_best_quote(db, user_input):
    """Return the row of *db* whose word index best matches *user_input*.

    *user_input* is indexed with ``index`` and compared against each row's
    ``'index'`` set using ``scalar``; the highest-scoring row is returned
    (the first one in case of a tie).

    Returns ``None`` when *db* is empty.
    """
    indexed_input = index(user_input)
    return max(
        db,
        key=lambda row: scalar(indexed_input, row['index']),
        default=None,
    )

7
quotes.csv Normal file
View File

@ -0,0 +1,7 @@
book,quote
The Colour of Magic,"Tourist, Rincewind decided, meant ""idiot""."
The Colour of Magic,"""Let's just say that if complete and utter chaos was lightning, he'd be the sort to stand on a hilltop in a thunderstorm wearing wet copper armour and shouting 'All gods are bastards'."""
The Light Fantastic,"""DID YOU SAY HUMANS PLAY IT FOR FUN?""
""Some of them get to be very good at it, yes. I'm only an amateur,
I'm afraid""
""BUT THEY ONLY LIVE EIGHTY OR NINETY YEARS!"""
1 book quote
2 The Colour of Magic Tourist, Rincewind decided, meant "idiot".
3 The Colour of Magic "Let's just say that if complete and utter chaos was lightning, he'd be the sort to stand on a hilltop in a thunderstorm wearing wet copper armour and shouting 'All gods are bastards'."
4 The Light Fantastic "DID YOU SAY HUMANS PLAY IT FOR FUN?" "Some of them get to be very good at it, yes. I'm only an amateur, I'm afraid" "BUT THEY ONLY LIVE EIGHTY OR NINETY YEARS!"