Compare commits

...

11 commits

2 changed files with 50 additions and 21 deletions

View file

@ -1,41 +1,63 @@
import csv import csv
import json import json
from math import sqrt
import re
# Minimum number of letters a normalized word must have to be indexed.
WORD_LENGTH_THRESHOLD = 4
class RowDecoder(json.JSONDecoder):
    """JSON decoder that restores each row's 'index' entry to a set."""

    def decode(self, s):
        """Decode *s* and return the rows with 'index' converted to a set.

        The document is expected to be a JSON array of objects that each
        carry an 'index' array; a row without that key raises KeyError.
        """
        db = super().decode(s)
        # JSON has no set type, so the index is stored as an array on disk.
        return [{**obj, 'index': set(obj['index'])} for obj in db]
class RowEncoder(json.JSONEncoder):
    """JSON encoder that serializes sets as arrays."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        Sets are emitted as lists. Any other unsupported type is passed to
        the base class, which raises TypeError.
        """
        if isinstance(obj, set):
            return list(obj)
        # Delegate through super() so the base method receives both self and
        # obj and reports the standard "not JSON serializable" error.
        return super().default(obj)
def keepOnlyAlphaChars(word):
    """Return *word* with every non-alphabetic character removed."""
    return ''.join(c for c in word if c.isalpha())
def index(text):
    """Return the set of significant words found in *text*.

    The text is split on whitespace. Each word is stripped of
    non-alphabetic characters and lower-cased, and only words of at least
    WORD_LENGTH_THRESHOLD letters are kept.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    words = re.split(r'\s+', text)
    normalized_words = (keepOnlyAlphaChars(word).lower() for word in words)
    return {w for w in normalized_words if len(w) >= WORD_LENGTH_THRESHOLD}
def insert(db, row):
    """Append *row* to *db* as a dict with 'title', 'quote' and 'index' keys.

    row[0] is stored as the title and row[1] as the quote. The index is
    computed from the quote text. A row with fewer than two fields raises
    IndexError.
    """
    title, quote = row[0], row[1]
    db.append({'title': title, 'quote': quote, 'index': index(quote)})
def build_db(inputCSV):
    """Build the quote database from the CSV file at *inputCSV*.

    The first row is treated as a header and skipped. Every following
    non-empty row is added with ``insert``. Returns the list of rows.
    """
    db = []
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(inputCSV, 'r', newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        next(csv_reader, None)  # skip the header row; no-op on an empty file
        for row in csv_reader:
            if row:  # csv.reader yields [] for blank lines
                insert(db, row)
    return db
def save_db(db, outputJSON):
    """Write *db* as JSON to the file at *outputJSON*.

    RowEncoder is used so that the set stored under each row's 'index'
    key is written as a JSON array.
    """
    with open(outputJSON, 'w') as file:
        json.dump(db, file, cls=RowEncoder)
def open_db(filePath):
    """Load and return the database stored as JSON at *filePath*.

    RowDecoder is used so that each row's 'index' array is turned back
    into a set.
    """
    with open(filePath, 'r') as file:
        return json.load(file, cls=RowDecoder)
"""
We define a similarity measure on sets: the number of elements they have in
common, divided by the square root of the product of the two set sizes
(0 when either set is empty)
"""
def scalar(a, b):
    """Return the normalized overlap between the sets *a* and *b*.

    The result is len(a & b) / sqrt(len(a) * len(b)): 1.0 for two equal
    non-empty sets and 0 for disjoint sets. If either set is empty the
    result is 0, which also avoids a division by zero.
    """
    sizeProduct = len(a) * len(b)
    if sizeProduct == 0:
        return 0
    return len(a.intersection(b)) / sqrt(sizeProduct)
def find_best_quote(db, user_input):
    """Return the row of *db* whose index best matches *user_input*.

    The input text is indexed with ``index`` and compared with each row's
    'index' set using ``scalar``. On ties the first of the best-scoring
    rows is returned. Returns None when *db* is empty.
    """
    indexed_input = index(user_input)
    return max(
        db,
        key=lambda row: scalar(indexed_input, row['index']),
        default=None,  # max() would raise ValueError on an empty db
    )

7
quotes.csv Normal file
View file

@ -0,0 +1,7 @@
book,quote
The Color of Magic,"Tourist, Rincewind decided, meant ""idiot""."
The Colour of Magic,"""Let's just say that if complete and utter chaos was lightning, he'd be the sort to stand on a hilltop in a thunderstorm wearing wet copper armour and shouting 'All gods are bastards'."""
The Light Fantastic,"""DID YOU SAY HUMANS PLAY IT FOR FUN?""
""Some of them get to be very good at it, yes. I'm only an amateur,
I'm afraid""
""BUT THEY ONLY LIVE EIGHTY OR NINETY YEARS!"""
1 book quote
2 The Color of Magic Tourist, Rincewind decided, meant "idiot".
3 The Colour of Magic "Let's just say that if complete and utter chaos was lightning, he'd be the sort to stand on a hilltop in a thunderstorm wearing wet copper armour and shouting 'All gods are bastards'."
4 The Light Fantastic "DID YOU SAY HUMANS PLAY IT FOR FUN?" "Some of them get to be very good at it, yes. I'm only an amateur, I'm afraid" "BUT THEY ONLY LIVE EIGHTY OR NINETY YEARS!"