Compare commits

..

No commits in common. "add_quotes" and "main" have entirely different histories.

2 changed files with 21 additions and 50 deletions

View File

@ -1,63 +1,41 @@
import csv
import json
from math import sqrt
import re
# Minimum length (inclusive) a normalized word must have to be kept by the
# index() variant that compares with `>=`.
WORD_LENGTH_THRESHOLD = 4
# Length a word must strictly exceed to be kept by the index() variant that
# compares with `>`.
WORD_THRESHOLD = 4
class RowDecoder(json.JSONDecoder):
    """JSON decoder that turns every row's ``index`` field back into a set."""

    def decode(self, s):
        """Parse the JSON document *s* and return its rows.

        The document is expected to be a sequence of objects that each carry
        an ``index`` key; that value is converted to a ``set`` while every
        other key is copied through unchanged.
        """
        rows = super().decode(s)
        restored = []
        for entry in rows:
            restored.append({**entry, 'index': set(entry['index'])})
        return restored
class RowEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``set`` values as JSON arrays."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        Sets are converted to lists (element order follows set iteration
        order). Any other unsupported object is delegated to the base class,
        which raises ``TypeError`` naming the offending type.
        """
        if isinstance(obj, set):
            return list(obj)
        # Delegate through the instance: calling the unbound base method
        # without ``self`` raised a misleading "missing argument" TypeError
        # instead of the standard "not JSON serializable" one.
        return super().default(obj)
def keepOnlyAlphaChars(word):
    """Return *word* with every non-alphabetic character removed."""
    letters = []
    for ch in word:
        if ch.isalpha():
            letters.append(ch)
    return ''.join(letters)
def index(text):
    """Return the set of significant words found in *text*.

    The text is split on single whitespace characters; each piece is reduced
    to its alphabetic characters and lower-cased. Only pieces whose length is
    at least ``WORD_LENGTH_THRESHOLD`` are kept, which also discards the empty
    strings produced by consecutive whitespace.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about.
    words = re.split(r'\s', text)
    normalized_words = (keepOnlyAlphaChars(word).lower() for word in words)
    return {w for w in normalized_words if len(w) >= WORD_LENGTH_THRESHOLD}
def insert(db, row):
    """Append a record built from *row* to the list *db*.

    ``row[0]`` is stored as the title and ``row[1]`` as the quote; the quote
    is also passed through ``index`` to build the record's word index.
    """
    record = {
        'title': row[0],
        'quote': row[1],
        'index': index(row[1]),
    }
    db.append(record)
# Reads the CSV file at inputCSV, skips its first row and collects one entry
# per remaining row into the list `db`.
# NOTE(review): two `def build_db` headers and two row-insertion statements
# appear back to back below; this span looks like two revisions of the same
# function interleaved by the commit comparison. Confirm which lines belong to
# which revision before editing.
def build_db(inputCSV):
def build_db(inputCSV, outputJSON):
db = []
with open(inputCSV, 'r') as file:
csv_reader = csv.reader(file, delimiter=',')
# `data` is False only while the first row is read, so that row is never
# inserted (presumably the CSV header; verify against the input file).
data = False
for row in csv_reader:
if data:
insert(db, row)
# NOTE(review): csv.reader yields lists, so `row + (…,)` adds a list to a
# tuple and would raise TypeError; confirm.
db.append(row + (index(row[1]),))
else:
data = True
# Only the two-argument header names outputJSON, which this write relies on.
with open(outputJSON, 'w') as file:
json.dump(serialize(db), file)
return db
def save_db(db, outputJSON):
    """Write *db* as JSON to the file at path *outputJSON*.

    The file is opened for writing (replacing previous content) and values
    the standard encoder cannot handle are delegated to ``RowEncoder``.
    """
    with open(outputJSON, 'w') as destination:
        json.dump(db, destination, cls=RowEncoder)
def serialize(db):
    """Return a copy of *db* whose rows are JSON-friendly 3-tuples.

    Each output row keeps the first two fields of the input row and stores
    the third field converted to a list.
    """
    rows = []
    for row in db:
        rows.append((row[0], row[1], list(row[2])))
    return rows
def unserialize(db):
    """Return a copy of *db* whose rows are 3-tuples with a set index.

    Each output row keeps the first two fields of the input row and stores
    the third field converted to a set.
    """
    rows = []
    for row in db:
        rows.append((row[0], row[1], set(row[2])))
    return rows
# Loads the database stored as JSON in the file at filePath.
# NOTE(review): two return statements follow each other. One decodes with
# RowDecoder, the other passes the plain json.load result through
# unserialize. They look like alternative revisions of the same line; if both
# were kept, the second would be unreachable.
def open_db(filePath):
with open(filePath, 'r') as file:
return json.load(file, cls=RowDecoder)
return unserialize(json.load(file))
"""
We define a similarity measure on sets which counts the number of elements
they have in common
"""
# Similarity score between the sets a and b.
def scalar(a, b):
sizeProduct = len(a)*len(b)
# Number of shared elements divided by sqrt(len(a)*len(b)); yields 0 when
# either set is empty, which avoids dividing by zero.
return len(a.intersection(b))/sqrt(sizeProduct) if sizeProduct > 0 else 0
# NOTE(review): a second return giving the raw count of shared elements. It
# looks like the alternative revision of the line above; if both were kept,
# this one would be unreachable.
return len(a.intersection(b))
def find_best_quote(db, user_input):
    """Return the row of *db* whose index best matches *user_input*.

    The input text is indexed once, then every row is scored by ``scalar``
    against its ``'index'`` entry; the first row with the highest score wins.
    Like ``max``, this raises ``ValueError`` when *db* is empty.
    """
    query_words = index(user_input)

    def similarity(row):
        return scalar(query_words, row['index'])

    return max(db, key=similarity)
# NOTE(review): unfinished definition. The header has no trailing colon and
# the loop below has no body before the next `def` begins.
def find_best_quote(db, indexed_input)
# Initialised to None; nothing visible here updates it.
max_score = None
for entry in db:
def index(text, threshold=None):
    """Return the set of words in *text* that are longer than the threshold.

    Each whitespace-separated word is reduced to its alphabetic characters
    (case is preserved) and kept only when its length strictly exceeds
    *threshold*.

    Args:
        text: The text to index.
        threshold: Length a word must exceed to be kept. Defaults to the
            module-level ``WORD_THRESHOLD`` when omitted.

    Returns:
        A set of the retained words.
    """
    if threshold is None:
        threshold = WORD_THRESHOLD
    # split() with no argument breaks on any run of whitespace, so words on
    # either side of a newline or tab in a multi-line quote are no longer
    # glued together the way split(' ') left them.
    stripped = (''.join(c for c in word if c.isalpha()) for word in text.split())
    return {w for w in stripped if len(w) > threshold}

View File

@ -1,7 +0,0 @@
book,quote
The Colour of Magic,"Tourist, Rincewind decided, meant ""idiot""."
The Colour of Magic,"""Let's just say that if complete and utter chaos was lightning, he'd be the sort to stand on a hilltop in a thunderstorm wearing wet copper armour and shouting 'All gods are bastards'."""
The Light Fantastic,"""DID YOU SAY HUMANS PLAY IT FOR FUN?""
""Some of them get to be very good at it, yes. I'm only an amateur,
I'm afraid""
""BUT THEY ONLY LIVE EIGHTY OR NINETY YEARS!"""
1 book quote
2 The Colour of Magic Tourist, Rincewind decided, meant "idiot".
3 The Colour of Magic "Let's just say that if complete and utter chaos was lightning, he'd be the sort to stand on a hilltop in a thunderstorm wearing wet copper armour and shouting 'All gods are bastards'."
4 The Light Fantastic "DID YOU SAY HUMANS PLAY IT FOR FUN?" "Some of them get to be very good at it, yes. I'm only an amateur, I'm afraid" "BUT THEY ONLY LIVE EIGHTY OR NINETY YEARS!"