Compare commits
11 commits
main
...
add_quotes
Author | SHA1 | Date | |
---|---|---|---|
a3d3305d4b | |||
a0bba7173f | |||
b9066f0933 | |||
f2caf77510 | |||
8eae217d17 | |||
db5a9dabf9 | |||
1445322013 | |||
dc1253bbb1 | |||
|
6e72c38153 | ||
4e972cba18 | |||
|
55c807b192 |
2 changed files with 50 additions and 21 deletions
64
memory.py
64
memory.py
|
@ -1,41 +1,63 @@
|
|||
import csv
|
||||
import json
|
||||
from math import sqrt
|
||||
import re
|
||||
|
||||
WORD_THRESHOLD = 4
|
||||
WORD_LENGTH_THRESHOLD = 4
|
||||
|
||||
def build_db(inputCSV, outputJSON):
|
||||
class RowDecoder(json.JSONDecoder):
    """JSON decoder that turns each entry's ``'index'`` list back into a set.

    The decoded document is expected to be a list of objects, each of which
    has an ``'index'`` key holding an iterable of hashable values.
    """

    def decode(self, s):
        """Decode *s* and return a list of dicts whose ``'index'`` is a set.

        Every other key of each object is copied through unchanged.

        Raises:
            KeyError: If an object has no ``'index'`` key.
        """
        db = super().decode(s)
        return [{**obj, 'index': set(obj['index'])} for obj in db]
|
||||
|
||||
class RowEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``set`` values as JSON arrays."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        Sets are converted to lists (element order is whatever iteration
        over the set yields). Anything else is delegated to the base class,
        which raises ``TypeError`` for unsupported types.
        """
        if isinstance(obj, set):
            return list(obj)
        # Delegate through super() so ``self`` is passed along; calling the
        # unbound base method with only ``obj`` fails with a misleading
        # "missing argument" error instead of "not JSON serializable".
        return super().default(obj)
|
||||
|
||||
def keepOnlyAlphaChars(word):
    """Return *word* with every character that is not alphabetic removed.

    "Alphabetic" is whatever ``str.isalpha`` accepts, so accented letters
    are kept while digits, punctuation and whitespace are dropped.
    """
    return ''.join(filter(str.isalpha, word))
|
||||
|
||||
def index(text):
    """Return the set of "important" words found in *text*.

    The text is split on single whitespace characters; each piece is
    stripped of non-alphabetic characters and lower-cased. Only words whose
    length is at least ``WORD_LENGTH_THRESHOLD`` are kept.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    words = re.split(r'\s', text)
    normalized_words = (keepOnlyAlphaChars(word).lower() for word in words)
    return {w for w in normalized_words if len(w) >= WORD_LENGTH_THRESHOLD}
|
||||
|
||||
def insert(db, row):
    """Append a new entry built from *row* to *db* (mutates *db* in place).

    Args:
        db: List of entries to extend.
        row: Sequence whose first item is stored as ``'title'`` and whose
            second item is stored as ``'quote'``; the quote is also passed
            to ``index`` to build the entry's ``'index'``.
    """
    title, quote = row[0], row[1]
    db.append({'title': title, 'quote': quote, 'index': index(quote)})
|
||||
|
||||
def build_db(inputCSV):
    """Build the in-memory quote database from a CSV file.

    The first row of the file is treated as a header and skipped; every
    following non-empty row is handed to ``insert``.

    Args:
        inputCSV: Path of the comma-separated file to read.

    Returns:
        The list filled by ``insert``, one entry per data row.
    """
    db = []
    # newline='' lets the csv module interpret line breaks itself, which is
    # required for quoted fields that span several lines.
    with open(inputCSV, 'r', newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        next(csv_reader, None)  # skip the header row (no-op on an empty file)
        for row in csv_reader:
            if row:  # csv.reader yields [] for blank lines
                insert(db, row)
    return db
|
||||
|
||||
def serialize(db):
    """Return a copy of *db* whose rows are JSON-friendly.

    Each row is read positionally: items 0 and 1 are kept as they are and
    item 2 is converted to a list. Rows come back as 3-tuples.
    """
    return [(row[0], row[1], list(row[2])) for row in db]
|
||||
|
||||
def unserialize(db):
    """Return a copy of *db* with the third item of every row as a set.

    Each row is read positionally: items 0 and 1 are kept as they are and
    item 2 is converted to a set. Rows come back as 3-tuples.
    """
    return [(row[0], row[1], set(row[2])) for row in db]
|
||||
def save_db(db, outputJSON):
    """Write *db* as JSON to the file at *outputJSON*, overwriting it.

    Serialization goes through ``RowEncoder``.
    """
    with open(outputJSON, 'w') as file:
        json.dump(db, file, cls=RowEncoder)
|
||||
|
||||
def open_db(filePath):
    """Load and return the database stored as JSON at *filePath*.

    Decoding goes through ``RowDecoder``.
    """
    with open(filePath, 'r') as file:
        return json.load(file, cls=RowDecoder)
|
||||
|
||||
"""
|
||||
We define a similarity measure on sets based on the number of elements
|
||||
they have in common
|
||||
"""
|
||||
def scalar(a, b):
    """Return a normalized overlap score between the sets *a* and *b*.

    The score is ``len(a & b) / sqrt(len(a) * len(b))``: the number of
    shared elements divided by the geometric mean of the two set sizes.
    It is 0 when either set is empty, which also avoids a division by zero.
    """
    sizeProduct = len(a) * len(b)
    if sizeProduct == 0:
        return 0
    return len(a.intersection(b)) / sqrt(sizeProduct)
|
||||
|
||||
def find_best_quote(db, indexed_input)
|
||||
max_score = None
|
||||
for entry in db:
|
||||
|
||||
|
||||
def index(text):
    """Return the set of words in *text* longer than ``WORD_THRESHOLD``.

    The text is split on single spaces and each piece is stripped of
    non-alphabetic characters before its length is checked. Case is left
    untouched.
    """
    words = (''.join(filter(str.isalpha, w)) for w in text.split(' '))
    return {w for w in words if len(w) > WORD_THRESHOLD}
|
||||
def find_best_quote(db, user_input):
    """Return the entry of *db* that best matches *user_input*.

    The input is run through ``index`` and compared with each entry's
    ``'index'`` using ``scalar``; the entry with the highest score wins.
    On ties, ``max`` returns the first such entry.

    Raises:
        ValueError: If *db* is empty (``max`` of an empty iterable).
    """
    indexed_input = index(user_input)

    def score(row):
        return scalar(indexed_input, row['index'])

    return max(db, key=score)
|
||||
|
|
7
quotes.csv
Normal file
7
quotes.csv
Normal file
|
@ -0,0 +1,7 @@
|
|||
book,quote
|
||||
The Colour of Magic,"Tourist, Rincewind decided, meant ""idiot""."
|
||||
The Colour of Magic,"""Let's just say that if complete and utter chaos was lightning, he'd be the sort to stand on a hilltop in a thunderstorm wearing wet copper armour and shouting 'All gods are bastards'."""
|
||||
The Light Fantastic,"""DID YOU SAY HUMANS PLAY IT FOR FUN?""
|
||||
""Some of them get to be very good at it, yes. I'm only an amateur,
|
||||
I'm afraid""
|
||||
""BUT THEY ONLY LIVE EIGHTY OR NINETY YEARS!"""
|
|
Loading…
Reference in a new issue