Finally implement the search

Avoid division by zero when computing the scalar product of two quotes
Turn indexed words lowercase to gain flexibility (to be restored when we have a real indexation mechanism)
2024-03-16 15:50:38 +01:00 · 2024-03-16 15:49:23 +01:00 · 2024-03-16 15:48:28 +01:00 · 2024-03-16 15:46:37 +01:00 · 2024-03-16 15:45:59 +01:00 · 2024-02-03 12:48:45 +01:00
2 changed files with 50 additions and 21 deletions
--- a/memory.py
+++ b/memory.py
@@ -1,41 +1,63 @@
 import csv
 import json
+from math import sqrt
+import re

-WORD_THRESHOLD = 4
+WORD_LENGTH_THRESHOLD = 4

-def build_db(inputCSV, outputJSON):
+class RowDecoder(json.JSONDecoder):
+    def decode(self, s):
+        db = json.JSONDecoder.decode(self, s)
+        return [{**obj, 'index': set(obj['index'])} for obj in db]
+
+class RowEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, set):
+            return list(obj)
+        else:
+            return json.JSONEncoder.default(obj)
+
+def keepOnlyAlphaChars(word):
+    return ''.join([c for c in word if c.isalpha()])
+
+def index(text):
+    words = re.split('\s', text)
+    normalized_words = [keepOnlyAlphaChars(word).lower() for word in words]
+    important_words = set([w for w in normalized_words
+                             if len(w) >= WORD_LENGTH_THRESHOLD])
+    return important_words
+
+def insert(db, row):
+    db.append({'title': row[0], 'quote': row[1], 'index': index(row[1])})
+
+def build_db(inputCSV):
    db = []
    with open(inputCSV, 'r') as file:
        csv_reader = csv.reader(file, delimiter=',')
        data = False
        for row in csv_reader:
            if data:
-                db.append(row + (index(row[1]),))
+                insert(db, row)
            else:
                data = True
-    with open(outputJSON, 'w') as file:
-        json.dump(serialize(db), file)
    return db

-def serialize(db):
-    return list(map(lambda row: (row[0], row[1], list(row[2])), db))
-
-def unserialize(db):
-    return list(map(lambda row: (row[0], row[1], set(row[2])), db))
+def save_db(db, outputJSON):
+    with open(outputJSON, 'w') as file:
+        json.dump(db, file, cls=RowEncoder)

 def open_db(filePath):
    with open(filePath, 'r') as file:
-        return unserialize(json.load(file))
+        return json.load(file, cls=RowDecoder)

+"""
+We define a similarity measure on sets which counts the number of elements
+they have in common
+"""
 def scalar(a, b):
-    return len(a.intersection(b))
+    sizeProduct = len(a)*len(b)
+    return len(a.intersection(b))/sqrt(sizeProduct) if sizeProduct > 0 else 0

-def find_best_quote(db, indexed_input)
-    max_score = None
-    for entry in db:
-
-
-def index(text):
-    words = map(lambda w: ''.join([c for c in w if c.isalpha()]), text.split(' '))
-    important_words = set([w for w in words if len(w) > WORD_THRESHOLD])
-    return important_words
+def find_best_quote(db, user_input):
+    indexed_input = index(user_input)
+    return max(db, key=lambda row: scalar(indexed_input, row['index']))
--- a/quotes.csv
+++ b/quotes.csv
@@ -0,0 +1,7 @@
+book,quote
+The Color of Magic,"Tourist, Rincewind decided, meant ""idiot""."
+The Colour of Magic,"""Let's just say that if complete and utter chaos was lightning, he'd be the sort to stand on a hilltop in a thunderstorm wearing wet copper armour and shouting 'All gods are bastards'."""
+The Light Fantastic,"""DID YOU SAY HUMANS PLAY IT FOR FUN?""
+""Some of them get to be very good at it, yes. I'm only an amateur,
+I'm afraid""
+""BUT THEY ONLY LIVE EIGHTY OR NINETY YEARS!"""
Author	SHA1	Message	Date
Tissevert	a3d3305d4b	Finally implement the search	2024-03-16 15:50:38 +01:00
Tissevert	a0bba7173f	Avoid division by zero when computing the scalar product of two quotes	2024-03-16 15:49:23 +01:00
Tissevert	b9066f0933	Turn indexed words lowercase to gain flexibility (to be restored when we have a real indexation mechanism)	2024-03-16 15:48:28 +01:00
Tissevert	f2caf77510	Rename the corresponding variable to be more explicit	2024-03-16 15:46:37 +01:00
Tissevert	8eae217d17	Edit the condition filter on words length to include 4-letter words in index	2024-03-16 15:45:59 +01:00
Tissevert	db5a9dabf9	Reformulate serialization with proper JSON(En|De)coders, separate building the DB from saving it + use regexes to split on whitespace characters instead of just ' ' (0x20)	2024-02-03 12:48:45 +01:00
Tissevert	1445322013	Normalise the scalar product to get comparable results unbiased by the particular length of a given (word) set	2024-02-03 11:45:39 +01:00
Tissevert	dc1253bbb1	Improve db handling and indexation a wee bit	2024-01-27 17:33:54 +01:00
cafou	6e72c38153	adding new quotes	2024-01-27 16:35:58 +01:00
Tissevert	4e972cba18	Fix CSV format	2024-01-27 16:00:05 +01:00
cafou	55c807b192	adding a quotes file	2024-01-27 15:49:13 +01:00