Normalise the scalar product by the geometric mean of the two set sizes, so that results are comparable regardless of the length of a given (word) set

2024-02-03 11:45:39 +01:00 · 2024-02-03 11:45:39 +01:00 · 1445322013
commit 1445322013
parent dc1253bbb1
1 changed files with 2 additions and 1 deletions
--- a/memory.py
+++ b/memory.py
@ -1,5 +1,6 @@
 import csv
 import json
+from math import sqrt

 WORD_THRESHOLD = 4

@ -52,7 +53,7 @@ We define a similarity measure on sets which counts the number of elements
 they have in common
 """
 def scalar(a, b):
-    return len(a.intersection(b))
+    return len(a.intersection(b))/sqrt(len(a)*len(b))

 def find_best_quote(db, user_input):
    indexed_input = index(user_input)