From 144532201381e0c6822a0dd121a69173919a117c Mon Sep 17 00:00:00 2001
From: Tissevert
Date: Sat, 3 Feb 2024 11:45:39 +0100
Subject: [PATCH] Normalise the scalar product to get comparable results unbiased by the particular length of a given (word) set

---
 memory.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/memory.py b/memory.py
index 3a283b8..88216fc 100644
--- a/memory.py
+++ b/memory.py
@@ -1,5 +1,6 @@
 import csv
 import json
+from math import sqrt
 
 WORD_THRESHOLD = 4
 
@@ -52,7 +53,7 @@ We define a similarity measure on sets which counts the number of
 elements they have in common
 """
 def scalar(a, b):
-    return len(a.intersection(b))
+    return len(a.intersection(b))/sqrt(len(a)*len(b))
 
 def find_best_quote(db, user_input):
     indexed_input = index(user_input)