Normalise the scalar product to get comparable results unbiased by the particular length of a given (word) set

This commit is contained in:
Tissevert 2024-02-03 11:45:39 +01:00
parent dc1253bbb1
commit 1445322013

View file

@ -1,5 +1,6 @@
import csv
import json
from math import sqrt
WORD_THRESHOLD = 4
@ -52,7 +53,7 @@ We define a similarity measure on sets which counts the number of elements
they have in common
"""
def scalar(a, b):
    """Return the normalised similarity between two sets.

    The raw measure is the number of elements the sets have in common.
    Dividing it by ``sqrt(len(a) * len(b))`` (the geometric mean of the two
    sizes) keeps the score in the range [0, 1], so that a large set is not
    favoured merely because it has more elements to match against. This is
    the cosine similarity of the sets' indicator vectors.

    Args:
        a: A set (anything providing ``intersection`` and ``len``).
        b: Another set.

    Returns:
        A float between 0.0 (nothing in common, or at least one empty set)
        and 1.0 (identical non-empty sets).
    """
    # An empty set shares nothing with any other set; returning early also
    # avoids dividing by zero in the normalisation below.
    if not a or not b:
        return 0.0
    return len(a.intersection(b)) / sqrt(len(a) * len(b))
def find_best_quote(db, user_input):
indexed_input = index(user_input)