Normalise the scalar product to get comparable results unbiased by the particular length of a given (word) set

This commit is contained in:
Tissevert 2024-02-03 11:45:39 +01:00
parent dc1253bbb1
commit 1445322013

View file

@ -1,5 +1,6 @@
import csv import csv
import json import json
from math import sqrt
WORD_THRESHOLD = 4 WORD_THRESHOLD = 4
@ -52,7 +53,7 @@ We define a similarity measure on sets which counts the number of elements
they have in common they have in common
""" """
def scalar(a, b):
    """
    Return the normalised scalar product of two sets.

    The number of elements the sets have in common is divided by the
    geometric mean of their sizes, sqrt(len(a) * len(b)), so that scores
    stay comparable whatever the sizes of the sets being compared. For two
    sets the result is 1.0 when they are equal and non-empty, and 0.0 when
    they share no element.

    An empty set has nothing in common with anything: 0.0 is returned in
    that case rather than letting the normalisation divide by zero.
    """
    if not a or not b:
        return 0.0
    return len(a.intersection(b)) / sqrt(len(a) * len(b))
def find_best_quote(db, user_input): def find_best_quote(db, user_input):
indexed_input = index(user_input) indexed_input = index(user_input)