Normalise the scalar product to get comparable results unbiased by the particular length of a given (word) set
parent dc1253bbb1
commit 1445322013
1 changed file with 2 additions and 1 deletion
@@ -1,5 +1,6 @@
 import csv
 import json
+from math import sqrt


 WORD_THRESHOLD = 4
@@ -52,7 +53,7 @@ We define a similarity measure on sets which counts the number of elements
 they have in common
 """
 def scalar(a, b):
-    return len(a.intersection(b))
+    return len(a.intersection(b))/sqrt(len(a)*len(b))

 def find_best_quote(db, user_input):
     indexed_input = index(user_input)
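The new return value divides the raw intersection count by sqrt(len(a) * len(b)), i.e. it is the cosine similarity of the two sets: it stays between 0 and 1 and no longer grows merely because one of the word sets is large. A minimal sketch of the effect; the example word sets and the empty-set guard are illustrative assumptions, not part of this commit:

from math import sqrt

def scalar(a, b):
    # Normalised overlap: |a ∩ b| / sqrt(|a| * |b|), the cosine similarity
    # of the two sets. Ranges from 0.0 (disjoint) to 1.0 (identical).
    if not a or not b:  # guard added for illustration only
        return 0.0
    return len(a.intersection(b)) / sqrt(len(a) * len(b))

# Two candidate quotes share the same two words with the query,
# but the longer one is no longer favoured just for having more words.
query = {"to", "be", "or", "not"}
short_quote = {"to", "be"}
long_quote = {"to", "be", "that", "is", "the", "question", "whether", "tis"}

print(scalar(query, short_quote))  # 2 / sqrt(4 * 2) ≈ 0.71
print(scalar(query, long_quote))   # 2 / sqrt(4 * 8) ≈ 0.35
# The unnormalised version would score both candidates as 2 and tie.

The sqrt normalisation keeps the measure symmetric in a and b and equal to exactly 1 when the two sets are identical, which is what makes scores comparable across quotes of different lengths.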