-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve and add Spearman #227
base: master
Are you sure you want to change the base?
Changes from all commits
781da79
0ab00e6
dec5d9a
061a551
be724f0
e5c52a5
afc541b
6c82c02
184aa8e
f1c5798
f4c1300
ce49014
198d45f
24eafd1
4d207b7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,3 +16,4 @@ script: | |
branches: | ||
only: | ||
- master | ||
- spearman |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ Available similarity measures: | |
msd | ||
pearson | ||
pearson_baseline | ||
spearman | ||
""" | ||
|
||
from __future__ import (absolute_import, division, print_function, | ||
|
@@ -24,6 +25,8 @@ import numpy as np | |
from six.moves import range | ||
from six import iteritems | ||
|
||
from scipy.stats import rankdata | ||
|
||
|
||
def cosine(n_x, yr, min_support): | ||
"""Compute the cosine similarity between all pairs of users (or items). | ||
|
@@ -196,7 +199,8 @@ def pearson(n_x, yr, min_support): | |
-1). | ||
|
||
For details on Pearson coefficient, see `Wikipedia | ||
<https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient#For_a_sample>`__. | ||
<https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_ | ||
coefficient#For_a_sample>`__. | ||
|
||
""" | ||
|
||
|
@@ -359,3 +363,121 @@ def pearson_baseline(n_x, yr, min_support, global_mean, x_biases, y_biases, | |
sim[xj, xi] = sim[xi, xj] | ||
|
||
return sim | ||
|
||
|
||
def spearman(n_x, yr, min_support):
    """Compute the Spearman correlation coefficient between all pairs of users
    (or items).

    Only **common** users (or items) are taken into account.
    The Spearman rank correlation is a variation of the Pearson correlation.
    The ratings are replaced by their rankings.

    The Spearman Rank Correlation is suitable for the investigation of random
    variables which are not based on a normal distribution.

    It is defined by:

    .. math ::
        \\text{spearman_sim}(u, v) = \\frac{ \\sum\\limits_{i \\in I_{uv}}
        (rank(r_{ui}) - \\overline{rank(u)}) \\cdot (rank(r_{vi}) -
        \\overline{rank(v)})} {\\sqrt{\\sum\\limits_{i
        \\in I_{uv}} (rank(r_{ui}) - \\overline{rank(u)})^2} \\cdot
        \\sqrt{\\sum\\limits_{i \\in
        I_{uv}} (rank(r_{vi}) - \\overline{rank(v)})^2} }

    or

    .. math ::
        \\text{spearman_sim}(i, j) = \\frac{ \\sum\\limits_{u \\in U_{ij}}
        (rank(r_{ui}) - \\overline{rank(i)}) \\cdot (rank(r_{uj}) -
        \\overline{rank(j)})} {\\sqrt{\\sum\\limits_{u
        \\in U_{ij}} (rank(r_{ui}) - \\overline{rank(i)})^2} \\cdot
        \\sqrt{\\sum\\limits_{u \\in
        U_{ij}} (rank(r_{uj}) - \\overline{rank(j)})^2} }

    depending on the ``user_based`` field of ``sim_options`` (see
    :ref:`similarity_measures_configuration`).

    Note: if there are no common users or items, similarity will be 0 (and not
    -1).

    For details on Spearman coefficient, see in chapter 4, page 126 of
    *Recommender Systems Handbook*.

    Args:
        n_x: Number of xs; the returned matrix is ``n_x`` by ``n_x`` and
            every x index found in ``yr`` must be smaller than ``n_x``.
        yr: Dictionary mapping each y to the list of ``(x, rating)`` pairs
            given for that y.
        min_support: Minimum number of common ys required for a pair of xs;
            pairs with fewer common ys get a similarity of 0.

    Returns:
        The symmetric similarity matrix, with ones on the diagonal.
    """

    # number of common ys
    cdef np.ndarray[np.int_t, ndim=2] freq
    # sum (rank_xy * rank_x'y) for common ys
    cdef np.ndarray[np.double_t, ndim=2] prods
    # sum (rank_xy ^ 2) for common ys
    cdef np.ndarray[np.double_t, ndim=2] sqi
    # sum (rank_x'y ^ 2) for common ys
    cdef np.ndarray[np.double_t, ndim=2] sqj
    # sum (rank_xy) for common ys
    cdef np.ndarray[np.double_t, ndim=2] si
    # sum (rank_x'y) for common ys
    cdef np.ndarray[np.double_t, ndim=2] sj
    # the similarity matrix
    cdef np.ndarray[np.double_t, ndim=2] sim
    # rank of the current y's rating for every x that rated it
    cdef np.ndarray[np.double_t, ndim=1] ranks

    cdef int xi, xj
    cdef double ri
    cdef int min_sprt = min_support

    # np.int_ (not the removed np.int alias) matches the np.int_t buffer type
    freq = np.zeros((n_x, n_x), np.int_)
    prods = np.zeros((n_x, n_x), np.double)
    sqi = np.zeros((n_x, n_x), np.double)
    sqj = np.zeros((n_x, n_x), np.double)
    si = np.zeros((n_x, n_x), np.double)
    sj = np.zeros((n_x, n_x), np.double)
    sim = np.zeros((n_x, n_x), np.double)
    ranks = np.zeros(n_x, np.double)

    # Gather, for every x, the ys it rated and the matching ratings. This
    # replaces a dense (number of ys) x (n_x) matrix: memory stays
    # proportional to the number of ratings and the keys of yr do not need
    # to be contiguous integers.
    ys_of = [[] for _ in range(n_x)]
    ratings_of = [[] for _ in range(n_x)]
    for y, y_ratings in iteritems(yr):
        for xi, ri in y_ratings:
            ys_of[xi].append(y)
            ratings_of[xi].append(ri)

    # Replace every rating by its rank among the ratings of the same x
    # (ties get the average rank, which is rankdata's default). Ranking
    # only the actual ratings differs from ranking a zero-filled column by
    # a constant per x, which leaves the correlation unchanged.
    rank_of = []
    for xi in range(n_x):
        if ratings_of[xi]:
            rank_of.append(dict(zip(ys_of[xi], rankdata(ratings_of[xi]))))
        else:
            rank_of.append({})

    # NOTE(review): ranks are computed over *all* the ratings of an x, not
    # only over the ys it has in common with the x it is compared to. A
    # reviewer pointed out that re-ranking on the common ys alone could
    # give different values (in particular with ties) -- to be confirmed.
    for y, y_ratings in iteritems(yr):
        for xi, _ in y_ratings:
            ranks[xi] = rank_of[xi][y]

        for xi, _ in y_ratings:
            for xj, _ in y_ratings:
                prods[xi, xj] += ranks[xi] * ranks[xj]
                freq[xi, xj] += 1
                sqi[xi, xj] += ranks[xi]**2
                sqj[xi, xj] += ranks[xj]**2
                si[xi, xj] += ranks[xi]
                sj[xi, xj] += ranks[xj]

    for xi in range(n_x):
        sim[xi, xi] = 1
        for xj in range(xi + 1, n_x):

            if freq[xi, xj] < min_sprt:
                sim[xi, xj] = 0
            else:
                n = freq[xi, xj]
                num = (n * prods[xi, xj]) - (si[xi, xj] * sj[xi, xj])
                denom_l = (n * sqi[xi, xj]) - si[xi, xj]**2
                denom_r = (n * sqj[xi, xj]) - sj[xi, xj]**2
                # Both terms are non-negative in exact arithmetic; treat a
                # zero (constant ranks) or a round-off negative value as
                # "no correlation" instead of dividing by 0 or taking the
                # square root of a negative number.
                if denom_l <= 0 or denom_r <= 0:
                    sim[xi, xj] = 0
                else:
                    sim[xi, xj] = num / np.sqrt(denom_l * denom_r)

            sim[xj, xi] = sim[xi, xj]

    return sim
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,11 +12,11 @@ | |
|
||
n_x = 8 | ||
yr_global = { | ||
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa | ||
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa | ||
1: [(0, 4), (1, 4), (2, 4), ], # noqa | ||
2: [ (2, 5), (3, 2), (4, 3) ], # noqa | ||
3: [(1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa | ||
4: [(1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa | ||
3: [ (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa | ||
4: [ (1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa | ||
} | ||
|
||
|
||
|
@@ -33,7 +33,7 @@ def test_cosine_sim(): | |
|
||
sim = sims.cosine(n_x, yr, min_support=1) | ||
|
||
# check symetry and bounds (as ratings are > 0, cosine sim must be >= 0) | ||
# check symmetry and bounds (as ratings are > 0, cosine sim must be >= 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. lol, thanks for correcting the typos. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Always leave the place cleaner than you found it. ^^ |
||
for xi in range(n_x): | ||
assert sim[xi, xi] == 1 | ||
for xj in range(n_x): | ||
|
@@ -81,7 +81,7 @@ def test_msd_sim(): | |
|
||
sim = sims.msd(n_x, yr, min_support=1) | ||
|
||
# check symetry and bounds. MSD sim must be in [0, 1] | ||
# check symmetry and bounds. MSD sim must be in [0, 1] | ||
for xi in range(n_x): | ||
assert sim[xi, xi] == 1 | ||
for xj in range(n_x): | ||
|
@@ -120,7 +120,7 @@ def test_pearson_sim(): | |
random.shuffle(ratings) | ||
|
||
sim = sims.pearson(n_x, yr, min_support=1) | ||
# check symetry and bounds. -1 <= pearson coeff <= 1 | ||
# check symmetry and bounds. -1 <= pearson coeff <= 1 | ||
for xi in range(n_x): | ||
assert sim[xi, xi] == 1 | ||
for xj in range(n_x): | ||
|
@@ -182,7 +182,7 @@ def test_pearson_baseline_sim(): | |
x_biases = np.random.normal(0, 1, n_x) # fake | ||
y_biases = np.random.normal(0, 1, 5) # fake (there are 5 ys) | ||
sim = sims.pearson_baseline(n_x, yr, 1, global_mean, x_biases, y_biases) | ||
# check symetry and bounds. -1 <= pearson coeff <= 1 | ||
# check symmetry and bounds. -1 <= pearson coeff <= 1 | ||
for xi in range(n_x): | ||
assert sim[xi, xi] == 1 | ||
for xj in range(n_x): | ||
|
@@ -205,3 +205,59 @@ def test_pearson_baseline_sim(): | |
for j in range(i + 1, n_x): | ||
if i != 1 and j != 2: | ||
assert sim[i, j] == 0 | ||
|
||
|
||
def test_spearman_sim():
    """Test for spearman similarity.

    Checks symmetry and bounds of the similarity matrix, a few pairs whose
    value is known in advance, and that ``min_support`` is honoured.
    """

    # Copy every rating list: dict.copy() is shallow, so shuffling the lists
    # of a shallow copy would reorder the shared yr_global fixture as well.
    yr = {y: list(ratings) for y, ratings in yr_global.items()}

    # shuffle every rating list, to ensure the order in which ratings are
    # processed does not matter (it's important because it used to be error
    # prone when we were using itertools.combinations)
    for ratings in yr.values():
        random.shuffle(ratings)

    sim = sims.spearman(n_x, yr, min_support=1)
    # check symmetry and bounds. -1 <= spearman coeff <= 1
    for xi in range(n_x):
        assert sim[xi, xi] == 1
        for xj in range(n_x):
            assert sim[xi, xj] == sim[xj, xi]
            assert -1 <= sim[xi, xj] <= 1

    # on common items, users 0, 1 and 2 have the same ratings
    assert sim[0, 1] == 1
    assert sim[0, 2] == 1

    # for vectors with constant ratings, the similarity is necessarily zero
    # (as the ranks are centered)
    assert sim[3, 4] == 0
    assert sim[2, 3] == 0
    assert sim[2, 4] == 0

    # pairs of users (0, 3) and (0, 4) have no common items
    assert sim[0, 3] == 0
    assert sim[0, 4] == 0

    # ratings have same rankings
    assert sim[5, 6] == 1

    # check for float point support and computation correctness: on their
    # three common items, users 6 and 7 have ranks (1, 3, 2) and (3, 1, 2)
    mean6 = (1 + 2 + 3) / 3
    var6 = (3 - mean6) ** 2 + (1 - mean6) ** 2 + (2 - mean6) ** 2
    mean7 = (1 + 2 + 3) / 3
    var7 = (1 - mean7) ** 2 + (3 - mean7) ** 2 + (2 - mean7) ** 2
    num = sum([((3 - mean6) * (1 - mean7)),
               ((1 - mean6) * (3 - mean7)),
               ((2 - mean6) * (2 - mean7))
               ])
    expected = num / (var6 * var7) ** 0.5
    # compare with a tolerance rather than exact float equality
    assert abs(sim[6, 7] - expected) < 1e-10

    # ensure min_support is taken into account. Only users 1 and 2 have at
    # least 4 common ratings, so every other pair must have a zero similarity.
    sim = sims.spearman(n_x, yr, min_support=4)
    for i in range(n_x):
        for j in range(i + 1, n_x):
            if (i, j) != (1, 2):
                assert sim[i, j] == 0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is going to be huge (n_users * n_items).
Passing `xr` as well would avoid the need to create `matrix`, right? If that's the case, then we should do it.