Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve and add Spearman #227

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ script:
branches:
only:
- master
- spearman
3 changes: 2 additions & 1 deletion surprise/prediction_algorithms/algo_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,8 @@ def compute_similarities(self, verbose=False):
construction_func = {'cosine': sims.cosine,
'msd': sims.msd,
'pearson': sims.pearson,
'pearson_baseline': sims.pearson_baseline}
'pearson_baseline': sims.pearson_baseline,
'spearman': sims.spearman}

if self.sim_options['user_based']:
n_x, yr = self.trainset.n_users, self.trainset.ir
Expand Down
124 changes: 123 additions & 1 deletion surprise/similarities.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Available similarity measures:
msd
pearson
pearson_baseline
spearman
"""

from __future__ import (absolute_import, division, print_function,
Expand All @@ -24,6 +25,8 @@ import numpy as np
from six.moves import range
from six import iteritems

from scipy.stats import rankdata


def cosine(n_x, yr, min_support):
"""Compute the cosine similarity between all pairs of users (or items).
Expand Down Expand Up @@ -196,7 +199,8 @@ def pearson(n_x, yr, min_support):
-1).

For details on Pearson coefficient, see `Wikipedia
<https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient#For_a_sample>`__.
<https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_
coefficient#For_a_sample>`__.

"""

Expand Down Expand Up @@ -359,3 +363,121 @@ def pearson_baseline(n_x, yr, min_support, global_mean, x_biases, y_biases,
sim[xj, xi] = sim[xi, xj]

return sim


def spearman(n_x, yr, min_support):
"""Compute the Spearman correlation coefficient between all pairs of users
(or items).

Only **common** users (or items) are taken into account.
The Spearman rank correlation is a variation of the Pearson correlation.
The ratings are replaced by their rankings.

The Spearman Rank Correlation is suitable for the investigation of random
variables which are not based on a normal distribution.

It is defined by:

.. math ::
\\text{spearman_sim}(u, v) = \\frac{ \\sum\\limits_{i \\in I_{uv}}
(rank(r_{ui}) - \\overline{rank(u)}) \\cdot (rank(r_{vi}) -
\\overline{rank(v)})} {\\sqrt{\\sum\\limits_{i
\\in I_{uv}} (rank(r_{ui}) - \\overline{rank(u)})^2} \\cdot
\\sqrt{\\sum\\limits_{i \\in
I_{uv}} (rank(r_{vi}) - \\overline{rank(v)})^2} }

or

.. math ::
\\text{spearman_sim}(i, j) = \\frac{ \\sum\\limits_{u \\in U_{ij}}
(rank(r_{ui}) - \\overline{rank(i)}) \\cdot (rank(r_{uj}) -
\\overline{rank(j)})} {\\sqrt{\\sum\\limits_{u
\\in U_{ij}} (rank(r_{ui}) - \\overline{rank(i)})^2} \\cdot
\\sqrt{\\sum\\limits_{u \\in
U_{ij}} (rank(r_{uj}) - \\overline{rank(j)})^2} }

depending on the ``user_based`` field of ``sim_options`` (see
:ref:`similarity_measures_configuration`).


Note: if there are no common users or items, similarity will be 0 (and not
-1).

For details on Spearman coefficient, see in chapter 4, page 126 of
*Recommender Systems Handbook*.

"""

# number of common ys
cdef np.ndarray[np.int_t, ndim=2] freq
# sum (rank_xy * rank_x'y) for common ys
cdef np.ndarray[np.double_t, ndim=2] prods
# sum (rank_xy ^ 2) for common ys
cdef np.ndarray[np.double_t, ndim=2] sqi
# sum (rank_x'y ^ 2) for common ys
cdef np.ndarray[np.double_t, ndim=2] sqj
# sum (rank_xy) for common ys
cdef np.ndarray[np.double_t, ndim=2] si
# sum (rank_x'y) for common ys
cdef np.ndarray[np.double_t, ndim=2] sj
# the similarity matrix
cdef np.ndarray[np.double_t, ndim=2] sim

cdef np.ndarray[np.double_t, ndim=1] ranks
cdef np.ndarray[np.double_t, ndim=2] matrix

cdef int xi, xj
cdef double ri, rj
cdef int min_sprt = min_support

freq = np.zeros((n_x, n_x), np.int)
prods = np.zeros((n_x, n_x), np.double)
sqi = np.zeros((n_x, n_x), np.double)
sqj = np.zeros((n_x, n_x), np.double)
si = np.zeros((n_x, n_x), np.double)
sj = np.zeros((n_x, n_x), np.double)
sim = np.zeros((n_x, n_x), np.double)
ranks = np.zeros(n_x, np.double)
matrix = np.zeros((len(yr), n_x), np.double)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is going to be huge (n_users * n_items).

Passing `xr` as well would avoid the need to create `matrix`, right? If that's the case, then we should do it.


# turn yr into a matrix
for y, y_ratings in iteritems(yr):
for x_i, r_i in y_ratings:
matrix[y, x_i] = r_i
# turn the yr matrix into a matrix which contains the ranks
for x_i in range(n_x):
matrix[:, x_i] = rankdata(matrix[:, x_i])

for y, y_ratings in iteritems(yr):
for xi, ri in y_ratings:
# use the ranking matrix to get the elements row by row
ranks[xi] = matrix[y, xi]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there might be a problem here:

ranks[xi] contains the ranks for all the ys, right?

But when we compare 2 xs, we only want to do that on the basis of their common ys. In the subsequent code you will compare them on the basis of all the ys.

Say we have 5 items and 2 users

ratings:
user 1: 1, 2, X, 4, 5
user 2: X, X, 1, 5, 2

The ranks are:

ranks:
user 1: 1, 2, X, 4, 5
user 2: X, X, 1, 3, 2

But on the common items the ratings are

ratings:
user 1: X, X, X, 4, 5
user 2: X, X, X, 5, 2

and the ranks are then

ranks:
user 1: X, X, X, 1, 2
user 2: X, X, X, 2, 1

So your code will consider the ranks

ranks:
user 1: 4, 5
user 2: 3, 2

while it should actually be considering

ranks:
user 1: 1, 2
user 2: 2, 1

Maybe this has no impact because the relative order of each rank will stay the same, and it has no effect on pearson? I don't know what would happen if there are ties though...

for xi, _ in y_ratings:
for xj, _ in y_ratings:
prods[xi, xj] += ranks[xi] * ranks[xj]
freq[xi, xj] += 1
sqi[xi, xj] += ranks[xi]**2
sqj[xi, xj] += ranks[xj]**2
si[xi, xj] += ranks[xi]
sj[xi, xj] += ranks[xj]

for xi in range(n_x):
sim[xi, xi] = 1
for xj in range(xi + 1, n_x):

if freq[xi, xj] < min_sprt:
sim[xi, xj] = 0
else:
n = freq[xi, xj]
num = (n * prods[xi, xj]) - (si[xi, xj] * sj[xi, xj])
denum_l = (n * sqi[xi, xj]) - si[xi, xj]**2
denum_r = (n * sqj[xi, xj]) - sj[xi, xj]**2
denum = np.sqrt(denum_l * denum_r)
if denum == 0:
sim[xi, xj] = 0
else:
sim[xi, xj] = num / denum

sim[xj, xi] = sim[xi, xj]

return sim
7 changes: 6 additions & 1 deletion tests/test_sim_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,13 @@ def test_name_field(u1_ml100k, pkf):
algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
rmse_pearson_bsl = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

sim_options = {'name': 'spearman'}
bsl_options = {'n_epochs': 1}
algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
rmse_spearman = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

for rmse_a, rmse_b in combinations((rmse_cosine, rmse_msd, rmse_pearson,
rmse_pearson_bsl), 2):
rmse_pearson_bsl, rmse_spearman), 2):
assert (rmse_a != rmse_b)

with pytest.raises(NameError):
Expand Down
70 changes: 63 additions & 7 deletions tests/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@

n_x = 8
yr_global = {
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
1: [(0, 4), (1, 4), (2, 4), ], # noqa
2: [ (2, 5), (3, 2), (4, 3) ], # noqa
3: [(1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
4: [(1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
3: [ (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
4: [ (1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
}


Expand All @@ -33,7 +33,7 @@ def test_cosine_sim():

sim = sims.cosine(n_x, yr, min_support=1)

# check symetry and bounds (as ratings are > 0, cosine sim must be >= 0)
# check symmetry and bounds (as ratings are > 0, cosine sim must be >= 0)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lol thanks for correcting the typos

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Always leave the place cleaner than you found it. ^^

for xi in range(n_x):
assert sim[xi, xi] == 1
for xj in range(n_x):
Expand Down Expand Up @@ -81,7 +81,7 @@ def test_msd_sim():

sim = sims.msd(n_x, yr, min_support=1)

# check symetry and bounds. MSD sim must be in [0, 1]
# check symmetry and bounds. MSD sim must be in [0, 1]
for xi in range(n_x):
assert sim[xi, xi] == 1
for xj in range(n_x):
Expand Down Expand Up @@ -120,7 +120,7 @@ def test_pearson_sim():
random.shuffle(ratings)

sim = sims.pearson(n_x, yr, min_support=1)
# check symetry and bounds. -1 <= pearson coeff <= 1
# check symmetry and bounds. -1 <= pearson coeff <= 1
for xi in range(n_x):
assert sim[xi, xi] == 1
for xj in range(n_x):
Expand Down Expand Up @@ -182,7 +182,7 @@ def test_pearson_baseline_sim():
x_biases = np.random.normal(0, 1, n_x) # fake
y_biases = np.random.normal(0, 1, 5) # fake (there are 5 ys)
sim = sims.pearson_baseline(n_x, yr, 1, global_mean, x_biases, y_biases)
# check symetry and bounds. -1 <= pearson coeff <= 1
# check symmetry and bounds. -1 <= pearson coeff <= 1
for xi in range(n_x):
assert sim[xi, xi] == 1
for xj in range(n_x):
Expand All @@ -205,3 +205,59 @@ def test_pearson_baseline_sim():
for j in range(i + 1, n_x):
if i != 1 and j != 2:
assert sim[i, j] == 0


def test_spearman_sim():
    """Test for spearman similarity.

    Checks symmetry and bounds of the similarity matrix, a few hand-computed
    values on the ``yr_global`` fixture, and that ``min_support`` zeroes out
    every pair that does not share enough ratings.
    """

    # dict.copy() is shallow: shuffling the lists of a plain copy would
    # reorder the module-level fixture in place. Copy every list instead.
    yr = {y: list(ratings) for y, ratings in yr_global.items()}

    # shuffle every rating list, to ensure the order in which ratings are
    # processed does not matter (it's important because it used to be error
    # prone when we were using itertools.combinations)
    for ratings in yr.values():
        random.shuffle(ratings)

    sim = sims.spearman(n_x, yr, min_support=1)
    # check symmetry and bounds. -1 <= spearman coeff <= 1
    for xi in range(n_x):
        assert sim[xi, xi] == 1
        for xj in range(n_x):
            assert sim[xi, xj] == sim[xj, xi]
            assert -1 <= sim[xi, xj] <= 1

    # on common items, users 0, 1 and 2 have the same ratings
    assert sim[0, 1] == 1
    assert sim[0, 2] == 1

    # for vectors with constant ratings the ranks are constant as well, so
    # the spearman sim is necessarily zero (as ranks are centered)
    assert sim[3, 4] == 0
    assert sim[2, 3] == 0
    assert sim[2, 4] == 0

    # pairs of users (0, 3), have no common items
    assert sim[0, 3] == 0
    assert sim[0, 4] == 0

    # ratings have same rankings
    assert sim[5, 6] == 1

    # check for float point support and computation correctness: the values
    # below are the ranks of users 6 and 7 on their three common items
    mean6 = (1 + 2 + 3) / 3
    var6 = (3 - mean6) ** 2 + (1 - mean6) ** 2 + (2 - mean6) ** 2
    mean7 = (1 + 2 + 3) / 3
    var7 = (1 - mean7) ** 2 + (3 - mean7) ** 2 + (2 - mean7) ** 2
    num = sum([((3 - mean6) * (1 - mean7)),
               ((1 - mean6) * (3 - mean7)),
               ((2 - mean6) * (2 - mean7))
               ])
    assert sim[6, 7] == num / (var6 * var7) ** 0.5

    # ensure min_support is taken into account. Only users 1 and 2 have at
    # least 4 common ratings, so every other pair must have a similarity of 0.
    sim = sims.spearman(n_x, yr, min_support=4)
    for i in range(n_x):
        for j in range(i + 1, n_x):
            # `i != 1 and j != 2` would also skip every other pair involving
            # user 1 (as i) or user 2 (as j); exclude the pair (1, 2) only.
            if (i, j) != (1, 2):
                assert sim[i, j] == 0