NicolasHug · ghost · Apr 7, 2018 · May 1, 2018 · May 1, 2018 · Jun 27, 2018
diff --git a/.travis.yml b/.travis.yml
@@ -16,3 +16,4 @@ script:
 branches:
   only:
   - master
+  - spearman
diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py
@@ -269,7 +269,8 @@ def compute_similarities(self, verbose=False):
         construction_func = {'cosine': sims.cosine,
                              'msd': sims.msd,
                              'pearson': sims.pearson,
-                             'pearson_baseline': sims.pearson_baseline}
+                             'pearson_baseline': sims.pearson_baseline,
+                             'spearman': sims.spearman}
 
         if self.sim_options['user_based']:
             n_x, yr = self.trainset.n_users, self.trainset.ir

diff --git a/surprise/similarities.pyx b/surprise/similarities.pyx
@@ -13,6 +13,7 @@ Available similarity measures:
     msd
     pearson
     pearson_baseline
+    spearman
 """
 
 from __future__ import (absolute_import, division, print_function,
@@ -24,6 +25,8 @@ import numpy as np
 from six.moves import range
 from six import iteritems
 
+from scipy.stats import rankdata
+
 
 def cosine(n_x, yr, min_support):
     """Compute the cosine similarity between all pairs of users (or items).
@@ -196,7 +199,8 @@ def pearson(n_x, yr, min_support):
     -1).
 
     For details on Pearson coefficient, see `Wikipedia
-    <https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient#For_a_sample>`__.
+    <https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_
+    coefficient#For_a_sample>`__.
 
     """
 
@@ -359,3 +363,121 @@ def pearson_baseline(n_x, yr, min_support, global_mean, x_biases, y_biases,
             sim[xj, xi] = sim[xi, xj]
 
     return sim
+
+
+def spearman(n_x, yr, min_support):
+    """Compute the Spearman correlation coefficient between all pairs of users
+    (or items).
+
+    Only **common** users (or items) are taken into account.
+    The Spearman rank correlation is the Pearson correlation of ranks: each
+    rating is replaced by its rank among the same user's (or item's) ratings.
+
+    The Spearman rank correlation is suitable for the investigation of random
+    variables which are not based on a normal distribution.
+
+    It is defined by:
+
+    .. math ::
+        \\text{spearman_sim}(u, v) = \\frac{ \\sum\\limits_{i \\in I_{uv}}
+        (rank(r_{ui}) - \\overline{rank(u)}) \\cdot (rank(r_{vi}) -
+        \\overline{rank(v)})} {\\sqrt{\\sum\\limits_{i
+        \\in I_{uv}} (rank(r_{ui}) - \\overline{rank(u)})^2} \\cdot
+        \\sqrt{\\sum\\limits_{i \\in
+        I_{uv}}  (rank(r_{vi}) - \\overline{rank(v)})^2} }
+
+    or
+
+    .. math ::
+        \\text{spearman_sim}(i, j) = \\frac{ \\sum\\limits_{u \\in U_{ij}}
+        (rank(r_{ui}) - \\overline{rank(i)}) \\cdot (rank(r_{uj}) -
+        \\overline{rank(j)})} {\\sqrt{\\sum\\limits_{u
+        \\in U_{ij}} (rank(r_{ui}) - \\overline{rank(i)})^2} \\cdot
+        \\sqrt{\\sum\\limits_{u \\in
+        U_{ij}}  (rank(r_{uj}) - \\overline{rank(j)})^2} }
+
+    depending on the ``user_based`` field of ``sim_options`` (see
+    :ref:`similarity_measures_configuration`).
+
+    ``yr`` maps each y to a list of ``(x, rating)`` pairs. Note: similarity
+    is 0 (and not -1) for pairs with no common users or items, with fewer than
+    ``min_support`` of them, or with constant ranks. The result is (n_x, n_x).
+
+    For details on the Spearman coefficient, see chapter 4, page 126 of
+    *Recommender Systems Handbook*.
+
+    """
+
+    # number of common ys
+    cdef np.ndarray[np.int_t, ndim=2] freq
+    # sum (rank_xy * rank_x'y) for common ys
+    cdef np.ndarray[np.double_t, ndim=2] prods
+    # sum (rank_xy ^ 2) for common ys
+    cdef np.ndarray[np.double_t, ndim=2] sqi
+    # sum (rank_x'y ^ 2) for common ys
+    cdef np.ndarray[np.double_t, ndim=2] sqj
+    # sum (rank_xy) for common ys
+    cdef np.ndarray[np.double_t, ndim=2] si
+    # sum (rank_x'y) for common ys
+    cdef np.ndarray[np.double_t, ndim=2] sj
+    # the similarity matrix
+    cdef np.ndarray[np.double_t, ndim=2] sim
+    # ranks of the current y's raters (indexed by x); dense (y, x) rank matrix
+    cdef np.ndarray[np.double_t, ndim=1] ranks
+    cdef np.ndarray[np.double_t, ndim=2] matrix
+
+    cdef int xi, xj
+    cdef double ri
+    cdef int min_sprt = min_support
+
+    freq = np.zeros((n_x, n_x), np.int_)
+    prods = np.zeros((n_x, n_x), np.double)
+    sqi = np.zeros((n_x, n_x), np.double)
+    sqj = np.zeros((n_x, n_x), np.double)
+    si = np.zeros((n_x, n_x), np.double)
+    sj = np.zeros((n_x, n_x), np.double)
+    sim = np.zeros((n_x, n_x), np.double)
+    ranks = np.zeros(n_x, np.double)
+    matrix = np.zeros((len(yr), n_x), np.double)
+
+    # yr as a dense (y, x) matrix, 0 = unrated; assumes ys are 0..len(yr)-1
+    for y, y_ratings in iteritems(yr):
+        for xi, ri in y_ratings:
+            matrix[y, xi] = ri
+    # rank every column; 0 placeholders rank too (harmless while ratings > 0)
+    for xi in range(n_x):
+        matrix[:, xi] = rankdata(matrix[:, xi])
+
+    for y, y_ratings in iteritems(yr):
+        for xi, ri in y_ratings:
+            # cache the ranks of row y so the pair loop below reads them by x
+            ranks[xi] = matrix[y, xi]
+        for xi, _ in y_ratings:
+            for xj, _ in y_ratings:
+                prods[xi, xj] += ranks[xi] * ranks[xj]
+                freq[xi, xj] += 1
+                sqi[xi, xj] += ranks[xi]**2
+                sqj[xi, xj] += ranks[xj]**2
+                si[xi, xj] += ranks[xi]
+                sj[xi, xj] += ranks[xj]
+
+    for xi in range(n_x):
+        sim[xi, xi] = 1
+        for xj in range(xi + 1, n_x):
+            # Pearson's formula applied to the rank sums accumulated above
+            if freq[xi, xj] < min_sprt:
+                sim[xi, xj] = 0
+            else:
+                n = freq[xi, xj]
+                num = (n * prods[xi, xj]) - (si[xi, xj] * sj[xi, xj])
+                denum_l = (n * sqi[xi, xj]) - si[xi, xj]**2
+                denum_r = (n * sqj[xi, xj]) - sj[xi, xj]**2
+                denum = np.sqrt(denum_l * denum_r)
+                if denum == 0:
+                    sim[xi, xj] = 0
+                else:
+                    sim[xi, xj] = num / denum
+
+            sim[xj, xi] = sim[xi, xj]
+
+    return sim
diff --git a/tests/test_sim_options.py b/tests/test_sim_options.py
@@ -34,8 +34,13 @@ def test_name_field(u1_ml100k, pkf):
     algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
     rmse_pearson_bsl = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
 
+    sim_options = {'name': 'spearman'}
+    bsl_options = {'n_epochs': 1}
+    algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
+    rmse_spearman = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
+
     for rmse_a, rmse_b in combinations((rmse_cosine, rmse_msd, rmse_pearson,
-                                        rmse_pearson_bsl), 2):
+                                        rmse_pearson_bsl, rmse_spearman), 2):
         assert (rmse_a != rmse_b)
 
     with pytest.raises(NameError):

diff --git a/tests/test_similarities.py b/tests/test_similarities.py
@@ -12,11 +12,11 @@
 
 n_x = 8
 yr_global = {
-    0: [(0, 3), (1, 3), (2, 3), (5, 1),                 (6, 1.5), (7, 3)],  # noqa
+    0: [(0, 3), (1, 3), (2, 3),                 (5, 1), (6, 1.5), (7, 3)],  # noqa
     1: [(0, 4), (1, 4), (2, 4),                                         ],  # noqa
     2: [                (2, 5), (3, 2), (4, 3)                          ],  # noqa
-    3: [(1, 1),         (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)],  # noqa
-    4: [(1, 5),         (2, 1),                 (5, 2), (6, 2.5), (7, 2.5)], # noqa
+    3: [        (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)],  # noqa
+    4: [        (1, 5), (2, 1),                 (5, 2), (6, 2.5), (7, 2.5)], # noqa
 }
 
 
@@ -33,7 +33,7 @@ def test_cosine_sim():
 
     sim = sims.cosine(n_x, yr, min_support=1)
 
-    # check symetry and bounds (as ratings are > 0, cosine sim must be >= 0)
+    # check symmetry and bounds (as ratings are > 0, cosine sim must be >= 0)
     for xi in range(n_x):
         assert sim[xi, xi] == 1
         for xj in range(n_x):
@@ -81,7 +81,7 @@ def test_msd_sim():
 
     sim = sims.msd(n_x, yr, min_support=1)
 
-    # check symetry and bounds. MSD sim must be in [0, 1]
+    # check symmetry and bounds. MSD sim must be in [0, 1]
     for xi in range(n_x):
         assert sim[xi, xi] == 1
         for xj in range(n_x):
@@ -120,7 +120,7 @@ def test_pearson_sim():
         random.shuffle(ratings)
 
     sim = sims.pearson(n_x, yr, min_support=1)
-    # check symetry and bounds. -1 <= pearson coeff <= 1
+    # check symmetry and bounds. -1 <= pearson coeff <= 1
     for xi in range(n_x):
         assert sim[xi, xi] == 1
         for xj in range(n_x):
@@ -182,7 +182,7 @@ def test_pearson_baseline_sim():
     x_biases = np.random.normal(0, 1, n_x)  # fake
     y_biases = np.random.normal(0, 1, 5)  # fake (there are 5 ys)
     sim = sims.pearson_baseline(n_x, yr, 1, global_mean, x_biases, y_biases)
-    # check symetry and bounds. -1 <= pearson coeff <= 1
+    # check symmetry and bounds. -1 <= pearson coeff <= 1
     for xi in range(n_x):
         assert sim[xi, xi] == 1
         for xj in range(n_x):
@@ -205,3 +205,59 @@ def test_pearson_baseline_sim():
         for j in range(i + 1, n_x):
             if i != 1 and j != 2:
                 assert sim[i, j] == 0
+
+
+def test_spearman_sim():
+    """Check symmetry, bounds, known values and min_support for spearman."""
+
+    yr = yr_global.copy()
+
+    # shuffle every rating list (in place, the copy above being shallow) to
+    # ensure the order in which ratings are processed does not matter (it used
+    # to be error prone when we were using itertools.combinations)
+    for ratings in yr.values():
+        random.shuffle(ratings)
+
+    sim = sims.spearman(n_x, yr, min_support=1)
+    # check symmetry and bounds. -1 <= spearman coeff <= 1
+    for xi in range(n_x):
+        assert sim[xi, xi] == 1
+        for xj in range(n_x):
+            assert sim[xi, xj] == sim[xj, xi]
+            assert -1 <= sim[xi, xj] <= 1
+
+    # on common items, users 0, 1 and 2 have the same ratings
+    assert sim[0, 1] == 1
+    assert sim[0, 2] == 1
+
+    # for vectors with constant ratings, spearman sim is necessarily zero (as
+    # ranks are centered)
+    assert sim[3, 4] == 0
+    assert sim[2, 3] == 0
+    assert sim[2, 4] == 0
+
+    # pairs of users (0, 3) and (0, 4) have no common items
+    assert sim[0, 3] == 0
+    assert sim[0, 4] == 0
+
+    # ratings have same rankings
+    assert sim[5, 6] == 1
+
+    # check float support and correctness (ranks below are over common items)
+    mean6 = (1 + 2 + 3) / 3
+    var6 = (3 - mean6) ** 2 + (1 - mean6) ** 2 + (2 - mean6) ** 2
+    mean7 = (1 + 2 + 3) / 3
+    var7 = (1 - mean7) ** 2 + (3 - mean7) ** 2 + (2 - mean7) ** 2
+    num = sum([((3 - mean6) * (1 - mean7)),
+               ((1 - mean6) * (3 - mean7)),
+               ((2 - mean6) * (2 - mean7))
+               ])
+    assert sim[6, 7] == num / (var6 * var7) ** 0.5
+
+    # ensure min_support is taken into account. Only users 1 and 2 have at
+    # least 4 common ratings, so every other pair must get a similarity of 0.
+    sim = sims.spearman(n_x, yr, min_support=4)
+    for i in range(n_x):
+        for j in range(i + 1, n_x):
+            if (i, j) != (1, 2):
+                assert sim[i, j] == 0