From d597882a3644856357d5cf6183af75308a31240e Mon Sep 17 00:00:00 2001
From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com>
Date: Wed, 20 Apr 2022 12:14:59 +0200
Subject: [PATCH 1/8] Integrate-graph-objects into hdbscan_.py

Create a new metric option called "graph", which takes a graph as a CSR adjacency matrix and runs HDBSCAN on the csgraph minimum spanning tree of the given graph.

The example file (plot_hdbscan_graph.py) demonstrates how the function works and displays example plots of the graphs.
---
 examples/plot_hdbscan_graph.py | 147 +++++++++++++++++++++++++++++++++
 hdbscan/hdbscan_.py            |  42 ++++++++++
 2 files changed, 189 insertions(+)
 create mode 100644 examples/plot_hdbscan_graph.py

diff --git a/examples/plot_hdbscan_graph.py b/examples/plot_hdbscan_graph.py
new file mode 100644
index 00000000..eac762af
--- /dev/null
+++ b/examples/plot_hdbscan_graph.py
@@ -0,0 +1,147 @@
+import numpy
+import numpy as np
+import sklearn.metrics
+from scipy import sparse
+import igraph
+import networkx as nx
+import time
+from hdbscan import (
+    HDBSCAN,
+    hdbscan,
+    validity_index,
+    approximate_predict,
+    approximate_predict_scores,
+    membership_vector,
+    all_points_membership_vectors,
+)
+
+
+def create_distance_matrix(graph):
+    """
+    Creates a distance matrix from the given graph using the igraph shortest path algorithm.
+    :param graph: An igraph graph object.
+    :return: Scipy csr matrix based on the graph.
+    """
+
+    # create a distance matrix based of the graph
+    # create variables
+    path_weight, vertex_from_list, vertex_to_list, vertex_from = [], [], [], 0
+
+    for vertex in graph.vs:
+        list_edges_shortest_path = graph.get_shortest_paths(vertex, to=None, weights="weight", mode='out',
+                                                            output="epath")
+        vertex_to = 0
+
+        for edge_list in list_edges_shortest_path:
+            if edge_list:
+                vertex_from_list.append(vertex_from)
+                vertex_to_list.append(vertex_to)
+                path_weight.append(sum(graph.es.select(edge_list)["weight"]))
+            else:
+                vertex_from_list.append(vertex_from)
+                vertex_to_list.append(vertex_to)
+                path_weight.append(0)
+
+            vertex_to += 1
+        vertex_from += 1
+
+    distance_matrix = sparse.csr_matrix((path_weight, (vertex_from_list, vertex_to_list)))
+
+    return distance_matrix
+
+
+def hdbscan_graph():
+    """
+    Creates a weighted stochastic_block_model graph to compare the newly created graph function of HDBSCAN
+    to the precomputed metric using a distance matrix created for the graph.
+    """
+    # measure time
+    start_build_graph = time.time()
+
+    # set parameters graph and edges
+    number_communities = 4
+    edge_weight_in_community = 0.1
+    edge_weight_out_community = 1
+
+    # create graph
+    community_sizes = np.random.randint(low=30, high=70, size=number_communities)
+    matrix_prob = np.random.rand(number_communities, number_communities)
+    matrix_prob = (np.tril(matrix_prob) + np.tril(matrix_prob, -1).T) * 0.5
+    numpy.fill_diagonal(matrix_prob, 0.7)
+    sbm_graph = nx.stochastic_block_model(community_sizes, matrix_prob, seed=0)
+
+    # convert to igraph object
+    graph = igraph.Graph(n=sbm_graph.number_of_nodes(), directed=False)
+    graph.add_edges(sbm_graph.edges())
+
+    # check for double edges and loops and delete those
+    graph.simplify()
+    graph.vs.select(_degree=0).delete()
+
+    # run community detection to assign edge weights
+    community_detection = graph.community_multilevel()
+
+    # add edge weights
+    weight_list = []
+    for edge in graph.es:
+        vertex_1 = edge.source
+        vertex_2 = edge.target
+        edge_weight_added = False
+        for subgraph in community_detection:
+            if vertex_1 in subgraph and vertex_2 in subgraph:
+                weight_list.append(edge_weight_in_community)
+                edge_weight_added = True
+        if not edge_weight_added:
+            weight_list.append(edge_weight_out_community)
+
+    graph.es["weight"] = weight_list
+
+    print("Graph created:", time.time() - start_build_graph)
+
+    # run HDBSCAN on graph distance matrix
+    start_distance_matrix = time.time()
+
+    # create a distance matrix from the graph
+    distance_matrix = create_distance_matrix(graph)
+
+    # run HDBSCAN on the created distance matrix
+    clusterer = HDBSCAN(metric="precomputed").fit(distance_matrix)
+    labels_distance_matrix = clusterer.labels_
+
+    # measure time
+    print("HDBSCAN distance matrix:", time.time() - start_distance_matrix)
+
+    # plot graph clustering using iGraph
+    graph.vs["label_distance_matrix"] = labels_distance_matrix
+    vclustering = igraph.clustering.VertexClustering.FromAttribute(graph, "label_distance_matrix")
+    igraph.plot(vclustering)
+
+    """
+    Convert the iGraph graph into a csr sparse matrix, which the modified HDBSCAN function accepts and 
+    transforms into a scipy csgraph. 
+    """
+    # run HDBSCAN using the graph metric
+    start_hdbscan_graph = time.time()
+
+    # create adjacency matrix from the graph, csr sparse matrix format
+    adjacency = graph.get_adjacency_sparse(attribute="weight")
+
+    clusterer = HDBSCAN(metric="graph").fit(adjacency)
+    labels_hdbscan_graph = clusterer.labels_
+
+    print("HDBSCAN graph:", time.time() - start_hdbscan_graph)
+
+    # plot clustering labels using iGraph
+    graph.vs["label_hdbscan_graph"] = labels_hdbscan_graph
+    vclustering = igraph.clustering.VertexClustering.FromAttribute(graph, "label_hdbscan_graph")
+    igraph.plot(vclustering)
+
+    # print the AMI and ARI for the labels
+    print("AMI:", sklearn.metrics.adjusted_mutual_info_score(labels_distance_matrix, labels_hdbscan_graph))
+    print("ARI:", sklearn.metrics.adjusted_rand_score(labels_distance_matrix, labels_hdbscan_graph))
+
+
+"""
+run the example function displaying the graph feature of HDBSCAN
+"""
+hdbscan_graph()
diff --git a/hdbscan/hdbscan_.py b/hdbscan/hdbscan_.py
index 31f60470..b6e6175b 100644
--- a/hdbscan/hdbscan_.py
+++ b/hdbscan/hdbscan_.py
@@ -96,6 +96,10 @@ def _hdbscan_generic(
         #   matrix to indicate missing distance information.
         # TODO: Check if copying is necessary
         distance_matrix = X.copy()
+    elif metric == "graph":
+        # takes the graph csr matrix and converts it directly into a min_span_tree
+        return _hdbscan_graph_to_tree(X, gen_min_span_tree)
+
     else:
         distance_matrix = pairwise_distances(X, metric=metric, **kwargs)
 
@@ -151,6 +155,44 @@ def _hdbscan_generic(
     return single_linkage_tree, result_min_span_tree
 
 
+def _hdbscan_graph_to_tree(
+        graph,
+        gen_min_span_tree
+):
+    # Check the graph for multiple components.
+    # If more than one component it means that there exists points
+    # with less than `min_samples` neighbors
+    if (
+            csgraph.connected_components(graph)[0]
+            > 1
+    ):
+        raise ValueError(
+            (
+                "The passed graph has more than on component."
+                "Run hdbscan on each component."
+            )
+        )
+
+    # graph as an csr distance matrix object
+    sparse_min_spanning_tree = csgraph.minimum_spanning_tree(graph)
+
+    # Convert the graph to scipy cluster array format
+    nonzeros = sparse_min_spanning_tree.nonzero()
+    nonzero_vals = sparse_min_spanning_tree[nonzeros]
+    min_spanning_tree = np.vstack(nonzeros + (nonzero_vals,)).T
+
+    # Sort edges of the min_spanning_tree by weight
+    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :][0]
+
+    # Convert edge list into standard hierarchical clustering format
+    single_linkage_tree = label(min_spanning_tree)
+
+    if gen_min_span_tree:
+        return single_linkage_tree, min_spanning_tree
+    else:
+        return single_linkage_tree, None
+
+
 def _hdbscan_sparse_distance_matrix(
     X,
     min_samples=5,

From ec7f7333d37122326352e5bf54a71a670e9d4d1f Mon Sep 17 00:00:00 2001
From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com>
Date: Mon, 25 Apr 2022 13:58:26 +0200
Subject: [PATCH 2/8] Graph metric changes and tests added

Modified the _hdbscan_sparse_distance_matrix function to integrate the graph metric into the hdbscan code.

Started working on the test for the graph metric.
---
 hdbscan/hdbscan_.py           | 138 +++++++++++++++++-----------------
 hdbscan/tests/test_hdbscan.py |  47 +++++++++---
 2 files changed, 104 insertions(+), 81 deletions(-)

diff --git a/hdbscan/hdbscan_.py b/hdbscan/hdbscan_.py
index b6e6175b..8accef86 100644
--- a/hdbscan/hdbscan_.py
+++ b/hdbscan/hdbscan_.py
@@ -98,7 +98,21 @@ def _hdbscan_generic(
         distance_matrix = X.copy()
     elif metric == "graph":
         # takes the graph csr matrix and converts it directly into a min_span_tree
-        return _hdbscan_graph_to_tree(X, gen_min_span_tree)
+
+        # X should be the adjacency of the graph in csr sparse format
+        adjacency_matrix = X
+
+        # run the distance matrix function with metric "graph" creating a cs min spanning tree
+        return _hdbscan_sparse_distance_matrix(
+            adjacency_matrix,
+            min_samples,
+            alpha,
+            "graph",
+            p,
+            leaf_size,
+            gen_min_span_tree,
+            **kwargs
+        )
 
     else:
         distance_matrix = pairwise_distances(X, metric=metric, **kwargs)
@@ -155,44 +169,6 @@ def _hdbscan_generic(
     return single_linkage_tree, result_min_span_tree
 
 
-def _hdbscan_graph_to_tree(
-        graph,
-        gen_min_span_tree
-):
-    # Check the graph for multiple components.
-    # If more than one component it means that there exists points
-    # with less than `min_samples` neighbors
-    if (
-            csgraph.connected_components(graph)[0]
-            > 1
-    ):
-        raise ValueError(
-            (
-                "The passed graph has more than on component."
-                "Run hdbscan on each component."
-            )
-        )
-
-    # graph as an csr distance matrix object
-    sparse_min_spanning_tree = csgraph.minimum_spanning_tree(graph)
-
-    # Convert the graph to scipy cluster array format
-    nonzeros = sparse_min_spanning_tree.nonzero()
-    nonzero_vals = sparse_min_spanning_tree[nonzeros]
-    min_spanning_tree = np.vstack(nonzeros + (nonzero_vals,)).T
-
-    # Sort edges of the min_spanning_tree by weight
-    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :][0]
-
-    # Convert edge list into standard hierarchical clustering format
-    single_linkage_tree = label(min_spanning_tree)
-
-    if gen_min_span_tree:
-        return single_linkage_tree, min_spanning_tree
-    else:
-        return single_linkage_tree, None
-
-
 def _hdbscan_sparse_distance_matrix(
     X,
     min_samples=5,
@@ -204,42 +180,62 @@ def _hdbscan_sparse_distance_matrix(
     **kwargs
 ):
     assert issparse(X)
-    # Check for connected component on X
-    if csgraph.connected_components(X, directed=False, return_labels=False) > 1:
-        raise ValueError(
-            "Sparse distance matrix has multiple connected "
-            "components!\nThat is, there exist groups of points "
-            "that are completely disjoint -- there are no distance "
-            "relations connecting them\n"
-            "Run hdbscan on each component."
-        )
 
-    lil_matrix = X.tolil()
+    # if the metric is not graph, build a min spanning tree from the sparse matrix
+    if metric != "graph":
+        # Check for connected component on X
+        if csgraph.connected_components(X, directed=False, return_labels=False) > 1:
+            raise ValueError(
+                "Sparse distance matrix has multiple connected "
+                "components!\nThat is, there exist groups of points "
+                "that are completely disjoint -- there are no distance "
+                "relations connecting them\n"
+                "Run hdbscan on each component."
+            )
 
-    # Compute sparse mutual reachability graph
-    # if max_dist > 0, max distance to use when the reachability is infinite
-    max_dist = kwargs.get("max_dist", 0.0)
-    mutual_reachability_ = sparse_mutual_reachability(
-        lil_matrix, min_points=min_samples, max_dist=max_dist, alpha=alpha
-    )
-    # Check connected component on mutual reachability
-    # If more than one component, it means that even if the distance matrix X
-    # has one component, there exists with less than `min_samples` neighbors
-    if (
-        csgraph.connected_components(
-            mutual_reachability_, directed=False, return_labels=False
+        lil_matrix = X.tolil()
+
+        # Compute sparse mutual reachability graph
+        # if max_dist > 0, max distance to use when the reachability is infinite
+        max_dist = kwargs.get("max_dist", 0.0)
+        mutual_reachability_ = sparse_mutual_reachability(
+            lil_matrix, min_points=min_samples, max_dist=max_dist, alpha=alpha
         )
-        > 1
-    ):
-        raise ValueError(
-            (
-                "There exists points with less than %s neighbors. "
-                "Ensure your distance matrix has non zeros values for "
-                "at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), "
-                "or specify a `max_dist` to use when distances are missing."
+        # Check connected component on mutual reachability
+        # If more than one component, it means that even if the distance matrix X
+        # has one component, there exists with less than `min_samples` neighbors
+        if (
+                csgraph.connected_components(
+                    mutual_reachability_, directed=False, return_labels=False
+                )
+                > 1
+        ):
+            raise ValueError(
+                (
+                    "There exists points with less than %s neighbors. "
+                    "Ensure your distance matrix has non zeros values for "
+                    "at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), "
+                    "or specify a `max_dist` to use when distances are missing."
+                )
+                % (min_samples, min_samples)
             )
-            % (min_samples, min_samples)
-        )
+
+    # otherwise convert the csr adjacency matrix from the graph into a minimum spanning tree
+    else:
+        # check components of the graph
+        if (
+                csgraph.connected_components(X)[0]
+                > 1
+        ):
+            raise ValueError(
+                (
+                    "The passed graph has more than on component. \n"
+                    "Run hdbscan on each component."
+                )
+            )
+        # if one component set the mutual_reachability_ to the csr from the graph
+        else:
+            mutual_reachability_ = X
 
     # Compute the minimum spanning tree for the sparse graph
     sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability_)
diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py
index 9e3faa34..433d2c65 100644
--- a/hdbscan/tests/test_hdbscan.py
+++ b/hdbscan/tests/test_hdbscan.py
@@ -2,7 +2,10 @@
 Tests for HDBSCAN clustering algorithm
 Shamelessly based on (i.e. ripped off from) the DBSCAN test code
 """
+import matplotlib.pyplot as plt
 import numpy as np
+import networkx as nx
+import sklearn.metrics
 from scipy.spatial import distance
 from scipy import sparse
 from scipy import stats
@@ -251,11 +254,13 @@ def test_hdbscan_generic():
     n_clusters_2 = len(set(labels)) - int(-1 in labels)
     assert n_clusters_2 == n_clusters
 
+
 def test_hdbscan_dbscan_clustering():
     clusterer = HDBSCAN().fit(X)
     labels = clusterer.dbscan_clustering(0.3)
     n_clusters_1 = len(set(labels)) - int(-1 in labels)
-    assert(n_clusters == n_clusters_1)
+    assert (n_clusters == n_clusters_1)
+
 
 def test_hdbscan_high_dimensional():
     H, y = make_blobs(n_samples=50, random_state=0, n_features=64)
@@ -267,8 +272,8 @@ def test_hdbscan_high_dimensional():
 
     labels = (
         HDBSCAN(algorithm="best", metric="seuclidean", V=np.ones(H.shape[1]))
-        .fit(H)
-        .labels_
+            .fit(H)
+            .labels_
     )
     n_clusters_2 = len(set(labels)) - int(-1 in labels)
     assert n_clusters_2 == n_clusters
@@ -330,7 +335,6 @@ def test_hdbscan_input_lists():
 
 
 def test_hdbscan_boruvka_kdtree_matches():
-
     data = generate_noisy_data()
 
     labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic")
@@ -351,7 +355,6 @@ def test_hdbscan_boruvka_kdtree_matches():
 
 
 def test_hdbscan_boruvka_balltree_matches():
-
     data = generate_noisy_data()
 
     labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic")
@@ -414,7 +417,6 @@ def test_min_span_tree_plot():
 
 
 def test_tree_numpy_output_formats():
-
     clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
 
     clusterer.single_linkage_tree_.to_numpy()
@@ -423,7 +425,6 @@ def test_tree_numpy_output_formats():
 
 
 def test_tree_pandas_output_formats():
-
     clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
     if_pandas(clusterer.condensed_tree_.to_pandas)()
     if_pandas(clusterer.single_linkage_tree_.to_pandas)()
@@ -431,7 +432,6 @@ def test_tree_pandas_output_formats():
 
 
 def test_tree_networkx_output_formats():
-
     clusterer = HDBSCAN(gen_min_span_tree=True).fit(X)
     if_networkx(clusterer.condensed_tree_.to_networkx)()
     if_networkx(clusterer.single_linkage_tree_.to_networkx)()
@@ -576,7 +576,6 @@ def test_hdbscan_badargs():
 
 
 def test_hdbscan_sparse():
-
     sparse_X = sparse.csr_matrix(X)
 
     labels = HDBSCAN().fit(sparse_X).labels_
@@ -585,7 +584,6 @@ def test_hdbscan_sparse():
 
 
 def test_hdbscan_caching():
-
     cachedir = mkdtemp()
     labels1 = HDBSCAN(memory=cachedir, min_samples=5).fit(X).labels_
     labels2 = HDBSCAN(memory=cachedir, min_samples=5, min_cluster_size=6).fit(X).labels_
@@ -646,6 +644,35 @@ def test_hdbscan_is_sklearn_estimator():
     check_estimator(HDBSCAN)
 
 
+def test_hdbscan_graph():
+    # create graph
+    graph = nx.barbell_graph(5, 5)
+    communities_generator = nx.community.louvain_partitions(graph)
+    next_level_communities = next(communities_generator)
+
+    print(sorted(map(sorted, next_level_communities)))
+
+
+    # create the adjacency matrix
+    adjacency = nx.adjacency_matrix(graph)
+
+    # create the distance matrix
+    distance_matrix = nx.floyd_warshall_numpy(graph)
+
+    # run HDBSCAN on the created distance matrix using the "precomputed" metric
+    clusterer = HDBSCAN(metric="precomputed").fit(sparse.csr_matrix(distance_matrix))
+    labels_distance_matrix = clusterer.labels_
+
+    # run HDBSCAN on graph using the csr adjacency matrix with the "graph" metric
+    clusterer = HDBSCAN(metric="graph").fit(adjacency)
+    labels_hdbscan_graph = clusterer.labels_
+
+    print(labels_hdbscan_graph)
+    print(labels_distance_matrix)
+
+    assert sklearn.metrics.adjusted_mutual_info_score(labels_distance_matrix, labels_hdbscan_graph) == 1
+
+
 # Probably not applicable now #
 # def test_dbscan_sparse():
 # def test_dbscan_balltree():

From fd9bad5628b3e093842b80f0372ed8ad347e3b61 Mon Sep 17 00:00:00 2001
From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com>
Date: Mon, 25 Apr 2022 14:49:39 +0200
Subject: [PATCH 3/8] Create test for hdbscan graph metric

Use the test_hdbscan_sparse_distance_matrix test as the basis for the test_hdbscan_graph function. Added the requirements for the metric (igraph and networkx) to requirements.txt.
---
 examples/plot_hdbscan_graph.py | 12 ++----------
 hdbscan/tests/test_hdbscan.py  | 35 ++++++++++++++++------------------
 requirements.txt               |  7 +++++++
 3 files changed, 25 insertions(+), 29 deletions(-)

diff --git a/examples/plot_hdbscan_graph.py b/examples/plot_hdbscan_graph.py
index eac762af..11e5a6ce 100644
--- a/examples/plot_hdbscan_graph.py
+++ b/examples/plot_hdbscan_graph.py
@@ -5,15 +5,7 @@
 import igraph
 import networkx as nx
 import time
-from hdbscan import (
-    HDBSCAN,
-    hdbscan,
-    validity_index,
-    approximate_predict,
-    approximate_predict_scores,
-    membership_vector,
-    all_points_membership_vectors,
-)
+from hdbscan import HDBSCAN
 
 
 def create_distance_matrix(graph):
@@ -59,7 +51,7 @@ def hdbscan_graph():
     start_build_graph = time.time()
 
     # set parameters graph and edges
-    number_communities = 4
+    number_communities = np.random.randint(3, 20, 1)[0]
     edge_weight_in_community = 0.1
     edge_weight_out_community = 1
 
diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py
index 433d2c65..5e86c5dc 100644
--- a/hdbscan/tests/test_hdbscan.py
+++ b/hdbscan/tests/test_hdbscan.py
@@ -2,7 +2,6 @@
 Tests for HDBSCAN clustering algorithm
 Shamelessly based on (i.e. ripped off from) the DBSCAN test code
 """
-import matplotlib.pyplot as plt
 import numpy as np
 import networkx as nx
 import sklearn.metrics
@@ -645,32 +644,30 @@ def test_hdbscan_is_sklearn_estimator():
 
 
 def test_hdbscan_graph():
-    # create graph
-    graph = nx.barbell_graph(5, 5)
-    communities_generator = nx.community.louvain_partitions(graph)
-    next_level_communities = next(communities_generator)
-
-    print(sorted(map(sorted, next_level_communities)))
-
+    # create a distance matrix, see test_hdbscan_distance_matrix
+    D = distance.squareform(distance.pdist(X))
+    D /= np.max(D)
 
-    # create the adjacency matrix
-    adjacency = nx.adjacency_matrix(graph)
+    threshold = stats.scoreatpercentile(D.flatten(), 50)
 
-    # create the distance matrix
-    distance_matrix = nx.floyd_warshall_numpy(graph)
+    D[D >= threshold] = 0.0
+    D = sparse.csr_matrix(D)
+    D.eliminate_zeros()
 
-    # run HDBSCAN on the created distance matrix using the "precomputed" metric
-    clusterer = HDBSCAN(metric="precomputed").fit(sparse.csr_matrix(distance_matrix))
+    # create cluster labels using precomputed metric
+    clusterer = HDBSCAN(metric="precomputed").fit(D)
     labels_distance_matrix = clusterer.labels_
 
-    # run HDBSCAN on graph using the csr adjacency matrix with the "graph" metric
-    clusterer = HDBSCAN(metric="graph").fit(adjacency)
+    # create a graph from the distance matrix and transform the graph to a csr adjacency matrix
+    graph = nx.from_numpy_matrix(D)
+    adjacency_matrix = nx.adjacency_matrix(graph)
+
+    # create cluster labels using the graph metric
+    clusterer = HDBSCAN(metric="graph").fit(adjacency_matrix)
     labels_hdbscan_graph = clusterer.labels_
 
-    print(labels_hdbscan_graph)
-    print(labels_distance_matrix)
+    assert sklearn.metrics.accuracy_score(labels_distance_matrix, labels_hdbscan_graph) == 1
 
-    assert sklearn.metrics.adjusted_mutual_info_score(labels_distance_matrix, labels_hdbscan_graph) == 1
 
 
 # Probably not applicable now #
diff --git a/requirements.txt b/requirements.txt
index 487cfe7b..31dff836 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,10 @@ numpy>=1.20
 scipy>= 1.0
 scikit-learn>=0.20
 joblib>=1.0
+
+pytest~=7.1.1
+hdbscan~=0.8.28
+networkx~=2.8
+matplotlib~=3.5.1
+igraph~=0.9.9
+setuptools~=61.2.0
\ No newline at end of file

From 3f6ad3179cbacabacc9bc14b105ef3ac829ab952 Mon Sep 17 00:00:00 2001
From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com>
Date: Mon, 25 Apr 2022 15:08:47 +0200
Subject: [PATCH 4/8] Clean up code

Added comments and improved the test.
---
 examples/plot_hdbscan_graph.py | 5 +++--
 hdbscan/tests/test_hdbscan.py  | 2 +-
 requirements.txt               | 1 +
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/plot_hdbscan_graph.py b/examples/plot_hdbscan_graph.py
index 11e5a6ce..cc55dd86 100644
--- a/examples/plot_hdbscan_graph.py
+++ b/examples/plot_hdbscan_graph.py
@@ -15,10 +15,10 @@ def create_distance_matrix(graph):
     :return: Scipy csr matrix based on the graph.
     """
 
-    # create a distance matrix based of the graph
     # create variables
     path_weight, vertex_from_list, vertex_to_list, vertex_from = [], [], [], 0
 
+    # create a distance matrix based of the graph
     for vertex in graph.vs:
         list_edges_shortest_path = graph.get_shortest_paths(vertex, to=None, weights="weight", mode='out',
                                                             output="epath")
@@ -37,6 +37,7 @@ def create_distance_matrix(graph):
             vertex_to += 1
         vertex_from += 1
 
+    # transform lists into a csr matrix
     distance_matrix = sparse.csr_matrix((path_weight, (vertex_from_list, vertex_to_list)))
 
     return distance_matrix
@@ -70,7 +71,7 @@ def hdbscan_graph():
     graph.simplify()
     graph.vs.select(_degree=0).delete()
 
-    # run community detection to assign edge weights
+    # run community detection to assign edge weights, the function won't works on unweighted graphs
     community_detection = graph.community_multilevel()
 
     # add edge weights
diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py
index 5e86c5dc..46254703 100644
--- a/hdbscan/tests/test_hdbscan.py
+++ b/hdbscan/tests/test_hdbscan.py
@@ -659,7 +659,7 @@ def test_hdbscan_graph():
     labels_distance_matrix = clusterer.labels_
 
     # create a graph from the distance matrix and transform the graph to a csr adjacency matrix
-    graph = nx.from_numpy_matrix(D)
+    graph = nx.from_numpy_matrix(D.toarray())
     adjacency_matrix = nx.adjacency_matrix(graph)
 
     # create cluster labels using the graph metric
diff --git a/requirements.txt b/requirements.txt
index 31dff836..5d6a5ddd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ hdbscan~=0.8.28
 networkx~=2.8
 matplotlib~=3.5.1
 igraph~=0.9.9
+pycairo~=1.21.0
 setuptools~=61.2.0
\ No newline at end of file

From 6da5a87c6f018785402de4294e359e57ccf9a148 Mon Sep 17 00:00:00 2001
From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com>
Date: Mon, 25 Apr 2022 15:48:40 +0200
Subject: [PATCH 5/8] Reset requirements

Set requirements back to the latest HDBSCAN version.
---
 requirements.txt | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 5d6a5ddd..19ca08da 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,12 +2,4 @@ cython>=0.27
 numpy>=1.20
 scipy>= 1.0
 scikit-learn>=0.20
-joblib>=1.0
-
-pytest~=7.1.1
-hdbscan~=0.8.28
-networkx~=2.8
-matplotlib~=3.5.1
-igraph~=0.9.9
-pycairo~=1.21.0
-setuptools~=61.2.0
\ No newline at end of file
+joblib>=1.0
\ No newline at end of file

From 69ab735243b83b9df29031222f2fdf30b6da412a Mon Sep 17 00:00:00 2001
From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com>
Date: Thu, 5 May 2022 15:47:40 +0200
Subject: [PATCH 6/8] Update hdbscan/tests/test_hdbscan.py

Co-authored-by: Isaac Virshup <ivirshup@gmail.com>
---
 hdbscan/tests/test_hdbscan.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py
index 46254703..7481b5e9 100644
--- a/hdbscan/tests/test_hdbscan.py
+++ b/hdbscan/tests/test_hdbscan.py
@@ -655,16 +655,17 @@ def test_hdbscan_graph():
     D.eliminate_zeros()
 
     # create cluster labels using precomputed metric
-    clusterer = HDBSCAN(metric="precomputed").fit(D)
-    labels_distance_matrix = clusterer.labels_
+    dist_clusterer = HDBSCAN(metric="precomputed").fit(D)
+    labels_distance_matrix = dist_clusterer.labels_
 
     # create a graph from the distance matrix and transform the graph to a csr adjacency matrix
-    graph = nx.from_numpy_matrix(D.toarray())
-    adjacency_matrix = nx.adjacency_matrix(graph)
+    from hdbscan._hdbscan_reachability import sparse_mutual_reachability
+
+    graph = sparse_mutual_rechablility(D.tolil())
 
     # create cluster labels using the graph metric
-    clusterer = HDBSCAN(metric="graph").fit(adjacency_matrix)
-    labels_hdbscan_graph = clusterer.labels_
+    graph_clusterer = HDBSCAN(metric="graph").fit(graph)
+    labels_hdbscan_graph = graph_clusterer.labels_
 
     assert sklearn.metrics.accuracy_score(labels_distance_matrix, labels_hdbscan_graph) == 1
 

From 643b1f36a1c1e442fe0e071d5e78ffc3f3ac6a85 Mon Sep 17 00:00:00 2001
From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com>
Date: Thu, 5 May 2022 16:02:59 +0200
Subject: [PATCH 7/8] Integrate comments from ivirshup

Updated the hdbscan_ function for metric="graph" and test_hdbscan by integrating the comments made by ivirshup
---
 hdbscan/hdbscan_.py           | 75 +++++++++++------------------------
 hdbscan/tests/test_hdbscan.py |  2 +-
 2 files changed, 25 insertions(+), 52 deletions(-)

diff --git a/hdbscan/hdbscan_.py b/hdbscan/hdbscan_.py
index 8accef86..8624b5f9 100644
--- a/hdbscan/hdbscan_.py
+++ b/hdbscan/hdbscan_.py
@@ -97,23 +97,9 @@ def _hdbscan_generic(
         # TODO: Check if copying is necessary
         distance_matrix = X.copy()
     elif metric == "graph":
-        # takes the graph csr matrix and converts it directly into a min_span_tree
-
-        # X should be the adjacency of the graph in csr sparse format
-        adjacency_matrix = X
-
-        # run the distance matrix function with metric "graph" creating a cs min spanning tree
-        return _hdbscan_sparse_distance_matrix(
-            adjacency_matrix,
-            min_samples,
-            alpha,
-            "graph",
-            p,
-            leaf_size,
-            gen_min_span_tree,
-            **kwargs
-        )
+        assert issparse(X), f"Graphs must be passed as sparse arrays, was a {type(X)}."
 
+        distance_matrix = X.copy()
     else:
         distance_matrix = pairwise_distances(X, metric=metric, **kwargs)
 
@@ -181,8 +167,11 @@ def _hdbscan_sparse_distance_matrix(
 ):
     assert issparse(X)
 
-    # if the metric is not graph, build a min spanning tree from the sparse matrix
-    if metric != "graph":
+    # if the metric is not graph, compute mutual_reachability of distance matrix
+    if metric == "graph":
+        mutual_reachability_ = X.tocsr()
+
+    else:
         # Check for connected component on X
         if csgraph.connected_components(X, directed=False, return_labels=False) > 1:
             raise ValueError(
@@ -201,41 +190,25 @@ def _hdbscan_sparse_distance_matrix(
         mutual_reachability_ = sparse_mutual_reachability(
             lil_matrix, min_points=min_samples, max_dist=max_dist, alpha=alpha
         )
-        # Check connected component on mutual reachability
-        # If more than one component, it means that even if the distance matrix X
-        # has one component, there exists with less than `min_samples` neighbors
-        if (
-                csgraph.connected_components(
-                    mutual_reachability_, directed=False, return_labels=False
-                )
-                > 1
-        ):
-            raise ValueError(
-                (
-                    "There exists points with less than %s neighbors. "
-                    "Ensure your distance matrix has non zeros values for "
-                    "at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), "
-                    "or specify a `max_dist` to use when distances are missing."
-                )
-                % (min_samples, min_samples)
-            )
 
-    # otherwise convert the csr adjacency matrix from the graph into a minimum spanning tree
-    else:
-        # check components of the graph
-        if (
-                csgraph.connected_components(X)[0]
-                > 1
-        ):
-            raise ValueError(
-                (
-                    "The passed graph has more than on component. \n"
-                    "Run hdbscan on each component."
-                )
+    # Check connected component on mutual reachability
+    # If more than one component, it means that even if the distance matrix X
+    # has one component, there exists with less than `min_samples` neighbors
+    if (
+            csgraph.connected_components(
+                mutual_reachability_, directed=False, return_labels=False
             )
-        # if one component set the mutual_reachability_ to the csr from the graph
-        else:
-            mutual_reachability_ = X
+            > 1
+    ):
+        raise ValueError(
+            (
+                "There exists points with less than %s neighbors. "
+                "Ensure your distance matrix (or graph for metric= `graph`) has non zeros values for "
+                "at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), "
+                "or specify a `max_dist` to use when distances are missing."
+            )
+            % (min_samples, min_samples)
+        )
 
     # Compute the minimum spanning tree for the sparse graph
     sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability_)
diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py
index 7481b5e9..7d5a88ae 100644
--- a/hdbscan/tests/test_hdbscan.py
+++ b/hdbscan/tests/test_hdbscan.py
@@ -661,7 +661,7 @@ def test_hdbscan_graph():
     # create a graph from the distance matrix and transform the graph to a csr adjacency matrix
     from hdbscan._hdbscan_reachability import sparse_mutual_reachability
 
-    graph = sparse_mutual_rechablility(D.tolil())
+    graph = sparse_mutual_reachability(D.tolil())
 
     # create cluster labels using the graph metric
     graph_clusterer = HDBSCAN(metric="graph").fit(graph)

From c50f6042e8ae221983c9e5bc8268387c50c01413 Mon Sep 17 00:00:00 2001
From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com>
Date: Tue, 10 May 2022 12:55:07 +0200
Subject: [PATCH 8/8] Remove networkx from test_hdbscan

Removed the networkx as nx import from the test_hdbscan file to avoid test failures, as it wasn't used.
---
 hdbscan/tests/test_hdbscan.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py
index 7d5a88ae..411c4c22 100644
--- a/hdbscan/tests/test_hdbscan.py
+++ b/hdbscan/tests/test_hdbscan.py
@@ -3,7 +3,6 @@
 Shamelessly based on (i.e. ripped off from) the DBSCAN test code
 """
 import numpy as np
-import networkx as nx
 import sklearn.metrics
 from scipy.spatial import distance
 from scipy import sparse