From d597882a3644856357d5cf6183af75308a31240e Mon Sep 17 00:00:00 2001 From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com> Date: Wed, 20 Apr 2022 12:14:59 +0200 Subject: [PATCH 1/8] Integrate-graph-objects into hdbscan_.py Create a new option for metrics called "graph", which takes graph objects as a csr adjacency matrix and runs the HDBSCAN function on the csgraph min_span_tree of the given graph. Example plot file (plot_hdbscan_graph.py) shows the working of the function and displays the example plots of the graphs. --- examples/plot_hdbscan_graph.py | 147 +++++++++++++++++++++++++++++++++ hdbscan/hdbscan_.py | 42 ++++++++++ 2 files changed, 189 insertions(+) create mode 100644 examples/plot_hdbscan_graph.py diff --git a/examples/plot_hdbscan_graph.py b/examples/plot_hdbscan_graph.py new file mode 100644 index 00000000..eac762af --- /dev/null +++ b/examples/plot_hdbscan_graph.py @@ -0,0 +1,147 @@ +import numpy +import numpy as np +import sklearn.metrics +from scipy import sparse +import igraph +import networkx as nx +import time +from hdbscan import ( + HDBSCAN, + hdbscan, + validity_index, + approximate_predict, + approximate_predict_scores, + membership_vector, + all_points_membership_vectors, +) + + +def create_distance_matrix(graph): + """ + Creates a distance matrix from the given graph using the igraph shortest path algorithm. + :param graph: An igraph graph object. + :return: Scipy csr matrix based on the graph. 
+ """ + + # create a distance matrix based of the graph + # create variables + path_weight, vertex_from_list, vertex_to_list, vertex_from = [], [], [], 0 + + for vertex in graph.vs: + list_edges_shortest_path = graph.get_shortest_paths(vertex, to=None, weights="weight", mode='out', + output="epath") + vertex_to = 0 + + for edge_list in list_edges_shortest_path: + if edge_list: + vertex_from_list.append(vertex_from) + vertex_to_list.append(vertex_to) + path_weight.append(sum(graph.es.select(edge_list)["weight"])) + else: + vertex_from_list.append(vertex_from) + vertex_to_list.append(vertex_to) + path_weight.append(0) + + vertex_to += 1 + vertex_from += 1 + + distance_matrix = sparse.csr_matrix((path_weight, (vertex_from_list, vertex_to_list))) + + return distance_matrix + + +def hdbscan_graph(): + """ + Creates a weighted stochastic_block_model graph to compare the newly created graph function of HDBSCAN + to the precomputed metric using a distance matrix created for the graph. + """ + # measure time + start_build_graph = time.time() + + # set parameters graph and edges + number_communities = 4 + edge_weight_in_community = 0.1 + edge_weight_out_community = 1 + + # create graph + community_sizes = np.random.randint(low=30, high=70, size=number_communities) + matrix_prob = np.random.rand(number_communities, number_communities) + matrix_prob = (np.tril(matrix_prob) + np.tril(matrix_prob, -1).T) * 0.5 + numpy.fill_diagonal(matrix_prob, 0.7) + sbm_graph = nx.stochastic_block_model(community_sizes, matrix_prob, seed=0) + + # convert to igraph object + graph = igraph.Graph(n=sbm_graph.number_of_nodes(), directed=False) + graph.add_edges(sbm_graph.edges()) + + # check for double edges and loops and delete those + graph.simplify() + graph.vs.select(_degree=0).delete() + + # run community detection to assign edge weights + community_detection = graph.community_multilevel() + + # add edge weights + weight_list = [] + for edge in graph.es: + vertex_1 = edge.source + vertex_2 = 
edge.target + edge_weight_added = False + for subgraph in community_detection: + if vertex_1 in subgraph and vertex_2 in subgraph: + weight_list.append(edge_weight_in_community) + edge_weight_added = True + if not edge_weight_added: + weight_list.append(edge_weight_out_community) + + graph.es["weight"] = weight_list + + print("Graph created:", time.time() - start_build_graph) + + # run HDBSCAN on graph distance matrix + start_distance_matrix = time.time() + + # create a distance matrix from the graph + distance_matrix = create_distance_matrix(graph) + + # run HDBSCAN on the created distance matrix + clusterer = HDBSCAN(metric="precomputed").fit(distance_matrix) + labels_distance_matrix = clusterer.labels_ + + # measure time + print("HDBSCAN distance matrix:", time.time() - start_distance_matrix) + + # plot graph clustering using iGraph + graph.vs["label_distance_matrix"] = labels_distance_matrix + vclustering = igraph.clustering.VertexClustering.FromAttribute(graph, "label_distance_matrix") + igraph.plot(vclustering) + + """ + Convert the iGraph graph into a csr sparse matrix, which the modified HDBSCAN function accepts and + transforms into a scipy csgraph. 
+ """ + # run HDBSCAN using the graph metric + start_hdbscan_graph = time.time() + + # create adjacency matrix from the graph, csr sparse matrix format + adjacency = graph.get_adjacency_sparse(attribute="weight") + + clusterer = HDBSCAN(metric="graph").fit(adjacency) + labels_hdbscan_graph = clusterer.labels_ + + print("HDBSCAN graph:", time.time() - start_hdbscan_graph) + + # plot clustering labels using iGraph + graph.vs["label_hdbscan_graph"] = labels_hdbscan_graph + vclustering = igraph.clustering.VertexClustering.FromAttribute(graph, "label_hdbscan_graph") + igraph.plot(vclustering) + + # print the AMI and ARI for the labels + print("AMI:", sklearn.metrics.adjusted_mutual_info_score(labels_distance_matrix, labels_hdbscan_graph)) + print("ARI:", sklearn.metrics.adjusted_rand_score(labels_distance_matrix, labels_hdbscan_graph)) + + +""" +run the example function displaying the graph feature of HDBSCAN +""" +hdbscan_graph() diff --git a/hdbscan/hdbscan_.py b/hdbscan/hdbscan_.py index 31f60470..b6e6175b 100644 --- a/hdbscan/hdbscan_.py +++ b/hdbscan/hdbscan_.py @@ -96,6 +96,10 @@ def _hdbscan_generic( # matrix to indicate missing distance information. # TODO: Check if copying is necessary distance_matrix = X.copy() + elif metric == "graph": + # takes the graph csr matrix and converts it directly into a min_span_tree + return _hdbscan_graph_to_tree(X, gen_min_span_tree) + else: distance_matrix = pairwise_distances(X, metric=metric, **kwargs) @@ -151,6 +155,44 @@ def _hdbscan_generic( return single_linkage_tree, result_min_span_tree +def _hdbscan_graph_to_tree( + graph, + gen_min_span_tree +): + # Check the graph for multiple components. + # If more than one component it means that there exists points + # with less than `min_samples` neighbors + if ( + csgraph.connected_components(graph)[0] + > 1 + ): + raise ValueError( + ( + "The passed graph has more than on component." + "Run hdbscan on each component." 
+ ) + ) + + # graph as an csr distance matrix object + sparse_min_spanning_tree = csgraph.minimum_spanning_tree(graph) + + # Convert the graph to scipy cluster array format + nonzeros = sparse_min_spanning_tree.nonzero() + nonzero_vals = sparse_min_spanning_tree[nonzeros] + min_spanning_tree = np.vstack(nonzeros + (nonzero_vals,)).T + + # Sort edges of the min_spanning_tree by weight + min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :][0] + + # Convert edge list into standard hierarchical clustering format + single_linkage_tree = label(min_spanning_tree) + + if gen_min_span_tree: + return single_linkage_tree, min_spanning_tree + else: + return single_linkage_tree, None + + def _hdbscan_sparse_distance_matrix( X, min_samples=5, From ec7f7333d37122326352e5bf54a71a670e9d4d1f Mon Sep 17 00:00:00 2001 From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com> Date: Mon, 25 Apr 2022 13:58:26 +0200 Subject: [PATCH 2/8] Graph metric changes and tests added Modified the _hdbscan_sparse_distance_matrix function to integrate the graph metric into the hdbscan code. Started working on the test for the graph metric. 
--- hdbscan/hdbscan_.py | 138 +++++++++++++++++----------------- hdbscan/tests/test_hdbscan.py | 47 +++++++++--- 2 files changed, 104 insertions(+), 81 deletions(-) diff --git a/hdbscan/hdbscan_.py b/hdbscan/hdbscan_.py index b6e6175b..8accef86 100644 --- a/hdbscan/hdbscan_.py +++ b/hdbscan/hdbscan_.py @@ -98,7 +98,21 @@ def _hdbscan_generic( distance_matrix = X.copy() elif metric == "graph": # takes the graph csr matrix and converts it directly into a min_span_tree - return _hdbscan_graph_to_tree(X, gen_min_span_tree) + + # X should be the adjacency of the graph in csr sparse format + adjacency_matrix = X + + # run the distance matrix function with metric "graph" creating a cs min spanning tree + return _hdbscan_sparse_distance_matrix( + adjacency_matrix, + min_samples, + alpha, + "graph", + p, + leaf_size, + gen_min_span_tree, + **kwargs + ) else: distance_matrix = pairwise_distances(X, metric=metric, **kwargs) @@ -155,44 +169,6 @@ def _hdbscan_generic( return single_linkage_tree, result_min_span_tree -def _hdbscan_graph_to_tree( - graph, - gen_min_span_tree -): - # Check the graph for multiple components. - # If more than one component it means that there exists points - # with less than `min_samples` neighbors - if ( - csgraph.connected_components(graph)[0] - > 1 - ): - raise ValueError( - ( - "The passed graph has more than on component." - "Run hdbscan on each component." 
- ) - ) - - # graph as an csr distance matrix object - sparse_min_spanning_tree = csgraph.minimum_spanning_tree(graph) - - # Convert the graph to scipy cluster array format - nonzeros = sparse_min_spanning_tree.nonzero() - nonzero_vals = sparse_min_spanning_tree[nonzeros] - min_spanning_tree = np.vstack(nonzeros + (nonzero_vals,)).T - - # Sort edges of the min_spanning_tree by weight - min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :][0] - - # Convert edge list into standard hierarchical clustering format - single_linkage_tree = label(min_spanning_tree) - - if gen_min_span_tree: - return single_linkage_tree, min_spanning_tree - else: - return single_linkage_tree, None - - def _hdbscan_sparse_distance_matrix( X, min_samples=5, @@ -204,42 +180,62 @@ def _hdbscan_sparse_distance_matrix( **kwargs ): assert issparse(X) - # Check for connected component on X - if csgraph.connected_components(X, directed=False, return_labels=False) > 1: - raise ValueError( - "Sparse distance matrix has multiple connected " - "components!\nThat is, there exist groups of points " - "that are completely disjoint -- there are no distance " - "relations connecting them\n" - "Run hdbscan on each component." - ) - lil_matrix = X.tolil() + # if the metric is not graph, build a min spanning tree from the sparse matrix + if metric != "graph": + # Check for connected component on X + if csgraph.connected_components(X, directed=False, return_labels=False) > 1: + raise ValueError( + "Sparse distance matrix has multiple connected " + "components!\nThat is, there exist groups of points " + "that are completely disjoint -- there are no distance " + "relations connecting them\n" + "Run hdbscan on each component." 
+ ) - # Compute sparse mutual reachability graph - # if max_dist > 0, max distance to use when the reachability is infinite - max_dist = kwargs.get("max_dist", 0.0) - mutual_reachability_ = sparse_mutual_reachability( - lil_matrix, min_points=min_samples, max_dist=max_dist, alpha=alpha - ) - # Check connected component on mutual reachability - # If more than one component, it means that even if the distance matrix X - # has one component, there exists with less than `min_samples` neighbors - if ( - csgraph.connected_components( - mutual_reachability_, directed=False, return_labels=False + lil_matrix = X.tolil() + + # Compute sparse mutual reachability graph + # if max_dist > 0, max distance to use when the reachability is infinite + max_dist = kwargs.get("max_dist", 0.0) + mutual_reachability_ = sparse_mutual_reachability( + lil_matrix, min_points=min_samples, max_dist=max_dist, alpha=alpha ) - > 1 - ): - raise ValueError( - ( - "There exists points with less than %s neighbors. " - "Ensure your distance matrix has non zeros values for " - "at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), " - "or specify a `max_dist` to use when distances are missing." + # Check connected component on mutual reachability + # If more than one component, it means that even if the distance matrix X + # has one component, there exists with less than `min_samples` neighbors + if ( + csgraph.connected_components( + mutual_reachability_, directed=False, return_labels=False + ) + > 1 + ): + raise ValueError( + ( + "There exists points with less than %s neighbors. " + "Ensure your distance matrix has non zeros values for " + "at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), " + "or specify a `max_dist` to use when distances are missing." 
+ ) + % (min_samples, min_samples) ) - % (min_samples, min_samples) - ) + + # otherwise convert the csr adjacency matrix from the graph into a minimum spanning tree + else: + # check components of the graph + if ( + csgraph.connected_components(X)[0] + > 1 + ): + raise ValueError( + ( + "The passed graph has more than on component. \n" + "Run hdbscan on each component." + ) + ) + # if one component set the mutual_reachability_ to the csr from the graph + else: + mutual_reachability_ = X # Compute the minimum spanning tree for the sparse graph sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability_) diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py index 9e3faa34..433d2c65 100644 --- a/hdbscan/tests/test_hdbscan.py +++ b/hdbscan/tests/test_hdbscan.py @@ -2,7 +2,10 @@ Tests for HDBSCAN clustering algorithm Shamelessly based on (i.e. ripped off from) the DBSCAN test code """ +import matplotlib.pyplot as plt import numpy as np +import networkx as nx +import sklearn.metrics from scipy.spatial import distance from scipy import sparse from scipy import stats @@ -251,11 +254,13 @@ def test_hdbscan_generic(): n_clusters_2 = len(set(labels)) - int(-1 in labels) assert n_clusters_2 == n_clusters + def test_hdbscan_dbscan_clustering(): clusterer = HDBSCAN().fit(X) labels = clusterer.dbscan_clustering(0.3) n_clusters_1 = len(set(labels)) - int(-1 in labels) - assert(n_clusters == n_clusters_1) + assert (n_clusters == n_clusters_1) + def test_hdbscan_high_dimensional(): H, y = make_blobs(n_samples=50, random_state=0, n_features=64) @@ -267,8 +272,8 @@ def test_hdbscan_high_dimensional(): labels = ( HDBSCAN(algorithm="best", metric="seuclidean", V=np.ones(H.shape[1])) - .fit(H) - .labels_ + .fit(H) + .labels_ ) n_clusters_2 = len(set(labels)) - int(-1 in labels) assert n_clusters_2 == n_clusters @@ -330,7 +335,6 @@ def test_hdbscan_input_lists(): def test_hdbscan_boruvka_kdtree_matches(): - data = generate_noisy_data() labels_prims, 
p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic") @@ -351,7 +355,6 @@ def test_hdbscan_boruvka_kdtree_matches(): def test_hdbscan_boruvka_balltree_matches(): - data = generate_noisy_data() labels_prims, p, persist, ctree, ltree, mtree = hdbscan(data, algorithm="generic") @@ -414,7 +417,6 @@ def test_min_span_tree_plot(): def test_tree_numpy_output_formats(): - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) clusterer.single_linkage_tree_.to_numpy() @@ -423,7 +425,6 @@ def test_tree_numpy_output_formats(): def test_tree_pandas_output_formats(): - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) if_pandas(clusterer.condensed_tree_.to_pandas)() if_pandas(clusterer.single_linkage_tree_.to_pandas)() @@ -431,7 +432,6 @@ def test_tree_pandas_output_formats(): def test_tree_networkx_output_formats(): - clusterer = HDBSCAN(gen_min_span_tree=True).fit(X) if_networkx(clusterer.condensed_tree_.to_networkx)() if_networkx(clusterer.single_linkage_tree_.to_networkx)() @@ -576,7 +576,6 @@ def test_hdbscan_badargs(): def test_hdbscan_sparse(): - sparse_X = sparse.csr_matrix(X) labels = HDBSCAN().fit(sparse_X).labels_ @@ -585,7 +584,6 @@ def test_hdbscan_sparse(): def test_hdbscan_caching(): - cachedir = mkdtemp() labels1 = HDBSCAN(memory=cachedir, min_samples=5).fit(X).labels_ labels2 = HDBSCAN(memory=cachedir, min_samples=5, min_cluster_size=6).fit(X).labels_ @@ -646,6 +644,35 @@ def test_hdbscan_is_sklearn_estimator(): check_estimator(HDBSCAN) +def test_hdbscan_graph(): + # create graph + graph = nx.barbell_graph(5, 5) + communities_generator = nx.community.louvain_partitions(graph) + next_level_communities = next(communities_generator) + + print(sorted(map(sorted, next_level_communities))) + + + # create the adjacency matrix + adjacency = nx.adjacency_matrix(graph) + + # create the distance matrix + distance_matrix = nx.floyd_warshall_numpy(graph) + + # run HDBSCAN on the created distance matrix using the "precomputed" metric + clusterer = 
HDBSCAN(metric="precomputed").fit(sparse.csr_matrix(distance_matrix)) + labels_distance_matrix = clusterer.labels_ + + # run HDBSCAN on graph using the csr adjacency matrix with the "graph" metric + clusterer = HDBSCAN(metric="graph").fit(adjacency) + labels_hdbscan_graph = clusterer.labels_ + + print(labels_hdbscan_graph) + print(labels_distance_matrix) + + assert sklearn.metrics.adjusted_mutual_info_score(labels_distance_matrix, labels_hdbscan_graph) == 1 + + # Probably not applicable now # # def test_dbscan_sparse(): # def test_dbscan_balltree(): From fd9bad5628b3e093842b80f0372ed8ad347e3b61 Mon Sep 17 00:00:00 2001 From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:49:39 +0200 Subject: [PATCH 3/8] Create test for hdbscan graph metric Use the test_hdbscan_sparse_distance_matrix test to create the test_hdbscan_graph function. Added the requirements for the metric to the requirements.txt (igraph and networkx). --- examples/plot_hdbscan_graph.py | 12 ++---------- hdbscan/tests/test_hdbscan.py | 35 ++++++++++++++++------------------ requirements.txt | 7 +++++++ 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/examples/plot_hdbscan_graph.py b/examples/plot_hdbscan_graph.py index eac762af..11e5a6ce 100644 --- a/examples/plot_hdbscan_graph.py +++ b/examples/plot_hdbscan_graph.py @@ -5,15 +5,7 @@ import igraph import networkx as nx import time -from hdbscan import ( - HDBSCAN, - hdbscan, - validity_index, - approximate_predict, - approximate_predict_scores, - membership_vector, - all_points_membership_vectors, -) +from hdbscan import HDBSCAN def create_distance_matrix(graph): @@ -59,7 +51,7 @@ def hdbscan_graph(): start_build_graph = time.time() # set parameters graph and edges - number_communities = 4 + number_communities = np.random.randint(3, 20, 1)[0] edge_weight_in_community = 0.1 edge_weight_out_community = 1 diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py index 433d2c65..5e86c5dc 100644 
--- a/hdbscan/tests/test_hdbscan.py +++ b/hdbscan/tests/test_hdbscan.py @@ -2,7 +2,6 @@ Tests for HDBSCAN clustering algorithm Shamelessly based on (i.e. ripped off from) the DBSCAN test code """ -import matplotlib.pyplot as plt import numpy as np import networkx as nx import sklearn.metrics @@ -645,32 +644,30 @@ def test_hdbscan_is_sklearn_estimator(): def test_hdbscan_graph(): - # create graph - graph = nx.barbell_graph(5, 5) - communities_generator = nx.community.louvain_partitions(graph) - next_level_communities = next(communities_generator) - - print(sorted(map(sorted, next_level_communities))) - + # create a distance matrix, see test_hdbscan_distance_matrix + D = distance.squareform(distance.pdist(X)) + D /= np.max(D) - # create the adjacency matrix - adjacency = nx.adjacency_matrix(graph) + threshold = stats.scoreatpercentile(D.flatten(), 50) - # create the distance matrix - distance_matrix = nx.floyd_warshall_numpy(graph) + D[D >= threshold] = 0.0 + D = sparse.csr_matrix(D) + D.eliminate_zeros() - # run HDBSCAN on the created distance matrix using the "precomputed" metric - clusterer = HDBSCAN(metric="precomputed").fit(sparse.csr_matrix(distance_matrix)) + # create cluster labels using precomputed metric + clusterer = HDBSCAN(metric="precomputed").fit(D) labels_distance_matrix = clusterer.labels_ - # run HDBSCAN on graph using the csr adjacency matrix with the "graph" metric - clusterer = HDBSCAN(metric="graph").fit(adjacency) + # create a graph from the distance matrix and transform the graph to a csr adjacency matrix + graph = nx.from_numpy_matrix(D) + adjacency_matrix = nx.adjacency_matrix(graph) + + # create cluster labels using the graph metric + clusterer = HDBSCAN(metric="graph").fit(adjacency_matrix) labels_hdbscan_graph = clusterer.labels_ - print(labels_hdbscan_graph) - print(labels_distance_matrix) + assert sklearn.metrics.accuracy_score(labels_distance_matrix, labels_hdbscan_graph) == 1 - assert 
sklearn.metrics.adjusted_mutual_info_score(labels_distance_matrix, labels_hdbscan_graph) == 1 # Probably not applicable now # diff --git a/requirements.txt b/requirements.txt index 487cfe7b..31dff836 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,10 @@ numpy>=1.20 scipy>= 1.0 scikit-learn>=0.20 joblib>=1.0 + +pytest~=7.1.1 +hdbscan~=0.8.28 +networkx~=2.8 +matplotlib~=3.5.1 +igraph~=0.9.9 +setuptools~=61.2.0 \ No newline at end of file From 3f6ad3179cbacabacc9bc14b105ef3ac829ab952 Mon Sep 17 00:00:00 2001 From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:08:47 +0200 Subject: [PATCH 4/8] Clean up code Added commentary and proving the test. --- examples/plot_hdbscan_graph.py | 5 +++-- hdbscan/tests/test_hdbscan.py | 2 +- requirements.txt | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/plot_hdbscan_graph.py b/examples/plot_hdbscan_graph.py index 11e5a6ce..cc55dd86 100644 --- a/examples/plot_hdbscan_graph.py +++ b/examples/plot_hdbscan_graph.py @@ -15,10 +15,10 @@ def create_distance_matrix(graph): :return: Scipy csr matrix based on the graph. 
""" - # create a distance matrix based of the graph # create variables path_weight, vertex_from_list, vertex_to_list, vertex_from = [], [], [], 0 + # create a distance matrix based of the graph for vertex in graph.vs: list_edges_shortest_path = graph.get_shortest_paths(vertex, to=None, weights="weight", mode='out', output="epath") @@ -37,6 +37,7 @@ def create_distance_matrix(graph): vertex_to += 1 vertex_from += 1 + # transform lists into a csr matrix distance_matrix = sparse.csr_matrix((path_weight, (vertex_from_list, vertex_to_list))) return distance_matrix @@ -70,7 +71,7 @@ def hdbscan_graph(): graph.simplify() graph.vs.select(_degree=0).delete() - # run community detection to assign edge weights + # run community detection to assign edge weights, the function won't works on unweighted graphs community_detection = graph.community_multilevel() # add edge weights diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py index 5e86c5dc..46254703 100644 --- a/hdbscan/tests/test_hdbscan.py +++ b/hdbscan/tests/test_hdbscan.py @@ -659,7 +659,7 @@ def test_hdbscan_graph(): labels_distance_matrix = clusterer.labels_ # create a graph from the distance matrix and transform the graph to a csr adjacency matrix - graph = nx.from_numpy_matrix(D) + graph = nx.from_numpy_matrix(D.toarray()) adjacency_matrix = nx.adjacency_matrix(graph) # create cluster labels using the graph metric diff --git a/requirements.txt b/requirements.txt index 31dff836..5d6a5ddd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,5 @@ hdbscan~=0.8.28 networkx~=2.8 matplotlib~=3.5.1 igraph~=0.9.9 +pycairo~=1.21.0 setuptools~=61.2.0 \ No newline at end of file From 6da5a87c6f018785402de4294e359e57ccf9a148 Mon Sep 17 00:00:00 2001 From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:48:40 +0200 Subject: [PATCH 5/8] Reset requirements Set requirements back to the latest HDBSCAN version. 
--- requirements.txt | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5d6a5ddd..19ca08da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,12 +2,4 @@ cython>=0.27 numpy>=1.20 scipy>= 1.0 scikit-learn>=0.20 -joblib>=1.0 - -pytest~=7.1.1 -hdbscan~=0.8.28 -networkx~=2.8 -matplotlib~=3.5.1 -igraph~=0.9.9 -pycairo~=1.21.0 -setuptools~=61.2.0 \ No newline at end of file +joblib>=1.0 \ No newline at end of file From 69ab735243b83b9df29031222f2fdf30b6da412a Mon Sep 17 00:00:00 2001 From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com> Date: Thu, 5 May 2022 15:47:40 +0200 Subject: [PATCH 6/8] Update hdbscan/tests/test_hdbscan.py Co-authored-by: Isaac Virshup --- hdbscan/tests/test_hdbscan.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py index 46254703..7481b5e9 100644 --- a/hdbscan/tests/test_hdbscan.py +++ b/hdbscan/tests/test_hdbscan.py @@ -655,16 +655,17 @@ def test_hdbscan_graph(): D.eliminate_zeros() # create cluster labels using precomputed metric - clusterer = HDBSCAN(metric="precomputed").fit(D) - labels_distance_matrix = clusterer.labels_ + dist_clusterer = HDBSCAN(metric="precomputed").fit(D) + labels_distance_matrix = dist_clusterer.labels_ # create a graph from the distance matrix and transform the graph to a csr adjacency matrix - graph = nx.from_numpy_matrix(D.toarray()) - adjacency_matrix = nx.adjacency_matrix(graph) + from hdbscan._hdbscan_reachability import sparse_mutual_reachability + + graph = sparse_mutual_rechablility(D.tolil()) # create cluster labels using the graph metric - clusterer = HDBSCAN(metric="graph").fit(adjacency_matrix) - labels_hdbscan_graph = clusterer.labels_ + graph_clusterer = HDBSCAN(metric="graph").fit(graph) + labels_hdbscan_graph = graph_clusterer.labels_ assert sklearn.metrics.accuracy_score(labels_distance_matrix, labels_hdbscan_graph) == 1 From 
643b1f36a1c1e442fe0e071d5e78ffc3f3ac6a85 Mon Sep 17 00:00:00 2001 From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com> Date: Thu, 5 May 2022 16:02:59 +0200 Subject: [PATCH 7/8] Integrate comments from ivirshup Updated the hdbscan_ function for metric="graph" and test_hdbscan by integrating the comments made by ivirshup --- hdbscan/hdbscan_.py | 75 +++++++++++------------------------ hdbscan/tests/test_hdbscan.py | 2 +- 2 files changed, 25 insertions(+), 52 deletions(-) diff --git a/hdbscan/hdbscan_.py b/hdbscan/hdbscan_.py index 8accef86..8624b5f9 100644 --- a/hdbscan/hdbscan_.py +++ b/hdbscan/hdbscan_.py @@ -97,23 +97,9 @@ def _hdbscan_generic( # TODO: Check if copying is necessary distance_matrix = X.copy() elif metric == "graph": - # takes the graph csr matrix and converts it directly into a min_span_tree - - # X should be the adjacency of the graph in csr sparse format - adjacency_matrix = X - - # run the distance matrix function with metric "graph" creating a cs min spanning tree - return _hdbscan_sparse_distance_matrix( - adjacency_matrix, - min_samples, - alpha, - "graph", - p, - leaf_size, - gen_min_span_tree, - **kwargs - ) + assert issparse(X), f"Graphs must be passed as sparse arrays, was a {type(X)}." 
+ distance_matrix = X.copy() else: distance_matrix = pairwise_distances(X, metric=metric, **kwargs) @@ -181,8 +167,11 @@ def _hdbscan_sparse_distance_matrix( ): assert issparse(X) - # if the metric is not graph, build a min spanning tree from the sparse matrix - if metric != "graph": + # if the metric is not graph, compute mutual_reachability of distance matrix + if metric == "graph": + mutual_reachability_ = X.tocsr() + + else: # Check for connected component on X if csgraph.connected_components(X, directed=False, return_labels=False) > 1: raise ValueError( @@ -201,41 +190,25 @@ def _hdbscan_sparse_distance_matrix( mutual_reachability_ = sparse_mutual_reachability( lil_matrix, min_points=min_samples, max_dist=max_dist, alpha=alpha ) - # Check connected component on mutual reachability - # If more than one component, it means that even if the distance matrix X - # has one component, there exists with less than `min_samples` neighbors - if ( - csgraph.connected_components( - mutual_reachability_, directed=False, return_labels=False - ) - > 1 - ): - raise ValueError( - ( - "There exists points with less than %s neighbors. " - "Ensure your distance matrix has non zeros values for " - "at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), " - "or specify a `max_dist` to use when distances are missing." - ) - % (min_samples, min_samples) - ) - # otherwise convert the csr adjacency matrix from the graph into a minimum spanning tree - else: - # check components of the graph - if ( - csgraph.connected_components(X)[0] - > 1 - ): - raise ValueError( - ( - "The passed graph has more than on component. \n" - "Run hdbscan on each component." 
- ) + # Check connected component on mutual reachability + # If more than one component, it means that even if the distance matrix X + # has one component, there exists with less than `min_samples` neighbors + if ( + csgraph.connected_components( + mutual_reachability_, directed=False, return_labels=False ) - # if one component set the mutual_reachability_ to the csr from the graph - else: - mutual_reachability_ = X + > 1 + ): + raise ValueError( + ( + "There exists points with less than %s neighbors. " + "Ensure your distance matrix (or graph for metric= `graph`) has non zeros values for " + "at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), " + "or specify a `max_dist` to use when distances are missing." + ) + % (min_samples, min_samples) + ) # Compute the minimum spanning tree for the sparse graph sparse_min_spanning_tree = csgraph.minimum_spanning_tree(mutual_reachability_) diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py index 7481b5e9..7d5a88ae 100644 --- a/hdbscan/tests/test_hdbscan.py +++ b/hdbscan/tests/test_hdbscan.py @@ -661,7 +661,7 @@ def test_hdbscan_graph(): # create a graph from the distance matrix and transform the graph to a csr adjacency matrix from hdbscan._hdbscan_reachability import sparse_mutual_reachability - graph = sparse_mutual_rechablility(D.tolil()) + graph = sparse_mutual_reachability(D.tolil()) # create cluster labels using the graph metric graph_clusterer = HDBSCAN(metric="graph").fit(graph) From c50f6042e8ae221983c9e5bc8268387c50c01413 Mon Sep 17 00:00:00 2001 From: JanRhoKa <92030143+JanRhoKa@users.noreply.github.com> Date: Tue, 10 May 2022 12:55:07 +0200 Subject: [PATCH 8/8] Remove networkx from test_hdbscan Removed the networkx as nx import from the test_hdbscan file to avoid test failures, as it wasn't used. 
--- hdbscan/tests/test_hdbscan.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py index 7d5a88ae..411c4c22 100644 --- a/hdbscan/tests/test_hdbscan.py +++ b/hdbscan/tests/test_hdbscan.py @@ -3,7 +3,6 @@ Shamelessly based on (i.e. ripped off from) the DBSCAN test code """ import numpy as np -import networkx as nx import sklearn.metrics from scipy.spatial import distance from scipy import sparse