Commit 3bec1a46 authored by Sascha Herzinger
Browse files

fixed bug that occurred when clusters had identical size

parent 8819e3bb
Pipeline #2254 failed with stage
in 1 minute and 6 seconds
......@@ -54,10 +54,8 @@ class ClusteringTask(AnalyticTask):
col_names, col_clusters = self._hclust(df.T, method,
metric, n_col_clusters)
return {
'row_names': row_names,
'col_names': col_names,
'row_cluster': row_clusters,
'col_cluster': col_clusters
'row_clusters': list(zip(row_names, row_clusters)),
'col_clusters': list(zip(col_names, col_clusters))
}
def _hclust(self, df: pd.DataFrame,
......@@ -70,7 +68,8 @@ class ClusteringTask(AnalyticTask):
cluster_count = Counter(cluster)
# sort elements by their cluster size
sorted_cluster = sorted(zip(names, cluster),
key=lambda x: cluster_count[x[1]], reverse=True)
key=lambda x: (cluster_count[x[1]], x[1]),
reverse=True)
names = [x[0] for x in sorted_cluster]
cluster = [x[1] for x in sorted_cluster]
# relabel cluster, with the biggest cluster being 0
......@@ -81,6 +80,7 @@ class ClusteringTask(AnalyticTask):
c += 1
relabeled_cluster.append(c)
cluster = relabeled_cluster
return names, cluster
def kmeans(self, df: pd.DataFrame, options: dict) -> dict:
......@@ -96,10 +96,8 @@ class ClusteringTask(AnalyticTask):
row_names, row_clusters = self._kmeans(df, n_row_centroids)
col_names, col_clusters = self._kmeans(df.T, n_col_centroids)
return {
'row_names': row_names,
'col_names': col_names,
'row_cluster': row_clusters,
'col_cluster': col_clusters
'row_clusters': list(zip(row_names, row_clusters)),
'col_clusters': list(zip(col_names, col_clusters))
}
def _kmeans(self, df: pd.DataFrame, n_centroids) -> Tuple[List, List]:
......@@ -109,7 +107,8 @@ class ClusteringTask(AnalyticTask):
cluster_count = Counter(cluster)
# sort elements by their cluster size
sorted_cluster = sorted(zip(names, cluster),
key=lambda x: cluster_count[x[1]], reverse=True)
key=lambda x: (cluster_count[x[1]], x[1]),
reverse=True)
names = [x[0] for x in sorted_cluster]
cluster = [x[1] for x in sorted_cluster]
# relabel cluster, with the biggest cluster being 0
......
......@@ -63,6 +63,31 @@ class TestClustering:
self.task.main(df=self.df, cluster_algo='hclust', options=options)
assert 'mandatory parameters' in e
def test_hclust_can_handle_identical_cluster_size(self):
    """Check hclust output when both column clusters have the same size.

    Columns 'A' and 'C' carry identical values, as do 'B' and 'D', and two
    column clusters are requested. The test expects the column entries in
    the order B, D, A, C with cluster labels 0, 0, 1, 1.
    """
    low_values = {'a': 5, 'b': 10}
    high_values = {'a': 500, 'b': 550}
    # Each column gets its own dict so no two columns share one object.
    df = {
        'A': dict(low_values),
        'B': dict(high_values),
        'C': dict(low_values),
        'D': dict(high_values),
    }
    options = dict(
        method='single',
        metric='euclidean',
        n_row_clusters=2,
        n_col_clusters=2,
    )
    result = self.task.main(df=df, cluster_algo='hclust', options=options)
    # Every entry of 'col_clusters' is indexed as (name, cluster label).
    ordered_names = [entry[0] for entry in result['col_clusters']]
    assert ['B', 'D', 'A', 'C'] == ordered_names
    ordered_labels = [entry[1] for entry in result['col_clusters']]
    assert [0, 0, 1, 1] == ordered_labels
def test_hclust_returns_valid_result(self):
options = {
'method': 'single',
......@@ -72,14 +97,12 @@ class TestClustering:
}
result = self.task.main(df=self.df,
cluster_algo='hclust', options=options)
assert 'row_names' in result
assert 'col_names' in result
assert 'row_cluster' in result
assert 'col_cluster' in result
assert ['a', 'c', 'b'] == result['row_names']
assert ['A', 'C', 'B'] == result['col_names']
assert [0, 0, 1] == result['row_cluster']
assert [0, 0, 1] == result['col_cluster']
assert 'row_clusters' in result
assert 'col_clusters' in result
assert ['a', 'c', 'b'] == [x[0] for x in result['row_clusters']]
assert ['A', 'C', 'B'] == [x[0] for x in result['col_clusters']]
assert [0, 0, 1] == [x[1] for x in result['col_clusters']]
assert [0, 0, 1] == [x[1] for x in result['col_clusters']]
def test_kmean_raises_with_invalid_param_1(self):
with pytest.raises(ValueError) as e:
......@@ -98,6 +121,28 @@ class TestClustering:
self.task.main(df=self.df, cluster_algo='kmeans', options=options)
assert 'mandatory parameters' in e
def test_kmeans_can_handle_identical_cluster_size(self):
    """Check kmeans output when both column clusters have the same size.

    Columns 'A' and 'C' carry identical values, as do 'B' and 'D', and two
    column centroids are requested. Only the cluster labels are checked;
    they are expected to be 0, 0, 1, 1 in the returned order.
    """
    low_values = {'a': 5, 'b': 10}
    high_values = {'a': 500, 'b': 550}
    # Each column gets its own dict so no two columns share one object.
    df = {
        'A': dict(low_values),
        'B': dict(high_values),
        'C': dict(low_values),
        'D': dict(high_values),
    }
    options = dict(n_row_centroids=2, n_col_centroids=2)
    result = self.task.main(df=df, cluster_algo='kmeans', options=options)
    # Every entry of 'col_clusters' is indexed as (name, cluster label).
    ordered_labels = [entry[1] for entry in result['col_clusters']]
    assert [0, 0, 1, 1] == ordered_labels
def test_kmean_returns_valid_result(self):
options = {
'n_row_centroids': 2,
......@@ -105,11 +150,9 @@ class TestClustering:
}
result = self.task.main(df=self.df,
cluster_algo='kmeans', options=options)
assert 'row_names' in result
assert 'col_names' in result
assert 'row_cluster' in result
assert 'col_cluster' in result
assert ['a', 'c', 'b'] == result['row_names']
assert ['A', 'C', 'B'] == result['col_names']
assert [0, 0, 1] == result['row_cluster']
assert [0, 0, 1] == result['col_cluster']
\ No newline at end of file
assert 'row_clusters' in result
assert 'col_clusters' in result
assert ['a', 'c', 'b'] == [x[0] for x in result['row_clusters']]
assert ['A', 'C', 'B'] == [x[0] for x in result['col_clusters']]
assert [0, 0, 1] == [x[1] for x in result['col_clusters']]
assert [0, 0, 1] == [x[1] for x in result['col_clusters']]
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment