Commit 8819e3bb authored by Sascha Herzinger
Browse files

Fixed a bug where rows and columns were mixed up

parent c9f6e3fc
Pipeline #2253 failed with stage
in 7 minutes and 48 seconds
......@@ -17,14 +17,14 @@ logger = logging.getLogger(__name__)
class ClusteringTask(AnalyticTask):
name = 'compute-clustering'
name = 'compute-cluster'
def main(self, df: str, cluster_algo: str,
def main(self, df: dict, cluster_algo: str,
options: dict) -> dict:
try:
df = pd.read_json(df)
df = pd.DataFrame.from_dict(df)
except Exception:
error = "Failed to parse string to data frame."
error = "Failed to parse input data frame."
logger.error(error)
raise ValueError(error)
# fill NAs with col medians so the clustering algorithms will work
......@@ -49,9 +49,9 @@ class ClusteringTask(AnalyticTask):
"perform a hierarchical clustering."
logger.error(error)
raise ValueError(error)
row_names, row_clusters = self._hclust(df.T, method,
row_names, row_clusters = self._hclust(df, method,
metric, n_row_clusters)
col_names, col_clusters = self._hclust(df, method,
col_names, col_clusters = self._hclust(df.T, method,
metric, n_col_clusters)
return {
'row_names': row_names,
......@@ -62,9 +62,9 @@ class ClusteringTask(AnalyticTask):
def _hclust(self, df: pd.DataFrame,
method: str, metric: str, n_clusters: int) -> Tuple[List, List]:
names = list(df)
series = np.array(df)
z = hclust.linkage(series, method=method, metric=metric)
names = list(df.index)
values = df.values
z = hclust.linkage(values, method=method, metric=metric)
cluster = [x[0] for x in hclust.cut_tree(z,
n_clusters=[n_clusters])]
cluster_count = Counter(cluster)
......@@ -93,8 +93,8 @@ class ClusteringTask(AnalyticTask):
logger.error(error)
raise ValueError(error)
row_names, row_clusters = self._kmeans(df.T, n_row_centroids)
col_names, col_clusters = self._kmeans(df, n_col_centroids)
row_names, row_clusters = self._kmeans(df, n_row_centroids)
col_names, col_clusters = self._kmeans(df.T, n_col_centroids)
return {
'row_names': row_names,
'col_names': col_names,
......@@ -103,9 +103,9 @@ class ClusteringTask(AnalyticTask):
}
def _kmeans(self, df: pd.DataFrame, n_centroids) -> Tuple[List, List]:
names = list(df)
series = np.array(df).astype('float')
cluster = list(kmeans2(series, k=n_centroids, minit='points')[1])
names = list(df.index)
values = df.as_matrix().astype('float')
cluster = list(kmeans2(values, k=n_centroids, minit='points')[1])
cluster_count = Counter(cluster)
# sort elements by their cluster size
sorted_cluster = sorted(zip(names, cluster),
......
......@@ -13,7 +13,7 @@ class TestClustering:
task = ClusteringTask()
valid_df = json.dumps({
df = {
'A': {
'a': 50,
'b': 2,
......@@ -29,20 +29,9 @@ class TestClustering:
'b': 4,
'c': 60
}
})
}
def test_hclust_raises_with_invalid_param_1(self):
with pytest.raises(ValueError) as e:
options = {
'method': 'single',
'metric': 'euclidean',
'n_row_clusters': 2,
'n_col_clusters': 2
}
self.task.main(df='{//foo', cluster_algo='hclust', options=options)
assert 'parse string to data frame' in e
def test_hclust_raises_with_invalid_param_2(self):
with pytest.raises(ValueError) as e:
options = {
'method': 'abc',
......@@ -50,11 +39,10 @@ class TestClustering:
'n_row_clusters': 2,
'n_col_clusters': 2
}
self.task.main(df=self.valid_df,
cluster_algo='hclust', options=options)
self.task.main(df=self.df, cluster_algo='hclust', options=options)
assert 'Invalid method' in e
def test_hclust_raises_with_invalid_param_3(self):
def test_hclust_raises_with_invalid_param_2(self):
with pytest.raises(ValueError) as e:
options = {
'method': 'single',
......@@ -62,19 +50,17 @@ class TestClustering:
'n_row_clusters': 2,
'n_col_clusters': 2
}
self.task.main(df=self.valid_df,
cluster_algo='hclust', options=options)
self.task.main(df=self.df, cluster_algo='hclust', options=options)
assert 'Invalid metric' in e
def test_hclust_raises_with_invalid_param_4(self):
def test_hclust_raises_with_invalid_param_3(self):
with pytest.raises(ValueError) as e:
options = {
'method': 'single',
'metric': 'abc',
'n_row_clusters': 2,
}
self.task.main(df=self.valid_df,
cluster_algo='hclust', options=options)
self.task.main(df=self.df, cluster_algo='hclust', options=options)
assert 'mandatory parameters' in e
def test_hclust_returns_valid_result(self):
......@@ -84,7 +70,7 @@ class TestClustering:
'n_row_clusters': 2,
'n_col_clusters': 2
}
result = self.task.main(df=self.valid_df,
result = self.task.main(df=self.df,
cluster_algo='hclust', options=options)
assert 'row_names' in result
assert 'col_names' in result
......@@ -95,33 +81,21 @@ class TestClustering:
assert [0, 0, 1] == result['row_cluster']
assert [0, 0, 1] == result['col_cluster']
def test_kmean_raises_with_invalid_param_1(self):
with pytest.raises(ValueError) as e:
options = {
'n_row_centroids': 2,
'n_col_centroids': 2
}
self.task.main(df='{//foo', cluster_algo='kmeans', options=options)
assert 'parse string to data frame' in e
def test_kmean_raises_with_invalid_param_2(self):
with pytest.raises(ValueError) as e:
options = {
'n_row_centroids': 2,
'n_col_centroids': 'abc'
}
self.task.main(df=self.valid_df,
cluster_algo='kmeans', options=options)
self.task.main(df=self.df, cluster_algo='kmeans', options=options)
assert 'invalid' in e
def test_kmean_raises_with_invalid_param_3(self):
def test_kmean_raises_with_invalid_param_2(self):
with pytest.raises(ValueError) as e:
options = {
'n_row_centroids': 2,
}
self.task.main(df=self.valid_df,
cluster_algo='kmeans', options=options)
self.task.main(df=self.df, cluster_algo='kmeans', options=options)
assert 'mandatory parameters' in e
def test_kmean_returns_valid_result(self):
......@@ -129,7 +103,7 @@ class TestClustering:
'n_row_centroids': 2,
'n_col_centroids': 2
}
result = self.task.main(df=self.valid_df,
result = self.task.main(df=self.df,
cluster_algo='kmeans', options=options)
assert 'row_names' in result
assert 'col_names' in result
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment