Commit 7bf7cc34 authored by Sascha Herzinger
Browse files

Added sorting and cutting to heatmap algo

parent 0a6d4fe7
......@@ -5,7 +5,6 @@ from functools import reduce
import logging
import pandas as pd
from scipy.stats import zscore
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.heatmap.stats import StatisticTask
......@@ -28,6 +27,7 @@ class HeatmapTask(AnalyticTask):
categoricals: List[pd.DataFrame],
ranking_method: str,
id_filter: List[T],
max_rows: int,
subsets: List[List[T]]) -> dict:
# merge input data into single df
df = reduce(lambda a, b: a.append(b), numerical_arrays)
......@@ -49,18 +49,13 @@ class HeatmapTask(AnalyticTask):
"the subset sample ids do not match the data."
logger.error(error)
raise ValueError(error)
for subset in subsets:
if not subset:
error = "One or more of the specified subsets does not " \
"match any sample id for the given array data."
logger.error(error)
raise ValueError(error)
# make matrix of input data
_df = df.pivot(index='feature', columns='id', values='value')
# create z-score matrix used for visualising the heatmap
z_df = _df.apply(zscore, axis=1)
z_df = _df.apply(lambda row: (row - row.mean()) / row.std(ddof=0),
axis=1)
# compute statistic for ranking
stats = self.stat_task.main(df=_df, subsets=subsets,
......@@ -73,6 +68,14 @@ class HeatmapTask(AnalyticTask):
df = df.merge(z_df, on=['id', 'feature'])
df.columns = ['id', 'feature', 'value', 'zscore']
# sort by ranking_value
df['sort_value'] = df['feature'].apply(
lambda x: stats[stats['feature'] == x][ranking_method][0])
df = df.sort_values('sort_value', ascending=False).drop('sort_value', 1)
# discard rows according to max_rows
df = df[df['feature'].isin(df['feature'].unique()[:max_rows])]
return {
'data': df.to_json(orient='records'),
'stats': stats.to_json(orient='records')
......
......@@ -70,7 +70,7 @@ class StatisticTask(AnalyticTask):
# prepare the df in case an id exists in more than one subset
if len(subsets) < 2:
error = "Limma analysis requires at least " \
"two groups for comparison."
"two non-empty groups for comparison."
logger.error(error)
raise ValueError(error)
if df.shape[0] < 1 or df.shape[1] < 2:
......
"""This module provides tests for the heatmap analysis main module."""
import json
import pytest
import pandas as pd
import numpy as np
from fractalis.analytics.tasks.heatmap.main import HeatmapTask
......@@ -11,7 +14,7 @@ class TestHeatmap:
task = HeatmapTask()
def test_functional_1(self):
def test_functional(self):
numerical_arrays = [
pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6], [102, 'foo', 10],
[102, 'bar', 11], [103, 'foo', 15], [103, 'bar', 16],
......@@ -24,43 +27,47 @@ class TestHeatmap:
categoricals=[],
ranking_method='B',
id_filter=[],
max_rows=100,
subsets=subsets)
assert 'data' in result
assert 'stats' in result
def test_main_raises_if_invalid_data(self):
def test_functional_with_nans_and_missing(self):
numerical_arrays = [
pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6], [102, 'foo', 10],
[102, 'bar', 11], [103, 'foo', 15], [103, 'bar', 16],
pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6],
[102, 'foo', 10],
[103, 'foo', float('nan')], [103, 'bar', 16],
[104, 'foo', 20], [104, 'bar', 21]],
columns=['id', 'feature', 'value'])
]
subsets = [[1, 2, 3, 4]] # does not match sample colnames
with pytest.raises(ValueError) as e:
self.task.main(numerical_arrays=numerical_arrays,
numericals=[],
categoricals=[],
ranking_method='mean',
id_filter=[],
subsets=subsets)
assert 'subset sample ids do not match the data' in e
subsets = [[101, 102], [103, 104]]
result = self.task.main(numerical_arrays=numerical_arrays,
numericals=[],
categoricals=[],
ranking_method='B',
id_filter=[],
max_rows=100,
subsets=subsets)
stats = json.loads(result['stats'])
assert stats[0] != stats[1]
def test_main_raises_if_invalid_subsets(self):
def test_main_raises_if_invalid_data(self):
numerical_arrays = [
pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6], [102, 'foo', 10],
[102, 'bar', 11], [103, 'foo', 15], [103, 'bar', 16],
[104, 'foo', 20], [104, 'bar', 21]],
columns=['id', 'feature', 'value'])
]
subsets = [[101, 102, 103], [123]]
subsets = [[1, 2, 3, 4]] # does not match sample colnames
with pytest.raises(ValueError) as e:
self.task.main(numerical_arrays=numerical_arrays,
numericals=[],
categoricals=[],
ranking_method='mean',
id_filter=[],
max_rows=100,
subsets=subsets)
assert 'specified subsets does not match' in e
assert 'data set is too small' in e
def test_empty_subset_equals_full_subset(self):
numerical_arrays = [
......@@ -74,6 +81,7 @@ class TestHeatmap:
categoricals=[],
ranking_method='mean',
id_filter=[],
max_rows=100,
subsets=[])
result_2 = self.task.main(numerical_arrays=numerical_arrays,
......@@ -81,5 +89,95 @@ class TestHeatmap:
categoricals=[],
ranking_method='mean',
id_filter=[],
max_rows=100,
subsets=[[101, 102, 103, 104]])
assert result_1 == result_2
def test_multiple_numerical_array_data(self):
    """Run the task on two numerical arrays that only partly overlap.

    The second frame introduces the feature 'baz' and the ids 105 and 106,
    so not every id has a value for every feature. Only the presence of
    both result keys is checked.
    """
    columns = ['id', 'feature', 'value']
    first_array = pd.DataFrame(
        [[101, 'foo', 5], [101, 'bar', 6],
         [102, 'foo', 10], [102, 'bar', 11],
         [103, 'foo', 15], [103, 'bar', 16],
         [104, 'foo', 20], [104, 'bar', 21]],
        columns=columns)
    second_array = pd.DataFrame(
        [[101, 'baz', 10], [102, 'baz', 11],
         [105, 'foo', 20], [105, 'baz', 21],
         [106, 'bar', 15]],
        columns=columns)
    result = self.task.main(
        numerical_arrays=[first_array, second_array],
        numericals=[],
        categoricals=[],
        ranking_method='B',
        id_filter=[],
        max_rows=100,
        subsets=[[101, 102, 106], [103, 104, 105]])
    for key in ('data', 'stats'):
        assert key in result
def test_zscore_is_not_nan_if_data_misses_values(self):
    """Check that no returned z-score is NaN when values are missing.

    The two input frames only partly overlap (e.g. id 105 has no 'bar'
    value), so not every id/feature combination is present in the input.
    Every record of the returned 'data' payload must still carry a usable
    'zscore'.
    """
    numerical_arrays = [
        pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6],
                      [102, 'foo', 10], [102, 'bar', 11],
                      [103, 'foo', 15], [103, 'bar', 16],
                      [104, 'foo', 20], [104, 'bar', 21]],
                     columns=['id', 'feature', 'value']),
        pd.DataFrame([[101, 'baz', 10], [102, 'baz', 11],
                      [105, 'foo', 20], [105, 'baz', 21],
                      [106, 'bar', 15]],
                     columns=['id', 'feature', 'value'])
    ]
    subsets = [[101, 102, 106], [103, 104, 105]]
    result = self.task.main(numerical_arrays=numerical_arrays,
                            numericals=[],
                            categoricals=[],
                            ranking_method='B',
                            id_filter=[],
                            max_rows=100,
                            subsets=subsets)
    data = pd.DataFrame(json.loads(result['data']))
    # np.min() on a Series delegates to Series.min(), which skips NaN by
    # default, so a min-based check passes even when some z-scores are
    # missing. Inspect every entry instead.
    assert not data['zscore'].isnull().any()
def test_results_are_sorted(self):
    """Check that the returned rows follow the 'mean' ranking order.

    Each feature has the same value for both ids (D=10, C=8, A=5, B=2),
    and the rows are expected back from the highest to the lowest value.
    """
    feature_values = (('A', 5), ('B', 2), ('C', 8), ('D', 10))
    rows = [[sample_id, feature, value]
            for feature, value in feature_values
            for sample_id in (101, 102)]
    frame = pd.DataFrame(rows, columns=['id', 'feature', 'value'])
    result = self.task.main(numerical_arrays=[frame],
                            numericals=[],
                            categoricals=[],
                            ranking_method='mean',
                            id_filter=[],
                            max_rows=100,
                            subsets=[])
    returned = pd.DataFrame(json.loads(result['data']))
    expected_order = ['D', 'D', 'C', 'C', 'A', 'A', 'B', 'B']
    assert expected_order == returned['feature'].tolist()
def test_max_rows_works(self):
    """Check that max_rows limits the result to the top-ranked features.

    Four features are supplied (D=10, C=8, A=5, B=2 for both ids); with
    max_rows=2 only the rows of 'D' and 'C' are expected in the output.
    """
    feature_values = (('A', 5), ('B', 2), ('C', 8), ('D', 10))
    rows = [[sample_id, feature, value]
            for feature, value in feature_values
            for sample_id in (101, 102)]
    frame = pd.DataFrame(rows, columns=['id', 'feature', 'value'])
    result = self.task.main(numerical_arrays=[frame],
                            numericals=[],
                            categoricals=[],
                            ranking_method='mean',
                            id_filter=[],
                            max_rows=2,
                            subsets=[])
    returned = pd.DataFrame(json.loads(result['data']))
    expected_order = ['D', 'D', 'C', 'C']
    assert expected_order == returned['feature'].tolist()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment