Commit 488dd501 authored by Sascha Herzinger

Many improvements around the heatmap analysis

parent 25c90a68
@@ -10,6 +10,7 @@ import yaml
 from flask import Flask
 from flask_cors import CORS
 from flask_request_id import RequestID
+from flask_compress import Compress
 from redis import StrictRedis
 from fractalis.session import RedisSessionInterface
@@ -40,6 +41,9 @@ if default_config:
 # Plugin that assigns every request an id
 RequestID(app)
+# Plugin that compresses all responses
+Compress(app)
 # create a redis instance
 log.info("Creating Redis connection.")
 redis = StrictRedis(host=app.config['REDIS_HOST'],
......
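Note: Flask-Compress needs only the one-line registration shown above. A minimal standalone sketch (not the project's actual app factory) of how the plugin behaves:

import flask
from flask_compress import Compress

app = flask.Flask(__name__)
Compress(app)  # gzip-compresses responses for clients sending Accept-Encoding: gzip

@app.route('/payload')
def payload():
    # large bodies such as the heatmap JSON benefit the most
    return 'x' * 100000

By default the plugin skips very small responses and non-compressible MIME types, so small bodies pass through unchanged.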
"""The /analytics controller. Please refer to doc/api for more information."""
import json
import logging
from typing import Tuple
from uuid import UUID
......
@@ -120,7 +120,9 @@ class AnalyticTask(Task, metaclass=abc.ABCMeta):
         :param value: The string to test.
         :return: True if argument contains data_task_id.
         """
-        return value.startswith('$') and value.endswith('$')
+        return isinstance(value, str) and \
+            value.startswith('$') and \
+            value.endswith('$')

     @staticmethod
     def parse_value(value: str) -> Tuple[str, dict]:
@@ -157,7 +159,7 @@ class AnalyticTask(Task, metaclass=abc.ABCMeta):
             value = args[arg]
             # value is data id
-            if isinstance(value, str) and self.contains_data_task_id(value):
+            if self.contains_data_task_id(value):
                 data_task_id, filters = self.parse_value(value)
                 df = self.data_task_id_to_data_frame(
                     data_task_id, session_data_tasks, decrypt)
......
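Moving the isinstance check into contains_data_task_id lets the call site shrink to a plain predicate while every caller stays protected against non-string arguments. A standalone sketch of the guard's behavior (hypothetical free function, outside the class):

def contains_data_task_id(value) -> bool:
    # non-strings (e.g. numeric arguments) must never reach .startswith()
    return isinstance(value, str) and \
        value.startswith('$') and \
        value.endswith('$')

assert contains_data_task_id('$some-task-id$')
assert not contains_data_task_id('plain string')
assert not contains_data_task_id(42)  # would raise AttributeError without the guard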
@@ -51,32 +51,41 @@ class HeatmapTask(AnalyticTask):
             raise ValueError(error)
         # make matrix of input data
-        _df = df.pivot(index='feature', columns='id', values='value')
+        df = df.pivot(index='feature', columns='id', values='value')
         # create z-score matrix used for visualising the heatmap
-        z_df = _df.apply(lambda row: (row - row.mean()) / row.std(ddof=0),
-                         axis=1)
+        z_df = [(df.iloc[i] - df.iloc[i].mean()) / df.iloc[i].std(ddof=0)
+                for i in range(df.shape[0])]
+        z_df = pd.DataFrame(z_df, columns=df.columns, index=df.index)
         # compute statistic for ranking
-        stats = self.stat_task.main(df=_df, subsets=subsets,
+        stats = self.stat_task.main(df=df, subsets=subsets,
                                     ranking_method=ranking_method)
-        del _df
-        # prepare output for front-end
-        z_df['feature'] = z_df.index
-        z_df = pd.melt(z_df, id_vars='feature')
-        df = df.merge(z_df, on=['id', 'feature'])
-        df.columns = ['id', 'feature', 'value', 'zscore']
-        # sort by ranking_value
-        df['sort_value'] = df['feature'].apply(
-            lambda x: stats[stats['feature'] == x][ranking_method].tolist()[0])
-        df = df.sort_values('sort_value', ascending=False).drop('sort_value', 1)
+        df = pd.merge(df, stats[['feature', ranking_method]], how='left',
+                      left_index=True, right_on='feature')
+        df = df.sort_values(ranking_method, ascending=False) \
+            .drop(ranking_method, axis=1)
+        z_df = pd.merge(z_df, stats[['feature', ranking_method]], how='left',
+                        left_index=True, right_on='feature')
+        z_df = z_df.sort_values(ranking_method, ascending=False) \
+            .drop(ranking_method, axis=1)
         # discard rows according to max_rows
-        df = df[df['feature'].isin(df['feature'].unique()[:max_rows])]
+        df = df[:max_rows]
+        z_df = z_df[:max_rows]
+        stats = stats[:max_rows]
+        # prepare output for front-end
+        df = pd.melt(df, id_vars='feature', var_name='id')
+        z_df = pd.melt(z_df, id_vars='feature', var_name='id')
+        df = df.merge(z_df, on=['id', 'feature'])
+        df.rename(columns={'value_x': 'value', 'value_y': 'zscore'},
+                  inplace=True)
         return {
-            'data': df.to_json(orient='records'),
-            'stats': stats.to_json(orient='records')
+            'data': df.to_dict(orient='list'),
+            'stats': stats.to_dict(orient='list')
         }
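The reworked method sorts and truncates the wide matrices first and only then melts to long format, instead of sorting the long format per row as before. A condensed, self-contained sketch of the new flow, using made-up data and a simple mean ranking as a stand-in for stat_task.main():

import pandas as pd

df = pd.DataFrame([[101, 'A', 1], [101, 'B', 8],
                   [102, 'A', 3], [102, 'B', 6]],
                  columns=['id', 'feature', 'value'])
df = df.pivot(index='feature', columns='id', values='value')
# row-wise z-scores with population std (ddof=0), as in the new code
z_df = [(df.iloc[i] - df.iloc[i].mean()) / df.iloc[i].std(ddof=0)
        for i in range(df.shape[0])]
z_df = pd.DataFrame(z_df, columns=df.columns, index=df.index)
# stand-in ranking: order features by their row mean
stats = pd.DataFrame({'feature': df.index, 'mean': df.mean(axis=1).values})
df = pd.merge(df, stats, how='left', left_index=True, right_on='feature')
df = df.sort_values('mean', ascending=False).drop('mean', axis=1)
# melt to long format and combine raw values with z-scores
df = pd.melt(df, id_vars='feature', var_name='id')
z_df['feature'] = z_df.index
z_df = pd.melt(z_df, id_vars='feature', var_name='id')
df = df.merge(z_df, on=['id', 'feature'])
df.rename(columns={'value_x': 'value', 'value_y': 'zscore'}, inplace=True)
print(df)  # columns: feature, id, value, zscore

Merging the melted frames on ['id', 'feature'] keeps raw value and z-score aligned per cell regardless of row order, and to_dict(orient='list') hands the front-end plain columns instead of a JSON string that would need a second parse.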
@@ -5,7 +5,6 @@ from typing import List, TypeVar
 import logging
 import pandas as pd
 from numpy import mean, median, var
-from rpy2 import robjects as R
 from rpy2.robjects import r, pandas2ri
 from rpy2.robjects.packages import importr
@@ -37,24 +36,24 @@ class StatisticTask(AnalyticTask):
     @staticmethod
     def get_mean_stats(df: pd.DataFrame) -> pd.DataFrame:
-        mean_series = df.apply(mean, axis=1)
-        df = mean_series.to_frame('mean')
-        df['feature'] = df.index
-        return df
+        means = [row.mean() for row in df.values]
+        stats = pd.DataFrame(means, columns=['mean'])
+        stats['feature'] = df.index
+        return stats

     @staticmethod
     def get_median_stats(df: pd.DataFrame) -> pd.DataFrame:
-        median_series = df.apply(median, axis=1)
-        df = median_series.to_frame('median')
-        df['feature'] = df.index
-        return df
+        medians = [median(row) for row in df.values]
+        stats = pd.DataFrame(medians, columns=['median'])
+        stats['feature'] = df.index
+        return stats

     @staticmethod
     def get_variance_stats(df: pd.DataFrame) -> pd.DataFrame:
-        var_series = df.apply(var, axis=1)
-        df = var_series.to_frame('var')
-        df['feature'] = df.index
-        return df
+        variances = [row.var() for row in df.values]
+        stats = pd.DataFrame(variances, columns=['var'])
+        stats['feature'] = df.index
+        return stats

     @staticmethod
     def get_limma_stats(df: pd.DataFrame,
......
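Iterating df.values yields plain numpy arrays, which expose .mean() and .var() methods but no .median(); the median variant above therefore calls the numpy median function that is still imported at the top of the module. For reference, a vectorized alternative sketch (not the project's code) that lets pandas do the row-wise work in one call:

import pandas as pd

def get_mean_stats(df: pd.DataFrame) -> pd.DataFrame:
    # axis=1 computes the statistic across each row at once
    stats = df.mean(axis=1).to_frame('mean')
    stats['feature'] = df.index
    return stats.reset_index(drop=True)

df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
                  index=['foo', 'bar'], columns=[101, 102])
print(get_mean_stats(df))  # mean: 1.5 (foo) and 3.5 (bar)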
@@ -21,6 +21,7 @@ PERMANENT_SESSION_LIFETIME = timedelta(days=1)
 BROKER_URL = 'amqp://'
 CELERY_RESULT_BACKEND = 'redis://{}:{}'.format(REDIS_HOST, REDIS_PORT)
 CELERYD_TASK_SOFT_TIME_LIMIT = 60 * 20
+CELERYD_TASK_TIME_LIMIT = 60 * 30
 CELERY_TASK_RESULT_EXPIRES = timedelta(hours=1)
 CELERYD_HIJACK_ROOT_LOGGER = False
......
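The new hard limit backs up the existing soft one: after 20 minutes Celery raises SoftTimeLimitExceeded inside the task, and after 30 minutes the worker process is terminated outright. A minimal sketch (hypothetical task, not from this commit) of how a task can use that grace window:

import time
from celery import Celery
from celery.exceptions import SoftTimeLimitExceeded

app = Celery('sketch', broker='amqp://')
app.conf.update(CELERYD_TASK_SOFT_TIME_LIMIT=60 * 20,  # raised inside the task
                CELERYD_TASK_TIME_LIMIT=60 * 30)       # worker is killed

@app.task
def long_analysis():
    try:
        time.sleep(60 * 25)  # stand-in for a long-running analysis
    except SoftTimeLimitExceeded:
        # roughly ten minutes of grace to clean up before the hard kill
        return 'aborted'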
@@ -18,7 +18,8 @@ class RandomNumericalETL(ETL):
     def extract(self, server: str,
                 token: str, descriptor: dict) -> pd.DataFrame:
-        data = pd.DataFrame(np.random.randn(50000, 200).tolist())
+        data = pd.DataFrame(np.random.randn(
+            descriptor['num_samples'], descriptor['num_features']).tolist())
         return data

     def transform(self, raw_data: pd.DataFrame,
......
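With the dimensions taken from the ETL descriptor, callers can request small frames instead of the fixed 50000x200 matrix. A standalone imitation of the call, with the 'num_samples' and 'num_features' keys introduced above (server and token are unused dummies here):

import numpy as np
import pandas as pd

def extract(server: str, token: str, descriptor: dict) -> pd.DataFrame:
    # random matrix sized by the descriptor, mirroring the extract() above
    return pd.DataFrame(np.random.randn(
        descriptor['num_samples'], descriptor['num_features']).tolist())

df = extract('', '', {'num_samples': 10, 'num_features': 5})
assert df.shape == (10, 5)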
@@ -24,6 +24,7 @@ class RedisSession(CallbackDict, SessionMixin):
         self.permanent = True
         self.modified = False

+
 class RedisSessionInterface(SessionInterface):
     def __init__(self, redis, app):
......
@@ -12,6 +12,7 @@ setup(
         'flask-cors',
         'Flask-Script',
         'flask-request-id-middleware',
+        'flask-compress',
         'jsonschema',
         'celery[redis]',
         'redis',
......
@@ -34,10 +34,10 @@ class TestHeatmap:
     def test_functional_with_nans_and_missing(self):
         numerical_arrays = [
-            pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6],
+            pd.DataFrame([[101, 'foo', 5], [101, 'bar', 5],
                           [102, 'foo', 10],
-                          [103, 'foo', float('nan')], [103, 'bar', 16],
-                          [104, 'foo', 20], [104, 'bar', 21]],
+                          [103, 'foo', float('nan')], [103, 'bar', 15],
+                          [104, 'foo', 20], [104, 'bar', 20]],
                          columns=['id', 'feature', 'value'])
         ]
         subsets = [[101, 102], [103, 104]]
@@ -48,8 +48,9 @@ class TestHeatmap:
            id_filter=[],
            max_rows=100,
            subsets=subsets)
-        stats = json.loads(result['stats'])
-        assert stats[0] != stats[1]
+        for stat in result['stats']:
+            if stat != 'feature' and stat != 'AveExpr':
+                assert result['stats'][stat][0] == result['stats'][stat][1]

     def test_main_raises_if_invalid_data(self):
         numerical_arrays = [
@@ -136,7 +137,7 @@ class TestHeatmap:
            id_filter=[],
            max_rows=100,
            subsets=subsets)
-        data = json.loads(result['data'])
+        data = result['data']
         data = pd.DataFrame(data)
         assert not np.isnan(np.min(data['zscore']))
@@ -156,10 +157,10 @@ class TestHeatmap:
            id_filter=[],
            max_rows=100,
            subsets=subsets)
-        data = json.loads(result['data'])
+        data = result['data']
         data = pd.DataFrame(data)
         feature_col = data['feature'].tolist()
-        assert ['D', 'D', 'C', 'C', 'A', 'A', 'B', 'B'] == feature_col
+        assert ['D', 'C', 'A', 'B', 'D', 'C', 'A', 'B'] == feature_col

     def test_max_rows_works(self):
         numerical_arrays = [
@@ -177,7 +178,7 @@ class TestHeatmap:
            id_filter=[],
            max_rows=2,
            subsets=subsets)
-        data = json.loads(result['data'])
+        data = result['data']
         data = pd.DataFrame(data)
         feature_col = data['feature'].tolist()
-        assert ['D', 'D', 'C', 'C'] == feature_col
+        assert ['D', 'C', 'D', 'C'] == feature_col
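The reordered expectations follow directly from the reworked pipeline: features are now sorted while the matrix is still wide, and pd.melt then stacks one id column at a time, so the sorted feature order repeats once per id instead of duplicate features appearing in adjacent pairs. A minimal demonstration:

import pandas as pd

wide = pd.DataFrame({'feature': ['D', 'C', 'A', 'B'],
                     101: [1, 2, 3, 4],
                     102: [5, 6, 7, 8]})
long = pd.melt(wide, id_vars='feature', var_name='id')
print(long['feature'].tolist())  # ['D', 'C', 'A', 'B', 'D', 'C', 'A', 'B']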