Commit f36edb7e authored by Sascha Herzinger
Browse files

Implementing heatmap tests and finalizing the analysis for now

parent dd2e5e35
Pipeline #2247 failed with stage
in 1 minute and 5 seconds
......@@ -95,7 +95,7 @@ class AnalyticTask(Task, metaclass=abc.ABCMeta):
df = self.secure_load(file_path)
else:
df = read_csv(file_path)
df = self.load_data_frame(file_path, decrypt)
df = self.secure_load(file_path)
return df
def prepare_args(self, session_data_tasks: List[str],
......
......@@ -3,6 +3,7 @@
from copy import deepcopy
from typing import List, TypeVar
from functools import reduce
import logging
import pandas as pd
from scipy.stats import zscore
......@@ -19,6 +20,7 @@ importr('limma')
pandas2ri.activate()
T = TypeVar('T')
logger = logging.getLogger(__name__)
class HeatmapTask(AnalyticTask):
......@@ -31,40 +33,41 @@ class HeatmapTask(AnalyticTask):
numericals: List[pd.DataFrame],
categoricals: List[pd.DataFrame],
subsets: List[List[T]]) -> dict:
# combine data frames col wise
# prepare inputs args
df = reduce(lambda a, b: a.append(b), numerical_arrays)
# prepare inputs
if not subsets:
subsets = [[]]
ids = list(df)
ids.remove('variable')
subsets = [ids]
df = drop_ungrouped_samples(df=df, subsets=subsets)
subsets = drop_unused_subset_ids(df=df, subsets=subsets)
# get samples-only data frame
variables = df['variable']
_df = df.drop('variable', axis=1)
# make sure the input data are still valid after the pre-processing
if df.shape[0] < 1 or df.shape[1] < 2:
error = "Either the input data set is too small or " \
"the subset sample ids do not match the data."
logger.error(error)
raise ValueError(error)
for subset in subsets:
if not subset:
error = "One or more of the specified subsets does not " \
"match any sample id for the given array data."
logger.error(error)
raise ValueError(error)
# create z-score matrix used for visualising the heatmap
zscores = _df.apply(zscore, axis=1)
variables = df['variable']
zscores = df.drop('variable', axis=1)
zscores = zscores.apply(zscore, axis=1)
zscores.insert(0, 'variable', variables)
# execute differential gene expression analysis
stats = self.getLimmaStats(_df, subsets)
# not needed any longer
del _df
stats = self.get_limma_stats(df, subsets)
# prepare output for front-end
df = df.transpose()
df.columns = variables
df.index.name = 'id'
df.reset_index(inplace=True)
df = pd.melt(df, id_vars='id')
zscores = zscores.transpose()
zscores.columns = variables
zscores.index.name = 'id'
zscores.reset_index(inplace=True)
zscores = pd.melt(zscores, id_vars='id')
df = self.melt_standard_format_df(df)
zscores = self.melt_standard_format_df(zscores)
df = pd.merge(df, zscores, on=['id', 'variable'])
df.columns = ['id', 'variable', 'value', 'zscore']
......@@ -73,7 +76,21 @@ class HeatmapTask(AnalyticTask):
'stats': stats.to_json(orient='index')
}
def getLimmaStats(self, df: pd.DataFrame,
def melt_standard_format_df(self, df: pd.DataFrame) -> pd.DataFrame:
    """Melt a wide data frame with a 'variable' column into long format.

    The input holds one row per variable: a 'variable' column with the row
    label plus one further column per sample id. The result holds one row
    per (sample id, variable) pair. The data frame passed in by the caller
    is left unmodified.
    :param df: Data frame with a 'variable' column and at least one
    additional column.
    :return: Data frame with the columns 'id', 'variable' and 'value'.
    :raises ValueError: If df has no rows or fewer than two columns.
    """
    if df.shape[0] < 1 or df.shape[1] < 2:
        error = "Data must be non-empty for melting."
        logger.error(error)
        raise ValueError(error)
    # The Series keeps its name 'variable'; assigning it as the column index
    # below is what makes pd.melt name the label column 'variable'.
    variables = df['variable']
    # Non-inplace drop: an inplace drop would strip the 'variable' column
    # from the caller's data frame as a hidden side effect.
    wide = df.drop('variable', axis=1).T
    wide.columns = variables
    wide.index.name = 'id'
    wide = wide.reset_index()
    return pd.melt(wide, id_vars='id')
def get_limma_stats(self, df: pd.DataFrame,
subsets: List[List[T]]) -> pd.DataFrame:
"""Use the R bioconductor package 'limma' to perform a differential
gene expression analysis on the given data frame.
......@@ -84,6 +101,21 @@ class HeatmapTask(AnalyticTask):
a different structured result data frame. See ?topTableF in R.
"""
# prepare the df in case an id exists in more than one subset
if len(subsets) < 2:
error = "Limma analysis requires at least " \
"two groups for comparison."
logger.error(error)
raise ValueError(error)
if df.shape[0] < 1 or df.shape[1] < 2:
error = "Limma analysis requires a " \
"data frame with dimension 1x2 or more."
logger.error(error)
raise ValueError(error)
# for analysis we want only sample cols
variables = df['variable']
df = df.drop('variable', axis=1)
flattened_subsets = [x for subset in subsets for x in subset]
df = df[flattened_subsets]
ids = list(df)
......@@ -118,13 +150,19 @@ class HeatmapTask(AnalyticTask):
r_design.colnames = R.StrVector(groups)
r_data = pandas2ri.py2ri(df)
# the next two lines are necessary if column ids are not unique, because
# the python to r transformation drops those columns
# the python to r transformation drops those columns otherwise
r_ids = R.StrVector(['X{}'.format(id) for id in ids])
r_data = r_data.rx(r_ids)
r_fit = r['lmFit'](r_data, r_design)
r_contrast_matrix = r['makeContrasts'](*comparisons, levels=r_design)
r_fit_2 = r['contrasts.fit'](r_fit, r_contrast_matrix)
r_fit_2 = r['eBayes'](r_fit_2)
r_results = r['topTable'](r_fit_2, number=float('inf'), sort='none')
r_results = r['topTable'](r_fit_2, number=float('inf'),
sort='none', genelist=variables)
results = pandas2ri.ri2py(r_results)
return results
\ No newline at end of file
# let's give the gene list column an appropriate name
colnames = results.columns.values
colnames[0] = 'variable'
results.columns = colnames
return results
......@@ -18,8 +18,7 @@ _protected_colnames = ['variable']
def drop_ungrouped_samples(df: pd.DataFrame,
subsets: List[List[T]]) -> pd.DataFrame:
"""Drop samples cols that are no present in any of the subsets.
:param df: Unmodified data frame
submitted to the main method of an AnalyticTask.
:param df: Dataframe containing array data in the Fractalis format.
:param subsets: Subgroups defined by the user.
:return: Filtered data frame.
"""
......@@ -38,8 +37,7 @@ def drop_ungrouped_samples(df: pd.DataFrame,
def drop_unused_subset_ids(df: pd.DataFrame,
subsets: List[List[T]]) -> List[List[T]]:
"""Drop subset ids that are not present in the given data
:param df: Unmodified data frame
submitted to the main method of an AnalyticTask.
:param df: Dataframe containing array data in the Fractalis format.
:param subsets: Subset groups specified by the user.
:return: Modified subsets list.
"""
......@@ -52,3 +50,4 @@ def drop_unused_subset_ids(df: pd.DataFrame,
if id not in df_ids:
subset.remove(id)
return _subsets
......@@ -5,13 +5,14 @@ import json
import pytest
import responses
from fractalis.data.etls.ada.etl_double import DoubleETL
from fractalis.data.etls.ada.etl_double_array import DoubleArrayETL
# noinspection PyMissingOrEmptyDocstring,PyMissingTypeHints
@pytest.mark.skip
class TestDoubleETL:
etl = DoubleETL()
etl = DoubleArrayETL()
valid_descriptor = {
'dictionary': {
......
"""This module provides tests for the heatmap analysis main module."""
import pytest
import pandas as pd
from fractalis.analytics.tasks.heatmap.main import HeatmapTask
# noinspection PyMissingTypeHints
class TestHeatmap:
    """Tests for the helper methods and the main method of HeatmapTask.

    Error-message assertions are placed after the ``pytest.raises`` block
    and inspect ``str(e.value)``: statements following the raising call
    inside the block never run, and the ExceptionInfo object itself does not
    support the ``in`` operator.
    """

    task = HeatmapTask()

    def test_melt_standard_format_df_works_for_standard_df(self):
        df = pd.DataFrame([['foo', 5, 10],
                           ['bar', 10, 15]],
                          columns=['variable', 101, 102])
        df = self.task.melt_standard_format_df(df)
        assert list(df) == ['id', 'variable', 'value']
        assert df.shape == (4, 3)

    def test_melt_standard_format_df_works_for_minimal_df(self):
        df = pd.DataFrame([['foo', 5]], columns=['variable', 101])
        df = self.task.melt_standard_format_df(df)
        assert list(df) == ['id', 'variable', 'value']
        assert df.shape == (1, 3)

    def test_melt_standard_format_df_raises_for_invalid_df(self):
        df = pd.DataFrame([['foo']], columns=['variable'])
        with pytest.raises(ValueError) as e:
            self.task.melt_standard_format_df(df)
        assert 'must be non-empty' in str(e.value)

    def test_get_limma_stats_raises_for_invalid_subsets(self):
        df = pd.DataFrame([['foo', 5, 10, 15, 20]],
                          columns=['variable', 0, 1, 2, 3])
        subsets = [[0, 1]]
        with pytest.raises(ValueError) as e:
            self.task.get_limma_stats(df=df, subsets=subsets)
        assert 'requires at least two' in str(e.value)

    def test_get_limma_stats_raises_for_invalid_df(self):
        df = pd.DataFrame([['foo']], columns=['variable'])
        subsets = [[0], [0]]
        with pytest.raises(ValueError) as e:
            self.task.get_limma_stats(df=df, subsets=subsets)
        assert 'dimension 1x2 or more' in str(e.value)

    def test_get_limma_stats_returns_correct_for_2_groups(self):
        df = pd.DataFrame([['foo', 5, 10, 15, 20]],
                          columns=['variable', 0, 1, 2, 3])
        subsets = [[0, 1], [2, 3]]
        stats = self.task.get_limma_stats(df=df, subsets=subsets)
        assert all(stat in list(stats) for stat in
                   ['variable', 'logFC', 'AveExpr', 't', 'P.Value',
                    'adj.P.Val', 'B'])

    def test_get_limma_stats_returns_correct_for_3_groups(self):
        df = pd.DataFrame([['foo', 5, 10, 15, 20]],
                          columns=['variable', 0, 1, 2, 3])
        subsets = [[0, 1], [2], [3]]
        stats = self.task.get_limma_stats(df=df, subsets=subsets)
        assert all(stat in list(stats) for stat in
                   ['variable', 'AveExpr', 'F', 'P.Value', 'adj.P.Val'])
        assert all(stat not in list(stats) for stat in ['logFC', 'B', 't'])

    def test_get_limma_stats_returns_correct_for_4_groups(self):
        df = pd.DataFrame([['foo', 5, 10, 15, 20]],
                          columns=['variable', 0, 1, 2, 3])
        subsets = [[0, 1], [1, 2], [2, 3], [3, 0]]
        stats = self.task.get_limma_stats(df=df, subsets=subsets)
        assert all(stat in list(stats) for stat in
                   ['variable', 'AveExpr', 'F', 'P.Value', 'adj.P.Val'])
        assert all(stat not in list(stats) for stat in ['logFC', 'B', 't'])

    def test_functional_1(self):
        numerical_arrays = [
            pd.DataFrame([['foo', 5, 10, 15, 20], ['bar', 6, 11, 16, 21]],
                         columns=['variable', 101, 102, 103, 104])
        ]
        subsets = [[101, 102], [103, 104]]
        result = self.task.main(numerical_arrays=numerical_arrays,
                                numericals=[],
                                categoricals=[],
                                subsets=subsets)
        assert 'data' in result
        assert 'stats' in result

    def test_main_raises_if_invalid_data(self):
        numerical_arrays = [
            pd.DataFrame([['foo', 5, 10, 15, 20]],
                         columns=['variable', 101, 102, 103, 104])
        ]
        subsets = [[0, 1, 2, 3]]  # does not match sample colnames
        with pytest.raises(ValueError) as e:
            self.task.main(numerical_arrays=numerical_arrays,
                           numericals=[],
                           categoricals=[],
                           subsets=subsets)
        assert 'subset sample ids do not match the data' in str(e.value)

    def test_main_raises_if_invalid_subsets(self):
        numerical_arrays = [
            pd.DataFrame([['foo', 5, 10, 15, 20]],
                         columns=['variable', 101, 102, 103, 104])
        ]
        subsets = [[101, 102, 103], [123]]
        with pytest.raises(ValueError) as e:
            self.task.main(numerical_arrays=numerical_arrays,
                           numericals=[],
                           categoricals=[],
                           subsets=subsets)
        assert 'specified subsets does not match' in str(e.value)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment