Commit b09f0529 authored by Sascha Herzinger's avatar Sascha Herzinger

Some refactoring and new volcanoplot params

parent c5d8c3d0
......@@ -9,8 +9,9 @@ RUN mkdir /app/ \
&& yum clean all \
&& yum install --nogpg -y python34 python34-pip python34-devel readline-devel R wget \
&& wget https://bioconductor.org/packages/release/bioc/src/contrib/limma_3.36.1.tar.gz \
&& R CMD INSTALL limma_*.tar.gz \
&& rm limma_*.tar.gz
&& wget https://bioconductor.org/packages/release/bioc/src/contrib/DESeq2_1.20.0.tar.gz \
&& R CMD INSTALL limma_*.tar.gz DESeq2_*.tar.gz \
&& rm limma_*.tar.gz DESeq2_*.tar.gz
COPY tests/ /app/tests/
WORKDIR /app/
ARG SDIST
......
......@@ -7,8 +7,7 @@ import logging
import pandas as pd
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.heatmap.stats import StatisticTask
from fractalis.analytics.tasks.shared import utils
from fractalis.analytics.tasks.shared import utils, array_stats
T = TypeVar('T')
......@@ -20,7 +19,6 @@ class HeatmapTask(AnalyticTask):
submittable celery task."""
name = 'compute-heatmap'
stat_task = StatisticTask()
def main(self, numerical_arrays: List[pd.DataFrame],
numericals: List[pd.DataFrame],
......@@ -59,8 +57,8 @@ class HeatmapTask(AnalyticTask):
z_df = pd.DataFrame(z_df, columns=df.columns, index=df.index)
# compute statistic for ranking
stats = self.stat_task.main(df=df, subsets=subsets,
ranking_method=ranking_method)
stats = array_stats.get_stats(df=df, subsets=subsets,
ranking_method=ranking_method)
# sort by ranking_value
self.sort(df, stats[ranking_method], ranking_method)
......
"""This module provides row-wise statistics for the heat map."""
from copy import deepcopy
from typing import List, TypeVar
import logging
import pandas as pd
from rpy2 import robjects as R
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr
from fractalis.analytics.task import AnalyticTask
T = TypeVar('T')
importr('limma')
pandas2ri.activate()
logger = logging.getLogger(__name__)
class StatisticTask(AnalyticTask):
name = 'expression-stats-task'
def main(self, df: pd.DataFrame, subsets: List[List[T]],
ranking_method: str) -> pd.DataFrame:
if ranking_method == 'mean':
stats = self.get_mean_stats(df)
elif ranking_method == 'median':
stats = self.get_median_stats(df)
elif ranking_method == 'variance':
stats = self.get_variance_stats(df)
else:
stats = self.get_limma_stats(df, subsets)
return stats
@staticmethod
def get_mean_stats(df: pd.DataFrame) -> pd.DataFrame:
means = [row.mean() for row in df.values]
stats = pd.DataFrame(means, columns=['mean'])
stats['feature'] = df.index
return stats
@staticmethod
def get_median_stats(df: pd.DataFrame) -> pd.DataFrame:
medians = df.median(axis=1).tolist()
stats = pd.DataFrame(medians, columns=['median'])
stats['feature'] = df.index
return stats
@staticmethod
def get_variance_stats(df: pd.DataFrame) -> pd.DataFrame:
means = [row.var() for row in df.values]
stats = pd.DataFrame(means, columns=['var'])
stats['feature'] = df.index
return stats
@staticmethod
def get_limma_stats(df: pd.DataFrame,
subsets: List[List[T]]) -> pd.DataFrame:
"""Use the R bioconductor package 'limma' to perform a differential
gene expression analysis on the given data frame.
:param df: Matrix of measurements where each column represents a sample
and each row a gene/probe.
:param subsets: Groups to compare with each other.
:return: Results of the limma analysis. More than 2 subsets will result in
a differently structured result data frame. See ?topTableF in R.
"""
# prepare the df in case an id exists in more than one subset
if len(subsets) < 2:
error = "Limma analysis requires at least " \
"two non-empty groups for comparison."
logger.error(error)
raise ValueError(error)
if df.shape[0] < 1 or df.shape[1] < 2:
error = "Limma analysis requires a " \
"data frame with dimension 1x2 or more."
logger.error(error)
raise ValueError(error)
flattened_subsets = [x for subset in subsets for x in subset]
df = df[flattened_subsets]
ids = list(df)
features = df.index
# creating the design vector according to the subsets
design_vector = [''] * len(ids)
subsets_copy = deepcopy(subsets)
for i, id in enumerate(ids):
for j, subset in enumerate(subsets_copy):
try:
subset.index(id) # raises an Exception if not found
subset.remove(id)
design_vector[i] = str(j + 1)
break
except ValueError:
assert j != len(subsets_copy) - 1
assert '' not in design_vector
# create group names
groups = ['group{}'.format(i + 1) for i in list(range(len(subsets)))]
# create a string for each pairwise comparison of the groups
comparisons = []
for i in reversed(range(len(subsets))):
for j in range(i):
comparisons.append('group{}-group{}'.format(i+1, j+1))
# fitting according to limma doc Chapter 8: Linear Models Overview
r_form = R.Formula('~ 0+factor(c({}))'.format(','.join(design_vector)))
r_design = r['model.matrix'](r_form)
r_design.colnames = R.StrVector(groups)
r_data = pandas2ri.py2ri(df)
# py2ri makes too many assumptions about the input.
# These two lines restore the original column order
r_data.colnames = list(set(ids))
r_data = r_data.rx(R.StrVector(ids))
r_fit = r['lmFit'](r_data, r_design)
r_contrast_matrix = r['makeContrasts'](*comparisons, levels=r_design)
r_fit_2 = r['contrasts.fit'](r_fit, r_contrast_matrix)
r_fit_2 = r['eBayes'](r_fit_2)
r_results = r['topTable'](r_fit_2, number=float('inf'),
sort='none', genelist=features)
results = pandas2ri.ri2py(r_results)
# let's give the gene list column an appropriate name
colnames = results.columns.values
colnames[0] = 'feature'
results.columns = colnames
return results
"""This module provides statistics for mRNA and miRNA data."""
from copy import deepcopy
from typing import List, TypeVar
import logging
import pandas as pd
from rpy2 import robjects as robj
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr
T = TypeVar('T')
importr('limma')
importr('DESeq2')
pandas2ri.activate()
logger = logging.getLogger(__name__)
def get_stats(df: pd.DataFrame, subsets: List[List[T]],
ranking_method: str, params: dict = None) -> pd.DataFrame:
if ranking_method == 'mean':
stats = get_mean_stats(df)
elif ranking_method == 'median':
stats = get_median_stats(df)
elif ranking_method == 'variance':
stats = get_variance_stats(df)
elif ranking_method == 'limma':
stats = get_limma_stats(df, subsets)
elif ranking_method == 'DESeq2':
stats = get_deseq2_stats(df, subsets, **(params or {}))
else:
error = "Static method unknown: {}".format(ranking_method)
logger.exception(error)
raise NotImplementedError(error)
return stats
def get_mean_stats(df: pd.DataFrame) -> pd.DataFrame:
means = [row.mean() for row in df.values]
stats = pd.DataFrame(means, columns=['mean'])
stats['feature'] = df.index
return stats
def get_median_stats(df: pd.DataFrame) -> pd.DataFrame:
medians = df.median(axis=1).tolist()
stats = pd.DataFrame(medians, columns=['median'])
stats['feature'] = df.index
return stats
def get_variance_stats(df: pd.DataFrame) -> pd.DataFrame:
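# note: ndarray.var() uses ddof=0, i.e. the population variance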
variances = [row.var() for row in df.values]
stats = pd.DataFrame(variances, columns=['var'])
stats['feature'] = df.index
return stats
def get_limma_stats(df: pd.DataFrame, subsets: List[List[T]]) -> pd.DataFrame:
"""Use the R bioconductor package 'limma' to perform a differential
gene expression analysis on the given data frame.
:param df: Matrix of measurements where each column represents a sample
and each row a gene/probe.
:param subsets: Groups to compare with each other.
:return: Results of the limma analysis. More than 2 subsets will result in
a differently structured result data frame. See ?topTableF in R.
"""
logger.debug("Computing limma stats")
# prepare the df in case an id exists in more than one subset
if len(subsets) < 2:
error = "Limma analysis requires at least " \
"two non-empty groups for comparison."
logger.error(error)
raise ValueError(error)
if df.shape[0] < 1 or df.shape[1] < 2:
error = "Limma analysis requires a " \
"data frame with dimension 1x2 or more."
logger.error(error)
raise ValueError(error)
flattened_subsets = [x for subset in subsets for x in subset]
df = df[flattened_subsets]
ids = list(df)
features = df.index
# creating the design vector according to the subsets
design_vector = [''] * len(ids)
subsets_copy = deepcopy(subsets)
for i, sample_id in enumerate(ids):
for j, subset in enumerate(subsets_copy):
try:
subset.index(sample_id) # raises an Exception if not found
subset.remove(sample_id)
design_vector[i] = str(j + 1)
break
except ValueError:
assert j != len(subsets_copy) - 1
assert '' not in design_vector
# create group names
groups = ['group{}'.format(i + 1) for i in list(range(len(subsets)))]
# create a string for each pairwise comparison of the groups
comparisons = []
for i in reversed(range(len(subsets))):
for j in range(i):
comparisons.append('group{}-group{}'.format(i+1, j+1))
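# e.g. three subsets yield ['group3-group1', 'group3-group2', 'group2-group1']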
# fitting according to limma doc Chapter 8: Linear Models Overview
r_form = robj.Formula('~ 0+factor(c({}))'.format(','.join(design_vector)))
r_design = r['model.matrix'](r_form)
r_design.colnames = robj.StrVector(groups)
r_data = pandas2ri.py2ri(df)
# py2ri makes too many assumptions about the input.
# These two lines restore the original column order
r_data.colnames = list(set(ids))
r_data = r_data.rx(robj.StrVector(ids))
r_fit = r['lmFit'](r_data, r_design)
r_contrast_matrix = r['makeContrasts'](*comparisons, levels=r_design)
r_fit_2 = r['contrasts.fit'](r_fit, r_contrast_matrix)
r_fit_2 = r['eBayes'](r_fit_2)
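# topTable with number=inf returns the moderated statistics for every
# feature; 'genelist' prepends the original feature names to the result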
r_results = r['topTable'](r_fit_2, number=float('inf'),
sort='none', genelist=features)
results = pandas2ri.ri2py(r_results)
# let's give the gene list column an appropriate name
colnames = results.columns.values
colnames[0] = 'feature'
results.columns = colnames
return results
# FIXME: Add more parameters (e.g. for filtering low count rows)
def get_deseq2_stats(df: pd.DataFrame,
subsets: List[List[T]],
min_total_row_count: int = 0) -> pd.DataFrame:
"""Use the R bioconductor package 'limma' to perform a differential
expression analysis of count like data (e.g. miRNA). See package
documentation for more details.
:param df: Matrix of counts, where each column is a sample and each row
a feature.
:param subsets: The two subsets to compare with each other.
:param min_total_row_count: Drop rows whose total read count across all
samples is less than min_total_row_count.
:return: Results of the analysis as a DataFrame (baseMean, log2FoldChange,
pvalue, padj, ...).
"""
logger.debug("Computing deseq2 stats")
if len(subsets) != 2:
error = "This method currently only supports exactly two " \
"subsets as this is the most common use case. Support " \
"for more subsets will be added later."
logger.error(error)
raise ValueError(error)
# flatten subset
flattened_subsets = [x for subset in subsets for x in subset]
# discard columns that are not in a subset
df = df[flattened_subsets]
# filter rows with too few reads
total_row_counts = df.sum(axis=1)
keep = total_row_counts[total_row_counts > min_total_row_count].index
df = df.loc[keep]
# pandas df -> R df
r_count_data = pandas2ri.py2ri(df)
# py2ri makes too many assumptions about the input.
# These two lines restore the original column order
r_count_data.colnames = list(set(flattened_subsets))
r_count_data = r_count_data.rx(robj.StrVector(flattened_subsets))
# build the condition factor: one group label per sample ('s0' for the
# first subset, 's1' for the second), as described in the DESeq2 vignette
condition = ['s{}'.format(i) for i, subset in enumerate(subsets)
for _ in subset]
r_condition = robj.FactorVector(robj.StrVector(condition))
r_col_data = r['DataFrame'](condition=r_condition)
r_design = robj.Formula('~ condition')
r_design.environment['condition'] = r_condition
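# the '~ condition' design models the counts as a function of the group
# label only; with two subsets the reported fold change compares the
# second subset ('s1') against the first ('s0')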
r_dds = r['DESeqDataSetFromMatrix'](r_count_data, r_col_data, r_design)
r_dds = r['DESeq'](r_dds, parallel=True)
r_res = r['results'](r_dds)
# R result table to Python pandas
r_res = r['as.data.frame'](r_res)
results = pandas2ri.ri2py(r_res)
return results
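For reference, a minimal usage sketch of the new array_stats.get_stats dispatcher (the sample ids and count values below are invented for illustration and are not part of this commit):

import pandas as pd
from fractalis.analytics.tasks.shared import array_stats

# toy count matrix: one row per feature, one column per sample
df = pd.DataFrame([[500, 1, 1, 500],
                   [1, 500, 500, 1]],
                  index=['feat1', 'feat2'],
                  columns=['s1', 's2', 's3', 's4'])
subsets = [['s1', 's2'], ['s3', 's4']]

# limma-based ranking needs no extra parameters
limma_stats = array_stats.get_stats(df=df, subsets=subsets,
                                    ranking_method='limma', params={})

# DESeq2-based ranking forwards params to get_deseq2_stats as keyword arguments
deseq2_stats = array_stats.get_stats(df=df, subsets=subsets,
                                     ranking_method='DESeq2',
                                     params={'min_total_row_count': 10})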
......@@ -7,8 +7,7 @@ from functools import reduce
import pandas as pd
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.heatmap.stats import StatisticTask
from fractalis.analytics.tasks.shared import utils
from fractalis.analytics.tasks.shared import utils, array_stats
T = TypeVar('T')
logger = logging.getLogger(__name__)
......@@ -19,10 +18,11 @@ class VolcanoTask(AnalyticTask):
submittable celery task."""
name = 'compute-volcanoplot'
stat_task = StatisticTask()
def main(self, numerical_arrays: List[pd.DataFrame],
id_filter: List[T],
ranking_method: str,
params: dict,
subsets: List[List[T]]):
# merge input data into single df
df = reduce(lambda a, b: a.append(b), numerical_arrays)
......@@ -47,17 +47,14 @@ class VolcanoTask(AnalyticTask):
# make matrix of input data
df = df.pivot(index='feature', columns='id', values='value')
features = list(df.index)
stats = self.stat_task.main(df=df, subsets=subsets,
ranking_method='limma')
# prepare output for front-end
df['feature'] = df.index
df = pd.melt(df, id_vars='feature', var_name='id')
df = utils.apply_subsets(df, subsets)
# compute the stats (p / fC) for the selected ranking method
stats = array_stats.get_stats(df=df,
subsets=subsets,
params=params,
ranking_method=ranking_method)
return {
'data': df.to_dict(orient='list'),
'pValue': stats['P.Value'],
'logFC': stats['logFC']
}
\ No newline at end of file
'features': features,
'stats': stats.to_dict(orient='list')
}
amqp==2.2.2
atomicwrites==1.1.5
attrs==18.1.0
billiard==3.5.0.3
celery==4.1.0
certifi==2018.4.16
chardet==3.0.4
click==6.7
cookies==2.2.1
coverage==4.5.1
cycler==0.10.0
flake8==3.5.0
Flask==0.12.2
Flask-Compress==1.4.0
Flask-Cors==3.0.3
flask-request-id-middleware==1.1
Flask-Script==2.0.6
fractalis==0.6.0
idna==2.6
itsdangerous==0.24
Jinja2==2.10
jsonschema==2.6.0
kiwisolver==1.0.1
kombu==4.1.0
MarkupSafe==1.0
matplotlib==2.2.2
mccabe==0.6.1
more-itertools==4.2.0
numpy==1.13.3
pandas==0.20.3
piptree==0.1.3
pkginfo==1.4.2
pluggy==0.6.0
py==1.5.3
pycodestyle==2.3.1
pycryptodomex==3.4.7
pyflakes==1.6.0
pyparsing==2.2.0
pytest==3.6.0
pytest-cov==2.5.1
pytest-runner==4.2
python-dateutil==2.7.3
pytz==2018.4
PyYAML==3.12
redis==2.10.6
requests==2.18.4
requests-toolbelt==0.8.0
responses==0.9.0
rpy2==2.9.3
scikit-learn==0.19.1
scipy==0.19.1
six==1.11.0
sklearn==0.0
tqdm==4.23.4
twine==1.11.0
typing==3.6.2
tzlocal==1.5.1
urllib3==1.22
vine==1.1.4
Werkzeug==0.14.1
"""This module provides tests for the array_utils module."""
"""This module provides tests for the array_stats module."""
import pytest
import pandas as pd
from fractalis.analytics.tasks.heatmap.stats import StatisticTask
from fractalis.analytics.tasks.shared import array_stats
# noinspection PyMissingOrEmptyDocstring,PyMethodMayBeStatic,PyMissingTypeHints
class TestArrayUtils:
stat_task = StatisticTask()
# noinspection PyMissingOrEmptyDocstring,PyMethodMayBeStatic,PyMissingTypeHints
class TestArrayStats:
def test_get_limma_stats_raises_for_invalid_subsets(self):
df = pd.DataFrame([[5, 10, 15, 20]], index=['foo'],
columns=[0, 1, 2, 3])
subsets = [[0, 1]]
with pytest.raises(ValueError) as e:
self.stat_task.get_limma_stats(df=df, subsets=subsets)
array_stats.get_limma_stats(df=df, subsets=subsets)
assert 'requires at least two' in e
def test_get_limma_stats_raises_for_invalid_df(self):
df = pd.DataFrame([], index=['foo'], columns=[])
subsets = [[0], [0]]
with pytest.raises(ValueError) as e:
self.stat_task.get_limma_stats(df=df, subsets=subsets)
array_stats.get_limma_stats(df=df, subsets=subsets)
assert 'dimension 1x2 or more' in e
def test_get_limma_stats_returns_correct_for_2_groups(self):
df = pd.DataFrame([[5, 10, 15, 20]], index=['foo'],
columns=[0, 1, 2, 3])
subsets = [[0, 1], [2, 3]]
stats = self.stat_task.get_limma_stats(df=df, subsets=subsets)
stats = array_stats.get_limma_stats(df=df, subsets=subsets)
assert all(stat in list(stats) for stat in
['feature', 'logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val',
'B'])
......@@ -39,7 +38,7 @@ class TestArrayUtils:
df = pd.DataFrame([[5, 10, 15, 20]], index=['foo'],
columns=[0, 1, 2, 3])
subsets = [[0, 1], [2], [3]]
stats = self.stat_task.get_limma_stats(df=df, subsets=subsets)
stats = array_stats.get_limma_stats(df=df, subsets=subsets)
assert all(stat in list(stats) for stat in
['feature', 'AveExpr', 'F', 'P.Value', 'adj.P.Val'])
assert all(stat not in list(stats) for stat in ['logFC', 'B', 't'])
......@@ -48,7 +47,24 @@ class TestArrayUtils:
df = pd.DataFrame([[5, 10, 15, 20]], index=['foo'],
columns=[0, 1, 2, 3])
subsets = [[0, 1], [1, 2], [2, 3], [3, 0]]
stats = self.stat_task.get_limma_stats(df=df, subsets=subsets)
stats = array_stats.get_limma_stats(df=df, subsets=subsets)
assert all(stat in list(stats) for stat in
['feature', 'AveExpr', 'F', 'P.Value', 'adj.P.Val'])
assert all(stat not in list(stats) for stat in ['logFC', 'B', 't'])
def test_get_deseq2_stats_returns_correct_for_2_groups(self):
df = pd.DataFrame([[500, 1, 1, 500],
[1, 500, 500, 1]],
columns=[0, 1, 2, 3])
subsets = [[0, 1], [2, 3]]
stats = array_stats.get_deseq2_stats(df=df, subsets=subsets)
assert all(stat in list(stats) for stat in
['baseMean', 'log2FoldChange', 'lfcSE',
'stat', 'pvalue', 'padj'])
def test_deseq2_requires_exactly_2_subsets(self):
with pytest.raises(ValueError):
array_stats.get_deseq2_stats(df=pd.DataFrame(),
subsets=[[], [], []])
with pytest.raises(ValueError):
array_stats.get_deseq2_stats(df=pd.DataFrame(), subsets=[[]])