Commit c5d8c3d0 authored by Sascha Herzinger's avatar Sascha Herzinger

Basic volcanoplot code

parent 4a3d0d36
Pipeline #5247 failed with stages
in 8 minutes and 58 seconds
......@@ -112,10 +112,11 @@ class StatisticTask(AnalyticTask):
r_design = r['model.matrix'](r_form)
r_design.colnames = R.StrVector(groups)
r_data = pandas2ri.py2ri(df)
# the next two lines are necessary if column ids are not unique,
# because the python to r transformation drops those columns otherwise
r_ids = R.StrVector(['X{}'.format(id) for id in ids])
r_data = r_data.rx(r_ids)
# py2ri is stupid and makes too many assumptions.
# These two lines restore the column
r_data.colnames = list(set(ids))
r_data = r_data.rx(R.StrVector(ids))
r_fit = r['lmFit'](r_data, r_design)
r_contrast_matrix = r['makeContrasts'](*comparisons, levels=r_design)
r_fit_2 = r['contrasts.fit'](r_fit, r_contrast_matrix)
......
"""This module provides statistics for volcano plots."""
import logging
from typing import List, TypeVar
from functools import reduce
import pandas as pd
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.heatmap.stats import StatisticTask
from fractalis.analytics.tasks.shared import utils
T = TypeVar('T')
logger = logging.getLogger(__name__)
class VolcanoTask(AnalyticTask):
    """Volcanoplot Analysis Task implementing AnalyticTask. This class is a
    submittable celery task that computes per-feature differential
    statistics (log fold change and p-value, via limma) for volcano plots.
    """

    name = 'compute-volcanoplot'
    # Delegate the limma-based ranking statistics to the heat map stats task.
    stat_task = StatisticTask()

    def main(self, numerical_arrays: List[pd.DataFrame],
             id_filter: List[T],
             subsets: List[List[T]]) -> dict:
        """Compute the data needed to render a volcano plot.

        :param numerical_arrays: Long-format data frames, each expected to
            have 'id', 'feature' and 'value' columns.
        :param id_filter: If non-empty, keep only rows whose 'id' is listed.
        :param subsets: Groups of sample ids to compare. An empty list means
            a single subset containing every sample.
        :return: Dict with the melted plot data plus per-feature 'pValue'
            and 'logFC' lists taken from the limma statistics.
        :raises ValueError: If no rows survive the pre-processing.
        """
        # merge input data into single df.
        # pd.concat replaces the deprecated DataFrame.append-based reduce
        # (append was removed in pandas 2.0) and yields the same result.
        df = pd.concat(numerical_arrays)
        if not subsets:
            # empty subsets equals all samples in one subset
            subsets = [df['id'].unique().tolist()]
        else:
            # if subsets are defined we drop the rows that are not part of one
            flattened_subsets = [x for subset in subsets for x in subset]
            df = df[df['id'].isin(flattened_subsets)]
        # apply id filter
        if id_filter:
            df = df[df['id'].isin(id_filter)]
        # drop subset ids that are not in the df
        subsets = utils.drop_unused_subset_ids(df=df, subsets=subsets)
        # make sure the input data are still valid after the pre-processing
        if df.shape[0] < 1:
            error = "Either the input data set is too small or " \
                    "the subset sample ids do not match the data."
            logger.error(error)
            raise ValueError(error)
        # make matrix of input data: features as rows, sample ids as columns
        df = df.pivot(index='feature', columns='id', values='value')
        stats = self.stat_task.main(df=df, subsets=subsets,
                                    ranking_method='limma')
        # prepare output for front-end: melt the matrix back to long format
        # and annotate each row with the subset it belongs to
        df['feature'] = df.index
        df = pd.melt(df, id_vars='feature', var_name='id')
        df = utils.apply_subsets(df, subsets)
        return {
            'data': df.to_dict(orient='list'),
            'pValue': stats['P.Value'],
            'logFC': stats['logFC']
        }
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment