Commit f7878378 authored by Sascha Herzinger's avatar Sascha Herzinger
Browse files

implemented limma fitting for heatmap analysis

parent 4147ecbb
Pipeline #2244 failed with stage
in 1 minute and 7 seconds
"""Module containing analysis code for heatmap analytics."""
from copy import deepcopy
from typing import List, TypeVar
from functools import reduce
import pandas as pd
from scipy.stats import zscore
from rpy2 import robjects as R
from rpy2.robjects import r, pandas2ri
from rpy2.robjects.packages import importr
from import AnalyticTask
T = TypeVar('T')
......@@ -27,7 +34,7 @@ class HeatmapTask(AnalyticTask):
df = df.drop('variable', axis=1)
zscores = df.apply(zscore, axis=1)
#prepare output for front-end
# prepare output for front-end
df = df.transpose()
df.columns = variables = 'id'
......@@ -45,4 +52,52 @@ class HeatmapTask(AnalyticTask):
return {
'data': df.to_json(orient='index')
\ No newline at end of file
def getLimmaStats(self, df: pd.DataFrame, subsets: List[List[T]]):
# we consider all subset ids rather than df ids because an id might be
# in multiple subsets. In such a case we have to copy a row in the df.
flattened_subsets = [x for subset in subsets for x in subset]
df = df[flattened_subsets]
ids = list(df)
# creating the design vector according to the subsets
design_vector = [None] * len(ids)
subsets_copy = deepcopy(subsets)
for i, id in enumerate(ids):
for j, subset in enumerate(subsets_copy):
subset.index(id) # raises an Exception if not found
design_vector[i] = str(j + 1)
except ValueError:
assert j != len(subsets_copy) - 1
assert None not in design_vector
# create group names
groups = ['group{}'.format(i + 1) for i in list(range(len(subsets)))]
# create a string for each pairwise comparison of the groups
comparisons = []
for i in reversed(range(len(subsets))):
for j in range(i):
comparisons.append('group{}-group{}'.format(i+1, j+1))
# fitting according to limma doc Chapter 8: Linear Models Overview
r_form = R.Formula('~ 0+factor(c({}))'.format(','.join(design_vector)))
r_design = r['model.matrix'](r_form)
r_design.colnames = R.StrVector(groups)
r_data = pandas2ri.py2ri(df)
# the next two lines are necessary if column ids are not unique, because
# the python to r transformation drops those columns
r_ids = R.StrVector(['X{}'.format(id) for id in ids])
r_data = r_data.rx(r_ids)
r_fit = r['lmFit'](r_data, r_design)
r_contrast_matrix = r['makeContrasts'](*comparisons, levels=r_design)
r_fit_2 = r[''](r_fit, r_contrast_matrix)
r_fit_2 = r['eBayes'](r_fit_2)
r_results = r['topTable'](r_fit_2, number=float('inf'), sort='none')
results = pandas2ri.ri2py(r_results)
return results
\ No newline at end of file
......@@ -21,7 +21,8 @@ setup(
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment