Commit be9f918c authored by Sascha Herzinger
Browse files

Fixed a very critical bug in array_stats that messed with the statistics

parent b09f0529
Pipeline #5290 failed with stages
in 2 minutes and 50 seconds
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
from copy import deepcopy from copy import deepcopy
from typing import List, TypeVar from typing import List, TypeVar
from collections import OrderedDict
import logging import logging
import pandas as pd import pandas as pd
...@@ -115,7 +116,7 @@ def get_limma_stats(df: pd.DataFrame, subsets: List[List[T]]) -> pd.DataFrame: ...@@ -115,7 +116,7 @@ def get_limma_stats(df: pd.DataFrame, subsets: List[List[T]]) -> pd.DataFrame:
r_data = pandas2ri.py2ri(df) r_data = pandas2ri.py2ri(df)
# py2ri is stupid and makes too many assumptions. # py2ri is stupid and makes too many assumptions.
# These two lines restore the column order # These two lines restore the column order
r_data.colnames = list(set(ids)) r_data.colnames = list(OrderedDict.fromkeys(ids))
r_data = r_data.rx(robj.StrVector(ids)) r_data = r_data.rx(robj.StrVector(ids))
r_fit = r['lmFit'](r_data, r_design) r_fit = r['lmFit'](r_data, r_design)
...@@ -160,13 +161,13 @@ def get_deseq2_stats(df: pd.DataFrame, ...@@ -160,13 +161,13 @@ def get_deseq2_stats(df: pd.DataFrame,
df = df[flattened_subsets] df = df[flattened_subsets]
# filter rows with too few reads # filter rows with too few reads
total_row_counts = df.sum(axis=1) total_row_counts = df.sum(axis=1)
keep = total_row_counts[total_row_counts > min_total_row_count].index keep = total_row_counts[total_row_counts >= min_total_row_count].index
df = df.loc[keep] df = df.loc[keep]
# pandas df -> R df # pandas df -> R df
r_count_data = pandas2ri.py2ri(df) r_count_data = pandas2ri.py2ri(df)
# py2ri is stupid and makes too many assumptions. # py2ri is stupid and makes too many assumptions.
# These two lines restore the column order # These two lines restore the column order
r_count_data.colnames = list(set(flattened_subsets)) r_count_data.colnames = list(OrderedDict.fromkeys(flattened_subsets))
r_count_data = r_count_data.rx(robj.StrVector(flattened_subsets)) r_count_data = r_count_data.rx(robj.StrVector(flattened_subsets))
# see package documentation # see package documentation
...@@ -183,5 +184,5 @@ def get_deseq2_stats(df: pd.DataFrame, ...@@ -183,5 +184,5 @@ def get_deseq2_stats(df: pd.DataFrame,
# R result table to Python pandas # R result table to Python pandas
r_res = r['as.data.frame'](r_res) r_res = r['as.data.frame'](r_res)
results = pandas2ri.ri2py(r_res) results = pandas2ri.ri2py(r_res)
results.insert(0, 'feature', list(r['row.names'](r_res)))
return results return results
...@@ -44,17 +44,13 @@ class VolcanoTask(AnalyticTask): ...@@ -44,17 +44,13 @@ class VolcanoTask(AnalyticTask):
"the subset sample ids do not match the data." "the subset sample ids do not match the data."
logger.error(error) logger.error(error)
raise ValueError(error) raise ValueError(error)
# make matrix of input data # make matrix of input data
df = df.pivot(index='feature', columns='id', values='value') df = df.pivot(index='feature', columns='id', values='value')
features = list(df.index)
# compute the stats (p / fC) for the selected ranking method # compute the stats (p / fC) for the selected ranking method
stats = array_stats.get_stats(df=df, stats = array_stats.get_stats(df=df,
subsets=subsets, subsets=subsets,
params=params, params=params,
ranking_method=ranking_method) ranking_method=ranking_method)
return { return {
'features': features,
'stats': stats.to_dict(orient='list') 'stats': stats.to_dict(orient='list')
} }
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment