GitLab is now using https://gitlab.lcsb.uni.lu as its primary address. Please update your bookmarks. FAQ.

Commit 963f1a96 authored by Sascha Herzinger's avatar Sascha Herzinger
Browse files

minor refactoring

parent 0c69e7ed
Pipeline #5922 failed with stages
in 37 minutes and 47 seconds
......@@ -43,8 +43,7 @@ class BoxplotTask(AnalyticTask):
df = reduce(lambda l, r: l.append(r), features)
df = utils.apply_transformation(df=df, transformation=transformation)
df.dropna(inplace=True)
if id_filter:
df = df[df['id'].isin(id_filter)]
df = utils.apply_id_filter(df=df, id_filter=id_filter)
df = utils.apply_subsets(df=df, subsets=subsets)
df = utils.apply_categories(df=df, categories=categories)
df['outlier'] = None
......
......@@ -48,8 +48,7 @@ class CorrelationTask(AnalyticTask):
df = self.merge_x_y(x, y)
x_label = list(df['feature_x'])[0]
y_label = list(df['feature_y'])[0]
if id_filter:
df = df[df['id'].isin(id_filter)]
df = utils.apply_id_filter(df=df, id_filter=id_filter)
df = utils.apply_subsets(df=df, subsets=subsets)
df = utils.apply_categories(df=df, categories=categories)
global_stats = self.compute_stats(df, method)
......
......@@ -38,8 +38,7 @@ class HeatmapTask(AnalyticTask):
flattened_subsets = [x for subset in subsets for x in subset]
df = df[df['id'].isin(flattened_subsets)]
# apply id filter
if id_filter:
df = df[df['id'].isin(id_filter)]
df = utils.apply_id_filter(df=df, id_filter=id_filter)
# drop subset ids that are not in the df
subsets = utils.drop_unused_subset_ids(df=df, subsets=subsets)
# make sure the input data are still valid after the pre-processing
......
......@@ -41,8 +41,7 @@ class PCATask(AnalyticTask):
feature_labels = list(df)
# apply id filter
if id_filter:
df = df[df.index.isin(id_filter)]
df = utils.apply_id_filter(df=df, id_filter=id_filter)
# save ids so we can re-assign them after pca
ids = df.index.tolist()
......
......@@ -10,11 +10,10 @@ import numpy as np
logger = logging.getLogger(__name__)
T = TypeVar('T')
def apply_subsets(df: pd.DataFrame,
subsets: List[List[T]]) -> pd.DataFrame:
subsets: List[List[str]]) -> pd.DataFrame:
"""Build a new DataFrame that contains a new column 'subset' defining
the subset the data point belongs to. If a data point belongs to
multiple subsets then the row is duplicated.
......@@ -75,8 +74,18 @@ def apply_categories(df: pd.DataFrame,
return df
def apply_id_filter(df: pd.DataFrame, id_filter: List[str]) -> pd.DataFrame:
    """Keep only rows whose id is in id_filter. If id_filter is empty keep all.

    The ids are looked up in the 'id' column when the frame has one.
    Frames that carry their ids in the index instead (no 'id' column) are
    filtered on the index, so they do not fail with a KeyError.

    :param df: Dataframe containing array data in the Fractalis format.
    :param id_filter: List of ids to keep. Empty (or None) disables filtering.
    :return: The filtered DataFrame, or df itself if no filter was given.
    """
    # Nothing to filter on: hand back the very same object, untouched.
    if not id_filter:
        return df
    if 'id' in df.columns:
        return df[df['id'].isin(id_filter)]
    # No 'id' column: the ids are expected to live in the index.
    return df[df.index.isin(id_filter)]
def drop_unused_subset_ids(df: pd.DataFrame,
subsets: List[List[T]]) -> List[List[T]]:
subsets: List[List[str]]) -> List[List[str]]:
"""Drop subset ids that are not present in the given data
:param df: Dataframe containing array data in the Fractalis format.
:param subsets: Subset groups specified by the user.
......
......@@ -39,8 +39,7 @@ class SurvivalTask(AnalyticTask):
df = durations[0]
df.dropna(inplace=True)
if id_filter:
df = df[df['id'].isin(id_filter)]
df = utils.apply_id_filter(df=df, id_filter=id_filter)
df = utils.apply_subsets(df=df, subsets=subsets)
df = utils.apply_categories(df=df, categories=categories)
......
......@@ -35,8 +35,7 @@ class VolcanoTask(AnalyticTask):
flattened_subsets = [x for subset in subsets for x in subset]
df = df[df['id'].isin(flattened_subsets)]
# apply id filter
if id_filter:
df = df[df['id'].isin(id_filter)]
df = utils.apply_id_filter(df=df, id_filter=id_filter)
# drop subset ids that are not in the df
subsets = utils.drop_unused_subset_ids(df=df, subsets=subsets)
# make sure the input data are still valid after the pre-processing
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment