Commit 963f1a96 authored by Sascha Herzinger

minor refactoring

parent 0c69e7ed
Pipeline #5922 failed with stages in 37 minutes and 47 seconds
@@ -43,8 +43,7 @@ class BoxplotTask(AnalyticTask):
         df = reduce(lambda l, r: l.append(r), features)
         df = utils.apply_transformation(df=df, transformation=transformation)
         df.dropna(inplace=True)
-        if id_filter:
-            df = df[df['id'].isin(id_filter)]
+        df = utils.apply_id_filter(df=df, id_filter=id_filter)
         df = utils.apply_subsets(df=df, subsets=subsets)
         df = utils.apply_categories(df=df, categories=categories)
         df['outlier'] = None

@@ -48,8 +48,7 @@ class CorrelationTask(AnalyticTask):
         df = self.merge_x_y(x, y)
         x_label = list(df['feature_x'])[0]
         y_label = list(df['feature_y'])[0]
-        if id_filter:
-            df = df[df['id'].isin(id_filter)]
+        df = utils.apply_id_filter(df=df, id_filter=id_filter)
         df = utils.apply_subsets(df=df, subsets=subsets)
         df = utils.apply_categories(df=df, categories=categories)
         global_stats = self.compute_stats(df, method)

@@ -38,8 +38,7 @@ class HeatmapTask(AnalyticTask):
         flattened_subsets = [x for subset in subsets for x in subset]
         df = df[df['id'].isin(flattened_subsets)]
         # apply id filter
-        if id_filter:
-            df = df[df['id'].isin(id_filter)]
+        df = utils.apply_id_filter(df=df, id_filter=id_filter)
         # drop subset ids that are not in the df
         subsets = utils.drop_unused_subset_ids(df=df, subsets=subsets)
         # make sure the input data are still valid after the pre-processing

@@ -41,8 +41,7 @@ class PCATask(AnalyticTask):
         feature_labels = list(df)
         # apply id filter
-        if id_filter:
-            df = df[df.index.isin(id_filter)]
+        df = utils.apply_id_filter(df=df, id_filter=id_filter)
         # save ids so we can re-assign them after pca
         ids = df.index.tolist()

@@ -10,11 +10,10 @@ import numpy as np
 logger = logging.getLogger(__name__)
 
-T = TypeVar('T')
 
 def apply_subsets(df: pd.DataFrame,
-                  subsets: List[List[T]]) -> pd.DataFrame:
+                  subsets: List[List[str]]) -> pd.DataFrame:
     """Build a new DataFrame that contains a new column 'subset' defining
     the subset the data point belongs to. If a data point belongs to
     multiple subsets then the row is duplicated.
@@ -75,8 +74,18 @@ def apply_categories(df: pd.DataFrame,
     return df
 
 
+def apply_id_filter(df: pd.DataFrame, id_filter: List[str]) -> pd.DataFrame:
+    """Keep only rows where id is in id_filter. If id_filter is empty keep all.
+    :param df: Dataframe containing array data in the Fractalis format.
+    :param id_filter: List of ids to keep.
+    """
+    if id_filter:
+        df = df[df['id'].isin(id_filter)]
+    return df
+
+
 def drop_unused_subset_ids(df: pd.DataFrame,
-                           subsets: List[List[T]]) -> List[List[T]]:
+                           subsets: List[List[str]]) -> List[List[str]]:
     """Drop subset ids that are not present in the given data
     :param df: Dataframe containing array data in the Fractalis format.
     :param subsets: Subset groups specified by the user.

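The new helper can be exercised in isolation. Below is a minimal sketch that inlines the apply_id_filter logic added above and runs it on a toy DataFrame; the 'id'/'value' column layout of the toy data is an assumption for illustration only, and the tasks themselves call utils.apply_id_filter(df=df, id_filter=id_filter) on their pre-processed frames.

from typing import List

import pandas as pd


def apply_id_filter(df: pd.DataFrame, id_filter: List[str]) -> pd.DataFrame:
    # Same logic as the helper added in this commit: an empty filter keeps all rows.
    if id_filter:
        df = df[df['id'].isin(id_filter)]
    return df


# Toy frame for illustration only (column names are assumptions).
df = pd.DataFrame({'id': ['s1', 's2', 's3'], 'value': [1.0, 2.0, 3.0]})

print(apply_id_filter(df, id_filter=['s1', 's3']))  # keeps rows with id s1 and s3
print(apply_id_filter(df, id_filter=[]))            # empty filter -> frame unchanged
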
@@ -39,8 +39,7 @@ class SurvivalTask(AnalyticTask):
         df = durations[0]
         df.dropna(inplace=True)
-        if id_filter:
-            df = df[df['id'].isin(id_filter)]
+        df = utils.apply_id_filter(df=df, id_filter=id_filter)
         df = utils.apply_subsets(df=df, subsets=subsets)
         df = utils.apply_categories(df=df, categories=categories)

@@ -35,8 +35,7 @@ class VolcanoTask(AnalyticTask):
         flattened_subsets = [x for subset in subsets for x in subset]
         df = df[df['id'].isin(flattened_subsets)]
         # apply id filter
-        if id_filter:
-            df = df[df['id'].isin(id_filter)]
+        df = utils.apply_id_filter(df=df, id_filter=id_filter)
         # drop subset ids that are not in the df
         subsets = utils.drop_unused_subset_ids(df=df, subsets=subsets)
         # make sure the input data are still valid after the pre-processing