Commit a7439a72 authored by Sascha Herzinger

Many small fixes and features

parent 8b89b0cb
Pipeline #5417 failed with stages
in 2 minutes and 54 seconds
variables:
PYPI_USER: SECURE
PYPI_PASS: SECURE
DOCKER_USER: SECURE
DOCKER_PASS: SECURE
PYPI_USER: secure
PYPI_PASS: secure
DOCKER_USER: secure
DOCKER_PASS: secure
DOCKER_DRIVER: overlay2
before_script:
......
......@@ -8,8 +8,7 @@ import numpy as np
import scipy.stats
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.shared.utils import \
apply_subsets, apply_categories
from fractalis.analytics.tasks.shared import utils
T = TypeVar('T')
......@@ -25,6 +24,7 @@ class BoxplotTask(AnalyticTask):
features: List[pd.DataFrame],
categories: List[pd.DataFrame],
id_filter: List[T],
transformation: str,
subsets: List[List[T]]) -> dict:
""" Compute boxplot statistics for the given parameters.
:param features: List of numerical features
......@@ -32,6 +32,7 @@ class BoxplotTask(AnalyticTask):
features.
:param id_filter: List of ids that will be considered for analysis. If
empty all ids will be used.
:param transformation: Transformation that will be applied to the data.
:param subsets: List of subsets used as another way to group the
numerical features.
"""
......@@ -40,11 +41,12 @@ class BoxplotTask(AnalyticTask):
"non empty numerical feature.")
# merge dfs into single one
df = reduce(lambda l, r: l.append(r), features)
df = utils.apply_transformation(df=df, transformation=transformation)
df.dropna(inplace=True)
if id_filter:
df = df[df['id'].isin(id_filter)]
df = apply_subsets(df=df, subsets=subsets)
df = apply_categories(df=df, categories=categories)
df = utils.apply_subsets(df=df, subsets=subsets)
df = utils.apply_categories(df=df, categories=categories)
df['outlier'] = None
results = {
'statistics': {},
......
......@@ -46,7 +46,8 @@ class CorrelationTask(AnalyticTask):
raise ValueError("Unknown method '{}'".format(method))
df = self.merge_x_y(x, y)
(x_label, y_label) = (df['feature_x'][0], df['feature_y'][0])
x_label = list(df['feature_x'])[0]
y_label = list(df['feature_y'])[0]
if id_filter:
df = df[df['id'].isin(id_filter)]
df = utils.apply_subsets(df=df, subsets=subsets)
......
......@@ -24,6 +24,7 @@ class HeatmapTask(AnalyticTask):
numericals: List[pd.DataFrame],
categoricals: List[pd.DataFrame],
ranking_method: str,
params: dict,
id_filter: List[T],
max_rows: int,
subsets: List[List[T]]) -> dict:
......@@ -58,6 +59,7 @@ class HeatmapTask(AnalyticTask):
# compute statistic for ranking
stats = array_stats.get_stats(df=df, subsets=subsets,
params=params,
ranking_method=ranking_method)
# sort by ranking_value
......
......@@ -6,6 +6,7 @@ from functools import reduce
from copy import deepcopy
import pandas as pd
import numpy as np
logger = logging.getLogger(__name__)
......@@ -89,3 +90,29 @@ def drop_unused_subset_ids(df: pd.DataFrame,
if id not in ids:
subset.remove(id)
return _subsets
def apply_transformation(df: pd.DataFrame, transformation: str) -> pd.DataFrame:
    """Apply a transformation to the value column of the data frame.

    E.g. log2 or 10^x scales. Rows whose value is exactly 0 are dropped
    before the transformation is applied; this happens for every
    transformation, including 'identity'. NaN results (e.g. the log of a
    negative number) are NOT removed here.

    :param df: Dataframe containing array data in the Fractalis format. It
        must have a 'value' column. The passed frame itself is not modified.
    :param transformation: The transformation to apply. One of 'identity',
        'log2(x)', 'log10(x)', '2^x' or '10^x'.
    :return: A new dataframe without the zero-valued rows and with a
        transformed value column.
    :raises KeyError: If the transformation name is not known.
    :raises ValueError: If the transformed values contain +Inf or -Inf.
    """
    transformations = {
        'identity': lambda x: x,
        'log2(x)': np.log2,
        'log10(x)': np.log10,
        '2^x': lambda x: np.power(2.0, x),
        '10^x': lambda x: np.power(10.0, x)
    }
    # Resolve the transformation first so that an unknown name fails before
    # any work is done on the data.
    transform = transformations[transformation]
    # Drop zeros because the log transformations map them to -Inf.
    # The explicit copy makes the column assignment below operate on an
    # independent frame instead of a slice of the caller's data, which
    # otherwise triggers pandas' SettingWithCopyWarning.
    result = df[df['value'] != 0].copy()
    result['value'] = transform(result['value'])
    if np.any(np.isinf(result['value'])):
        error = 'Found inf after transformation. Transformation "{}" should ' \
                'only be used on log scaled data.'.format(transformation)
        # No exception is being handled at this point, so log with error()
        # rather than exception(), which would append an empty traceback.
        logger.error(error)
        raise ValueError(error)
    return result
......@@ -24,6 +24,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='B',
params={},
id_filter=[],
max_rows=100,
subsets=subsets)
......@@ -43,6 +44,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='B',
params={},
id_filter=[],
max_rows=100,
subsets=subsets)
......@@ -63,6 +65,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='mean',
params={},
id_filter=[],
max_rows=100,
subsets=subsets)
......@@ -79,6 +82,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='mean',
params={},
id_filter=[],
max_rows=100,
subsets=[])
......@@ -87,6 +91,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='mean',
params={},
id_filter=[],
max_rows=100,
subsets=[[101, 102, 103, 104]])
......@@ -109,6 +114,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='B',
params={},
id_filter=[],
max_rows=100,
subsets=subsets)
......@@ -132,6 +138,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='B',
params={},
id_filter=[],
max_rows=100,
subsets=subsets)
......@@ -152,6 +159,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='mean',
params={},
id_filter=[],
max_rows=100,
subsets=subsets)
......@@ -174,6 +182,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='mean',
params={},
id_filter=[],
max_rows=2,
subsets=subsets)
......@@ -196,6 +205,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='P.Value',
params={},
id_filter=[],
max_rows=100,
subsets=subsets)
......@@ -206,6 +216,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='adj.P.Val',
params={},
id_filter=[],
max_rows=100,
subsets=subsets)
......@@ -216,6 +227,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='B',
params={},
id_filter=[],
max_rows=100,
subsets=subsets)
......@@ -226,6 +238,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='logFC',
params={},
id_filter=[],
max_rows=100,
subsets=subsets)
......@@ -237,6 +250,7 @@ class TestHeatmap:
numericals=[],
categoricals=[],
ranking_method='t',
params={},
id_filter=[],
max_rows=100,
subsets=subsets)
......
"""This module contains tests for the common module in the shared package."""
import pytest
import pandas as pd
import numpy as np
......@@ -37,3 +38,17 @@ class TestCommonTasks:
subsets = [[], [101], [103, 104], [105]]
subsets = utils.drop_unused_subset_ids(df=df, subsets=subsets)
assert subsets == [[], [101], [103], []]
def test_apply_transformation_raises_if_pos_inf(self):
    """10^x on a large value overflows to +Inf and must raise ValueError."""
    df = pd.DataFrame([[101, 'foo', 1000]],
                      columns=['id', 'feature', 'value'])
    with pytest.raises(ValueError) as e:
        utils.apply_transformation(df=df, transformation='10^x')
    # The check has to run after the with-block (statements following the
    # raising call inside it are never executed) and has to inspect the
    # message of the raised exception, not the ExceptionInfo wrapper.
    assert 'only be used on log scaled data' in str(e.value)
def test_apply_transformation_drops_zeros(self):
    """Rows whose value is 0 are removed by apply_transformation."""
    rows = [[101, 'foo', 1000], [102, 'foo', 0]]
    frame = pd.DataFrame(rows, columns=['id', 'feature', 'value'])
    # Sanity check: both rows are present before the transformation.
    rows_before = frame.shape[0]
    assert rows_before == 2
    transformed = utils.apply_transformation(df=frame,
                                             transformation='log2(x)')
    # Only the non-zero row is left.
    rows_after = transformed.shape[0]
    assert rows_after == 1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment