GitLab is now using https://gitlab.lcsb.uni.lu as its primary address. Please update your bookmarks. FAQ.

Commit ce61158d authored by Sascha Herzinger
Browse files

Added new parameters to histogram analysis

parent d6219fd3
Pipeline #5977 failed with stages
in 37 minutes and 38 seconds
......@@ -2,10 +2,12 @@
histogram."""
import logging
from functools import partial
from typing import List
import pandas as pd
import numpy as np
import scipy.stats
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.shared import utils
......@@ -21,11 +23,15 @@ class HistogramTask(AnalyticTask):
name = 'compute-histogram'
def main(self,
bw_factor: float,
num_bins: int,
id_filter: List[str],
subsets: List[List[str]],
data: pd.DataFrame,
categories: List[pd.DataFrame]) -> dict:
"""Compute several basic statistics such as bin size and kde.
:param bw_factor: KDE resolution.
:param num_bins: Number of bins to use for histogram.
:param id_filter: If specified use only given ids during the analysis.
:param subsets: List of lists of subset ids.
:param data: Numerical values to create histogram of.
......@@ -49,12 +55,23 @@ class HistogramTask(AnalyticTask):
sub_df = df[(df['category'] == category) &
(df['subset'] == subset)]
values = sub_df['value']
hist, bin_edges = np.histogram(values)
if values.shape[0] < 2:
continue
hist, bin_edges = np.histogram(values, bins=num_bins)
hist = hist.tolist()
bin_edges = bin_edges.tolist()
mean = np.mean(values)
median = np.median(values)
std = np.std(values)
# Bandwidth helper handed to scipy.stats.gaussian_kde as `bw_method` (bound
# with functools.partial so that `fac` is fixed to `bw_factor`).
# Returns obj.n ** (-1 / (obj.d + 4)) multiplied by `fac`.
# NOTE(review): `obj` is presumably the gaussian_kde instance, whose `n` and
# `d` attributes are the number of data points and dimensions; the unscaled
# expression then matches SciPy's Scott's-rule factor -- confirm against the
# SciPy documentation.
def bw(obj, fac):
return np.power(obj.n, -1.0 / (obj.d + 4)) * fac
kde = scipy.stats.gaussian_kde(
values, bw_method=partial(bw, fac=bw_factor))
xs = np.linspace(
start=np.min(values), stop=np.max(values), num=200)
dist = kde(xs).tolist()
if not stats.get(category):
stats[category] = {}
stats[category][subset] = {
......@@ -62,7 +79,8 @@ class HistogramTask(AnalyticTask):
'bin_edges': bin_edges,
'mean': mean,
'median': median,
'std': std
'std': std,
'dist': dist
}
return {
'stats': stats,
......
......@@ -34,6 +34,7 @@ class TestHistogramTask:
[109, 'cat', 'B']],
columns=['id', 'feature', 'value'])
result = self.task.main(id_filter=[],
bw_factor=0.5,
subsets=[],
data=df,
categories=[cat_df])
......@@ -43,7 +44,7 @@ class TestHistogramTask:
assert 'B' in result['stats']
assert 0 in result['stats']['A']
assert all([stat in result['stats']['A'][0] for stat in
['hist', 'bin_edges', 'mean', 'median', 'std']])
['hist', 'bin_edges', 'mean', 'median', 'std', 'dist']])
def test_can_handle_nas(self):
df = pd.DataFrame([[100, 'foo', float('nan')],
......@@ -58,6 +59,7 @@ class TestHistogramTask:
[109, 'foo', 10]],
columns=['id', 'feature', 'value'])
result = self.task.main(id_filter=[],
bw_factor=0.5,
subsets=[],
data=df,
categories=[])
......@@ -77,13 +79,14 @@ class TestHistogramTask:
[109, 'foo', 10]],
columns=['id', 'feature', 'value'])
result = self.task.main(id_filter=[],
bw_factor=0.5,
subsets=[],
data=df,
categories=[])
assert result['stats'][''][0]['median'] == 0
assert result['stats'][''][0]['mean'] == 0
def test_can_handle_small_groups(self):
def test_skips_small_groups(self):
df = pd.DataFrame([[100, 'foo', 1],
[101, 'foo', 2],
[102, 'foo', float('nan')],
......@@ -107,12 +110,11 @@ class TestHistogramTask:
[109, 'cat', 'B']],
columns=['id', 'feature', 'value'])
result = self.task.main(id_filter=[],
bw_factor=0.5,
subsets=[],
data=df,
categories=[cat_df])
assert result['stats']['A'][0]['median'] == 1
assert result['stats']['A'][0]['mean'] == 1
assert result['stats']['A'][0]['std'] == 0
assert 'A' not in result['stats']
def test_skips_empty_groups(self):
df = pd.DataFrame([[100, 'foo', float('nan')],
......@@ -138,6 +140,7 @@ class TestHistogramTask:
[109, 'cat', 'B']],
columns=['id', 'feature', 'value'])
result = self.task.main(id_filter=[],
bw_factor=0.5,
subsets=[],
data=df,
categories=[cat_df])
......@@ -169,6 +172,7 @@ class TestHistogramTask:
columns=['id', 'feature', 'value'])
with pytest.raises(ValueError) as e:
self.task.main(id_filter=[],
bw_factor=0.5,
subsets=[],
data=df,
categories=[cat_df])
......@@ -198,6 +202,7 @@ class TestHistogramTask:
[109, 'cat', 'B']],
columns=['id', 'feature', 'value'])
result = self.task.main(id_filter=[],
bw_factor=0.5,
subsets=[],
data=df,
categories=[cat_df])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment