GitLab is now using https://gitlab.lcsb.uni.lu as its primary address. Please update your bookmarks. FAQ.

Commit fa7b4cbc authored by Sascha Herzinger's avatar Sascha Herzinger
Browse files

adding histogram statistics

parent 17b809d5
Pipeline #5926 passed with stages
in 37 minutes and 14 seconds
"""This module contains several statistics necessary for creating a
histogram."""
import logging
from typing import List
import pandas as pd
import numpy as np
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.shared import utils
logger = logging.getLogger(__name__)
class HistogramTask(AnalyticTask):
    """Histogram Analysis Task implementing AnalyticsTask. This class is a
    submittable celery task."""

    name = 'compute-histogram'

    def main(self,
             id_filter: List[str],
             subsets: List[List[str]],
             data: pd.DataFrame,
             categories: List[pd.DataFrame]) -> dict:
        """Compute histogram bins and basic statistics per category/subset.

        :param id_filter: If specified use only given ids during the analysis.
        :param subsets: List of lists of subset ids.
        :param data: Numerical values to create histogram of.
        :param categories: The groups to split the values into.
        :return: ``{'stats': {category: {subset: {...}}}}`` where the inner
            dict holds 'hist', 'bin_edges', 'mean', 'median' and 'variance'.
            Category/subset combinations without any value are left out.
        :raises ValueError: If no rows remain after dropping missing values.
        """
        # dropna() without inplace returns a new frame, so the DataFrame
        # owned by the caller is not modified.
        df = data.dropna()
        # Drop the local reference to the unfiltered frame.
        del data
        if df.shape[0] == 0:
            error = 'The selected numerical variable must be non-empty.'
            # No exception is being handled at this point, so
            # logger.exception() would append a "NoneType: None" traceback.
            logger.error(error)
            raise ValueError(error)
        df = utils.apply_id_filter(df=df, id_filter=id_filter)
        df = utils.apply_subsets(df=df, subsets=subsets)
        df = utils.apply_categories(df=df, categories=categories)
        # NOTE(review): the 'subset' and 'category' columns read below are
        # presumably added by the two helpers above -- confirm in utils.
        stats = {}
        category_names = df['category'].unique().tolist()
        subset_ids = df['subset'].unique().tolist()
        for category in category_names:
            for subset in subset_ids:
                in_group = ((df['category'] == category) &
                            (df['subset'] == subset))
                values = df.loc[in_group, 'value']
                if values.empty:
                    # This category does not occur in this subset; skip it
                    # instead of reporting all-zero bins and NaN statistics.
                    continue
                hist, bin_edges = np.histogram(values)
                stats.setdefault(category, {})[subset] = {
                    # tolist() yields built-in int/float instead of numpy
                    # scalars (numpy ints are rejected by the stdlib json
                    # encoder, for instance).
                    'hist': hist.tolist(),
                    'bin_edges': bin_edges.tolist(),
                    'mean': np.mean(values),
                    'median': np.median(values),
                    'variance': np.var(values)
                }
        return {
            'stats': stats
        }
import pytest
import pandas as pd
from fractalis.analytics.tasks.histogram.main import HistogramTask
class TestHistogramTask:
    """Tests for HistogramTask.main: output layout, missing values,
    negative values, tiny groups, empty groups and fully empty input."""

    task = HistogramTask()

    # Shorthand for a missing value in the test data.
    NAN = float('nan')

    @staticmethod
    def _frame(values, feature='foo'):
        """Return an (id, feature, value) frame with ids 100, 101, ...

        One row is created per entry of ``values``; all rows share the
        given ``feature`` label.
        """
        rows = [[100 + i, feature, value] for i, value in enumerate(values)]
        return pd.DataFrame(rows, columns=['id', 'feature', 'value'])

    @classmethod
    def _category_frame(cls):
        """Return ten rows (ids 100-109) alternating between 'A' and 'B',
        starting with 'A' for id 100."""
        return cls._frame(['A', 'B'] * 5, feature='cat')

    def test_correct_output(self):
        df = self._frame(range(1, 11))
        result = self.task.main(id_filter=[],
                                subsets=[],
                                data=df,
                                categories=[self._category_frame()])
        assert 'stats' in result
        assert 'A' in result['stats']
        assert 'B' in result['stats']
        assert 0 in result['stats']['A']
        assert all(stat in result['stats']['A'][0] for stat in
                   ['hist', 'bin_edges', 'mean', 'median', 'variance'])

    def test_can_handle_nas(self):
        nan = self.NAN
        df = self._frame([nan, 2, nan, 4, nan, 6, nan, 8, nan, 10])
        result = self.task.main(id_filter=[],
                                subsets=[],
                                data=df,
                                categories=[])
        # Only 2, 4, 6, 8, 10 are expected to contribute to the statistics.
        assert result['stats'][''][0]['median'] == 6
        assert result['stats'][''][0]['mean'] == 6
        assert result['stats'][''][0]['variance'] == 8

    def test_can_handle_negatives(self):
        df = self._frame([-2, 2, -4, 4, -6, 6, -8, 8, -10, 10])
        result = self.task.main(id_filter=[],
                                subsets=[],
                                data=df,
                                categories=[])
        assert result['stats'][''][0]['median'] == 0
        assert result['stats'][''][0]['mean'] == 0

    def test_can_handle_small_groups(self):
        nan = self.NAN
        # Category 'A' (ids 100, 102, ...) keeps a single value: 1.
        df = self._frame([1, 2, nan, 4, nan, 6, nan, 8, nan, 10])
        result = self.task.main(id_filter=[],
                                subsets=[],
                                data=df,
                                categories=[self._category_frame()])
        assert result['stats']['A'][0]['median'] == 1
        assert result['stats']['A'][0]['mean'] == 1
        assert result['stats']['A'][0]['variance'] == 0

    def test_skips_empty_groups(self):
        nan = self.NAN
        # Every row belonging to category 'A' has a missing value.
        df = self._frame([nan, 2, nan, 4, nan, 6, nan, 8, nan, 10])
        result = self.task.main(id_filter=[],
                                subsets=[],
                                data=df,
                                categories=[self._category_frame()])
        assert 'A' not in result['stats']
        assert 'B' in result['stats']

    def test_throws_error_if_all_groups_empty(self):
        df = self._frame([self.NAN] * 10)
        with pytest.raises(ValueError) as e:
            self.task.main(id_filter=[],
                           subsets=[],
                           data=df,
                           categories=[self._category_frame()])
        # Check the message after the with-block: statements following the
        # raising call inside the block never run, and ExceptionInfo itself
        # does not support the `in` operator.
        assert 'selected numerical variable must be non-empty' in str(e.value)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment