main.py 3.63 KB
Newer Older
1
2
3
4
5
6
7
"""Module containing the Celery task for Boxplot statistics."""

from typing import List, TypeVar
from functools import reduce

import pandas as pd
import numpy as np
8
import scipy.stats
9
10

from fractalis.analytics.task import AnalyticTask
11
12
from fractalis.analytics.tasks.shared.utils import \
    apply_subsets, apply_categories
13
14
15
16
17
18
19
20
21
22
23
24


T = TypeVar('T')


class BoxplotTask(AnalyticTask):
    """Boxplot Analysis Task implementing AnalyticsTask. This class is a
    submittable celery task."""

    name = 'compute-boxplot'

    def main(self,
             features: List[pd.DataFrame],
             categories: List[pd.DataFrame],
             id_filter: List[T],
             subsets: List[List[T]]) -> dict:
        """Compute boxplot statistics for the given parameters.

        :param features: List of numerical features
        :param categories: List of categorical features used to group numerical
        features.
        :param id_filter: List of ids that will be considered for analysis. If
        empty all ids will be used.
        :param subsets: List of subsets used as another way to group the
        numerical features.
        :return: A dictionary with the keys 'data' (the merged data frame
        serialized as JSON records), 'statistics' (boxplot statistics and KDE
        samples keyed by '<feature>//<category>//s<subset + 1>'), 'features',
        'categories' and 'subsets' (the unique values of the respective
        columns).
        :raises ValueError: If no feature was given.
        """
        if not features:
            raise ValueError("Must at least specify one "
                             "non empty numerical feature.")
        # Merge dfs into a single one. pd.concat replaces the former chain of
        # DataFrame.append calls, which no longer exists in pandas >= 2.0.
        df = pd.concat(features)
        if id_filter:
            df = df[df['id'].isin(id_filter)]
        df = apply_subsets(df=df, subsets=subsets)
        df = apply_categories(df=df, categories=categories)
        results = {
            'data': df.to_json(orient='records'),
            'statistics': {},
            'features': df['feature'].unique().tolist(),
            'categories': df['category'].unique().tolist(),
            'subsets': df['subset'].unique().tolist()
        }
        for feature in results['features']:
            for subset in results['subsets']:
                for category in results['categories']:
                    group = df[(df['subset'] == subset) &
                               (df['category'] == category) &
                               (df['feature'] == feature)]
                    # Missing values carry no information for the statistics.
                    values = group['value'].dropna().tolist()
                    # Quartiles and a density estimate need >= 2 data points.
                    if len(values) < 2:
                        continue
                    stats = self.boxplot_statistics(values)
                    # NOTE(review): gaussian_kde raises LinAlgError if all
                    # values of a group are identical -- confirm whether such
                    # groups should be skipped or reported without a KDE.
                    kde = scipy.stats.gaussian_kde(values)
                    # Sample the density only between the two whiskers.
                    xs = np.linspace(start=stats['l_wsk'],
                                     stop=stats['u_wsk'], num=100)
                    stats['kde'] = kde(xs).tolist()
                    label = '{}//{}//s{}'.format(feature, category, subset + 1)
                    results['statistics'][label] = stats
        return results

    @staticmethod
    def boxplot_statistics(values: List[float]) -> dict:
        """Compute boxplot statistics for the given values.

        The given list is not modified.

        :param values: A one dimensional list of numbers.
        :return: A dictionary containing all important boxplot statistics:
        'l_qrt', 'median', 'u_qrt' (25th, 50th and 75th percentile) and
        'l_wsk', 'u_wsk' (lower and upper whisker).
        :raises ValueError: If values is empty.
        """
        if len(values) == 0:
            raise ValueError("Cannot compute boxplot statistics "
                             "for an empty list of values.")
        l_qrt, median, u_qrt = np.percentile(values, [25, 50, 75])
        iqr = u_qrt - l_qrt
        lower_fence = l_qrt - 1.5 * iqr
        upper_fence = u_qrt + 1.5 * iqr
        # lower whisker as defined by John W. Tukey: smallest observation that
        # is still within 1.5 * IQR below the lower quartile
        l_wsk = min(value for value in values if value >= lower_fence)
        # upper whisker as defined by John W. Tukey: largest observation that
        # is still within 1.5 * IQR above the upper quartile
        u_wsk = max(value for value in values if value <= upper_fence)
        return {
            'l_qrt': l_qrt,
            'median': median,
            'u_qrt': u_qrt,
            'l_wsk': l_wsk,
            'u_wsk': u_wsk
        }