"""This module contains the tests for the Boxplot analysis code."""

import json

import numpy as np
import pandas as pd

from fractalis.analytics.tasks.boxplot.main import BoxplotTask


# noinspection PyMissingOrEmptyDocstring,PyMissingTypeHints
class TestBoxplotAnalytics:
    task = BoxplotTask()

    def test_correct_output(self):
        """Check the structure and basic values of the boxplot result.

        Two features, 'foo' and 'bar', each carrying the values 1 to 4, are
        passed to the task without categories, id filter or subsets. The
        result must be JSON serializable and contain the 'data',
        'statistics' and 'anova' entries verified below.
        """
        df_1 = pd.DataFrame([[101, 'foo', 1], [102, 'foo', 2], [103, 'foo', 3],
                             [104, 'foo', 4]],
                            columns=['id', 'feature', 'value'])
        df_2 = pd.DataFrame([[101, 'bar', 1], [102, 'bar', 2], [103, 'bar', 3],
                             [104, 'bar', 4]],
                            columns=['id', 'feature', 'value'])
        results = self.task.main(features=[df_1, df_2],
                                 categories=[],
                                 id_filter=[],
                                 subsets=[])
        json.dumps(results)  # check if result is json serializable
        assert 'data' in results
        assert 'statistics' in results
        assert 'anova' in results
        # Both features hold the same values, so the test expects p == 1 and
        # F == 0. These are floating point results, hence the comparison uses
        # a tolerance rather than exact equality.
        assert np.isclose(results['anova']['p_value'], 1)
        assert np.isclose(results['anova']['f_value'], 0)
        # 2 features x 4 rows each
        assert len(json.loads(results['data'])) == 8
        assert len(results['statistics']) == 2
        assert 'foo////s1' in results['statistics']
        assert 'bar////s1' in results['statistics']
        stats = results['statistics']['foo////s1']
        # Every boxplot statistic must have been computed for the group.
        assert not np.isnan(stats['median'])
        assert not np.isnan(stats['l_qrt'])
        assert not np.isnan(stats['u_qrt'])
        assert not np.isnan(stats['l_wsk'])
        assert not np.isnan(stats['u_wsk'])
    def test_marks_outliers(self):
        """Check that extreme values are flagged in the 'outlier' column.

        'foo' contains the extremes -50 and 100, 'bar' contains the extreme
        100; all other values lie between 1 and 3. The test expects exactly
        those three rows to be marked as outliers, in input order.
        """
        df_1 = pd.DataFrame([[100, 'foo', -50],
                             [101, 'foo', 1],
                             [102, 'foo', 2],
                             [103, 'foo', 3],
                             [104, 'foo', 100]],
                            columns=['id', 'feature', 'value'])
        df_2 = pd.DataFrame([[201, 'bar', 1],
                             [202, 'bar', 2],
                             [203, 'bar', 3],
                             [204, 'bar', 100]],
                            columns=['id', 'feature', 'value'])
        results = self.task.main(features=[df_1, df_2], categories=[],
                                 id_filter=[], subsets=[])
        df = pd.DataFrame.from_dict(json.loads(results['data']))
        # Compare plain lists so that a failure shows which row differs and a
        # length mismatch is reported as a failed assertion.
        assert df['outlier'].tolist() == [True, False, False, False, True,
                                          False, False, False, True]
    def test_can_handle_nan(self):
        """A NaN among the feature values must not break the statistics.

        One of the five 'foo' rows carries NaN. The expected median of 2 is
        the median of the four remaining values (-50, 1, 3, 100).
        """
        nan = float('nan')
        rows = [
            [100, 'foo', -50],
            [101, 'foo', 1],
            [102, 'foo', nan],
            [103, 'foo', 3],
            [104, 'foo', 100],
        ]
        feature = pd.DataFrame(rows, columns=['id', 'feature', 'value'])
        output = self.task.main(features=[feature], categories=[],
                                id_filter=[], subsets=[])
        foo_stats = output['statistics']['foo////s1']
        assert foo_stats['median'] == 2

    def test_can_handle_groups_with_only_nan(self):
        """A category group made up solely of NaN values yields no statistics.

        Ids 102 and 105 are the only 'male' entries and both carry NaN for
        'foo', so only the 'female' group is expected among the statistics.
        """
        columns = ['id', 'feature', 'value']
        nan = float('nan')
        feature_rows = [
            [100, 'foo', -50],
            [101, 'foo', 1],
            [102, 'foo', nan],
            [103, 'foo', 3],
            [104, 'foo', 100],
            [105, 'foo', nan],
        ]
        gender_by_id = {
            100: 'female',
            101: 'female',
            102: 'male',
            103: 'female',
            104: 'female',
            105: 'male',
        }
        gender_rows = [[patient_id, 'gender', gender]
                       for patient_id, gender in gender_by_id.items()]
        feature = pd.DataFrame(feature_rows, columns=columns)
        gender = pd.DataFrame(gender_rows, columns=columns)
        output = self.task.main(features=[feature], categories=[gender],
                                id_filter=[], subsets=[])
        statistics = output['statistics']
        assert 'foo//female//s1' in statistics
        assert 'foo//male//s1' not in statistics