"""This module provides tests for the heatmap analysis main module."""

import json

import pytest
import pandas as pd
import numpy as np

from fractalis.analytics.tasks.heatmap.main import HeatmapTask


# noinspection PyMissingTypeHints
class TestHeatmap:
    """Functional tests for ``HeatmapTask.main``.

    Every test feeds ``main`` one or more long-format data frames with the
    columns ``id``, ``feature`` and ``value`` and inspects the returned
    mapping, which the tests index with the keys ``'data'`` and ``'stats'``.
    """

    # A single task instance is shared by all tests of this class.
    task = HeatmapTask()

    @staticmethod
    def _frame(rows):
        """Build a fixture frame from ``[id, feature, value]`` rows."""
        return pd.DataFrame(rows, columns=['id', 'feature', 'value'])

    @classmethod
    def _foo_bar_frame(cls):
        """Return samples 101-104, each with a 'foo' and a 'bar' value."""
        return cls._frame([[101, 'foo', 5], [101, 'bar', 6],
                           [102, 'foo', 10], [102, 'bar', 11],
                           [103, 'foo', 15], [103, 'bar', 16],
                           [104, 'foo', 20], [104, 'bar', 21]])

    @classmethod
    def _two_overlapping_frames(cls):
        """Return two frames whose ids and features only partly overlap."""
        return [
            cls._foo_bar_frame(),
            cls._frame([[101, 'baz', 10], [102, 'baz', 11],
                        [105, 'foo', 20], [105, 'baz', 21],
                        [106, 'bar', 15]]),
        ]

    @classmethod
    def _constant_feature_frame(cls):
        """Return features A-D, each constant across samples 101 and 102."""
        return cls._frame([[101, 'A', 5], [102, 'A', 5],
                           [101, 'B', 2], [102, 'B', 2],
                           [101, 'C', 8], [102, 'C', 8],
                           [101, 'D', 10], [102, 'D', 10]])

    def _run(self, numerical_arrays, subsets, ranking_method, max_rows=100):
        """Call ``task.main``, passing empty lists for the unused arguments."""
        return self.task.main(numerical_arrays=numerical_arrays,
                              numericals=[],
                              categoricals=[],
                              ranking_method=ranking_method,
                              id_filter=[],
                              max_rows=max_rows,
                              subsets=subsets)

    def test_functional(self):
        """The result of a two-subset run has 'data' and 'stats' entries."""
        result = self._run(numerical_arrays=[self._foo_bar_frame()],
                           subsets=[[101, 102], [103, 104]],
                           ranking_method='B')
        assert 'data' in result
        assert 'stats' in result

    def test_functional_with_nans_and_missing(self):
        """Both features get identical statistics despite NaN/missing cells.

        Sample 102 has no 'bar' row and sample 103 has a NaN 'foo' value.
        """
        numerical_arrays = [
            self._frame([[101, 'foo', 5], [101, 'bar', 5],
                         [102, 'foo', 10],
                         [103, 'foo', float('nan')], [103, 'bar', 15],
                         [104, 'foo', 20], [104, 'bar', 20]])
        ]
        result = self._run(numerical_arrays=numerical_arrays,
                           subsets=[[101, 102], [103, 104]],
                           ranking_method='B')
        stats = result['stats']
        for stat in stats:
            # 'feature' and 'AveExpr' are excluded from the comparison.
            if stat not in ('feature', 'AveExpr'):
                assert stats[stat][0] == stats[stat][1]

    def test_main_raises_if_invalid_data(self):
        """``main`` raises ValueError when no subset id occurs in the data."""
        subsets = [[1, 2, 3, 4]]  # does not match sample colnames
        with pytest.raises(ValueError) as e:
            self._run(numerical_arrays=[self._foo_bar_frame()],
                      subsets=subsets,
                      ranking_method='mean')
        # The message check must live outside the ``with`` block: statements
        # placed after the raising call inside the block are never executed.
        # ``e`` is an ExceptionInfo wrapper, so the message is on ``e.value``.
        # NOTE(review): the expected substring is taken from the original,
        # previously unreachable assertion - confirm it matches the message
        # raised by HeatmapTask.main.
        assert 'data set is too small' in str(e.value)

    def test_empty_subset_equals_full_subset(self):
        """No subsets gives the same result as one subset with every id."""
        numerical_arrays = [self._foo_bar_frame()]
        # The very same frame objects are passed to both calls on purpose.
        result_1 = self._run(numerical_arrays=numerical_arrays,
                             subsets=[],
                             ranking_method='mean')
        result_2 = self._run(numerical_arrays=numerical_arrays,
                             subsets=[[101, 102, 103, 104]],
                             ranking_method='mean')
        assert result_1 == result_2

    def test_multiple_numerical_array_data(self):
        """Several input frames are accepted and produce 'data' and 'stats'."""
        result = self._run(numerical_arrays=self._two_overlapping_frames(),
                           subsets=[[101, 102, 106], [103, 104, 105]],
                           ranking_method='B')
        assert 'data' in result
        assert 'stats' in result

    def test_zscore_is_not_nan_if_data_misses_values(self):
        """The minimum of the 'zscore' column is not NaN for sparse input."""
        result = self._run(numerical_arrays=self._two_overlapping_frames(),
                           subsets=[[101, 102, 106], [103, 104, 105]],
                           ranking_method='B')
        data = pd.DataFrame(result['data'])
        # NOTE(review): np.min on a pandas Series presumably delegates to
        # Series.min, which skips NaN by default, so this would only fail if
        # every zscore were NaN - confirm whether a stricter check is wanted.
        assert not np.isnan(np.min(data['zscore']))

    def test_results_are_sorted(self):
        """Rows and stats follow the order D, C, A, B under 'mean' ranking.

        That order corresponds to descending feature values (10, 8, 5, 2).
        """
        result = self._run(numerical_arrays=[self._constant_feature_frame()],
                           subsets=[],
                           ranking_method='mean')
        data = pd.DataFrame(result['data'])
        feature_col = data['feature'].tolist()
        # The feature sequence appears twice in 'data' (two samples in the
        # fixture).
        assert ['D', 'C', 'A', 'B', 'D', 'C', 'A', 'B'] == feature_col
        assert ['D', 'C', 'A', 'B'] == result['stats']['feature']

    def test_max_rows_works(self):
        """``max_rows=2`` keeps only the two top-ranked features, D and C."""
        result = self._run(numerical_arrays=[self._constant_feature_frame()],
                           subsets=[],
                           ranking_method='mean',
                           max_rows=2)
        data = pd.DataFrame(result['data'])
        feature_col = data['feature'].tolist()
        assert ['D', 'C', 'D', 'C'] == feature_col
        assert result['stats']['feature'] == ['D', 'C']