# test_main.py
"""This module provides tests for the heatmap analysis main module."""

import pytest
import pandas as pd
5
import numpy as np
6
7
8
9
10
11
12
13
14

from fractalis.analytics.tasks.heatmap.main import HeatmapTask


# noinspection PyMissingTypeHints
class TestHeatmap:
    """Tests for ``HeatmapTask.main``.

    Every test feeds one or more long-format data frames with the columns
    ``id``, ``feature`` and ``value`` into the task and inspects the
    ``data`` and/or ``stats`` entries of the returned result.
    """

    task = HeatmapTask()

    def _run_task(self, numerical_arrays, subsets, ranking_method,
                  max_rows=100):
        """Call ``self.task.main`` with the arguments shared by all tests.

        ``numericals``, ``categoricals`` and ``id_filter`` are always passed
        as fresh empty lists; the remaining arguments are forwarded as-is.

        :param numerical_arrays: List of data frames handed to the task.
        :param subsets: List of id lists (may be empty).
        :param ranking_method: Name of the ranking criterion.
        :param max_rows: Value forwarded as ``max_rows``.
        :return: Whatever ``self.task.main`` returns.
        """
        return self.task.main(numerical_arrays=numerical_arrays,
                              numericals=[],
                              categoricals=[],
                              ranking_method=ranking_method,
                              id_filter=[],
                              max_rows=max_rows,
                              subsets=subsets)

    def test_functional(self):
        """The result contains 'data' and 'stats' for two subsets."""
        numerical_arrays = [
            pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6], [102, 'foo', 10],
                          [102, 'bar', 11], [103, 'foo', 15], [103, 'bar', 16],
                          [104, 'foo', 20], [104, 'bar', 21]],
                         columns=['id', 'feature', 'value'])
        ]
        subsets = [[101, 102], [103, 104]]
        result = self._run_task(numerical_arrays=numerical_arrays,
                                subsets=subsets,
                                ranking_method='B')
        assert 'data' in result
        assert 'stats' in result

    def test_functional_with_nans_and_missing(self):
        """A NaN value and a missing row are handled alike.

        Sample 102 has no 'bar' row and sample 103 has NaN for 'foo'. Apart
        from 'feature' and 'AveExpr', the first two entries of every stats
        column must be equal.
        """
        numerical_arrays = [
            pd.DataFrame([[101, 'foo', 5], [101, 'bar', 5],
                          [102, 'foo', 10],
                          [103, 'foo', float('nan')], [103, 'bar', 15],
                          [104, 'foo', 20], [104, 'bar', 20]],
                         columns=['id', 'feature', 'value'])
        ]
        subsets = [[101, 102], [103, 104]]
        result = self._run_task(numerical_arrays=numerical_arrays,
                                subsets=subsets,
                                ranking_method='B')
        for stat, values in result['stats'].items():
            if stat not in ('feature', 'AveExpr'):
                assert values[0] == values[1]

    def test_main_raises_if_invalid_data(self):
        """A ValueError is raised if no subset id occurs in the data."""
        numerical_arrays = [
            pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6], [102, 'foo', 10],
                          [102, 'bar', 11], [103, 'foo', 15], [103, 'bar', 16],
                          [104, 'foo', 20], [104, 'bar', 21]],
                         columns=['id', 'feature', 'value'])
        ]
        subsets = [[1, 2, 3, 4]]  # does not match sample colnames
        with pytest.raises(ValueError) as excinfo:
            self._run_task(numerical_arrays=numerical_arrays,
                           subsets=subsets,
                           ranking_method='mean')
        # The message check must live outside the ``with`` block: statements
        # after the raising call inside the block are never executed. The
        # message itself is read from the captured exception instance.
        assert 'data set is too small' in str(excinfo.value)

    def test_empty_subset_equals_full_subset(self):
        """No subsets gives the same result as one subset with all ids."""
        numerical_arrays = [
            pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6], [102, 'foo', 10],
                          [102, 'bar', 11], [103, 'foo', 15], [103, 'bar', 16],
                          [104, 'foo', 20], [104, 'bar', 21]],
                         columns=['id', 'feature', 'value'])
        ]
        result_1 = self._run_task(numerical_arrays=numerical_arrays,
                                  subsets=[],
                                  ranking_method='mean')
        result_2 = self._run_task(numerical_arrays=numerical_arrays,
                                  subsets=[[101, 102, 103, 104]],
                                  ranking_method='mean')
        assert result_1 == result_2

    def test_multiple_numerical_array_data(self):
        """Several input frames with differing ids/features are accepted."""
        numerical_arrays = [
            pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6],
                          [102, 'foo', 10], [102, 'bar', 11],
                          [103, 'foo', 15], [103, 'bar', 16],
                          [104, 'foo', 20], [104, 'bar', 21]],
                         columns=['id', 'feature', 'value']),
            pd.DataFrame([[101, 'baz', 10], [102, 'baz', 11],
                          [105, 'foo', 20], [105, 'baz', 21],
                          [106, 'bar', 15]],
                         columns=['id', 'feature', 'value'])
        ]
        subsets = [[101, 102, 106], [103, 104, 105]]
        result = self._run_task(numerical_arrays=numerical_arrays,
                                subsets=subsets,
                                ranking_method='B')
        assert 'data' in result
        assert 'stats' in result

    def test_zscore_is_not_nan_if_data_misses_values(self):
        """The minimum of the 'zscore' column is not NaN for sparse input."""
        numerical_arrays = [
            pd.DataFrame([[101, 'foo', 5], [101, 'bar', 6],
                          [102, 'foo', 10], [102, 'bar', 11],
                          [103, 'foo', 15], [103, 'bar', 16],
                          [104, 'foo', 20], [104, 'bar', 21]],
                         columns=['id', 'feature', 'value']),
            pd.DataFrame([[101, 'baz', 10], [102, 'baz', 11],
                          [105, 'foo', 20], [105, 'baz', 21],
                          [106, 'bar', 15]],
                         columns=['id', 'feature', 'value'])
        ]
        subsets = [[101, 102, 106], [103, 104, 105]]
        result = self._run_task(numerical_arrays=numerical_arrays,
                                subsets=subsets,
                                ranking_method='B')
        data = pd.DataFrame(result['data'])
        # NOTE(review): np.min on a Series presumably delegates to
        # Series.min, which skips NaN by default, so this may only fail when
        # every zscore is NaN -- confirm that this is the intended strictness.
        assert not np.isnan(np.min(data['zscore']))

    def test_results_are_sorted(self):
        """With 'mean' ranking, 'data' and 'stats' list D, C, A, B."""
        numerical_arrays = [
            pd.DataFrame([[101, 'A', 5], [102, 'A', 5],
                          [101, 'B', 2], [102, 'B', 2],
                          [101, 'C', 8], [102, 'C', 8],
                          [101, 'D', 10], [102, 'D', 10]],
                         columns=['id', 'feature', 'value'])
        ]
        result = self._run_task(numerical_arrays=numerical_arrays,
                                subsets=[],
                                ranking_method='mean')
        data = pd.DataFrame(result['data'])
        feature_col = data['feature'].tolist()
        assert ['D', 'C', 'A', 'B', 'D', 'C', 'A', 'B'] == feature_col
        assert ['D', 'C', 'A', 'B'] == result['stats']['feature']

    def test_max_rows_works(self):
        """``max_rows=2`` keeps only the two top-ranked features."""
        numerical_arrays = [
            pd.DataFrame([[101, 'A', 5], [102, 'A', 5],
                          [101, 'B', 2], [102, 'B', 2],
                          [101, 'C', 8], [102, 'C', 8],
                          [101, 'D', 10], [102, 'D', 10]],
                         columns=['id', 'feature', 'value'])
        ]
        result = self._run_task(numerical_arrays=numerical_arrays,
                                subsets=[],
                                ranking_method='mean',
                                max_rows=2)
        data = pd.DataFrame(result['data'])
        feature_col = data['feature'].tolist()
        assert ['D', 'C', 'D', 'C'] == feature_col
        assert result['stats']['feature'] == ['D', 'C']

    def test_sorts_correct_for_different_criteria(self):
        """Each ranking method orders its own stats column strictly.

        'P.Value' and 'adj.P.Val' must be ascending, 'B' and 't' descending,
        and 'logFC' descending by absolute value.
        """
        numerical_arrays = [
            pd.DataFrame([[101, 'foo', 5], [101, 'bar', -12],
                          [102, 'foo', 10], [102, 'bar', -25],
                          [103, 'foo', 15], [103, 'bar', -20],
                          [104, 'foo', 20], [104, 'bar', -50]],
                         columns=['id', 'feature', 'value'])
        ]
        subsets = [[101, 102], [103, 104]]

        def ranked_stats(method):
            # Rank by ``method`` and return the stats column of that name.
            result = self._run_task(numerical_arrays=numerical_arrays,
                                    subsets=subsets,
                                    ranking_method=method)
            return result['stats'][method]

        stats = ranked_stats('P.Value')
        assert all(a < b for a, b in zip(stats, stats[1:]))

        stats = ranked_stats('adj.P.Val')
        assert all(a < b for a, b in zip(stats, stats[1:]))

        stats = ranked_stats('B')
        assert all(a > b for a, b in zip(stats, stats[1:]))

        stats = ranked_stats('logFC')
        assert all(abs(a) > abs(b) for a, b in zip(stats, stats[1:]))

        stats = ranked_stats('t')
        assert all(a > b for a, b in zip(stats, stats[1:]))