test_etlhandler.py 13.1 KB
Newer Older
Sascha Herzinger's avatar
Sascha Herzinger committed
1
2
3
4
5
6
7
8
9
10
11
12
"""This module provides tests for the etlhandler module."""

import pytest

from fractalis import celery
from fractalis.data.etlhandler import ETLHandler


# noinspection PyMissingOrEmptyDocstring,PyMissingTypeHints,PyPep8Naming
class TestETLHandler:

    @pytest.fixture(scope='function')
    def redis(self):
        """Provide the application's Redis client to a single test.

        The import is done lazily inside the fixture. After the test has
        finished, ``sync.cleanup_all()`` is called as teardown.
        """
        from fractalis import redis, sync
        yield redis
        # Teardown: runs once the requesting test function has completed.
        sync.cleanup_all()

    # Handler under test. Defined as a class attribute, so the very same
    # instance is seen by every test method of this class.
    etlhandler = ETLHandler.factory(handler='test', server='localfoo', auth={})

    def test_descriptor_to_hash_produces_unique_hash(self, redis):
        """Check that descriptor hashes are ints, repeatable and distinct.

        Equal descriptors on the same server must hash equally; a different
        descriptor or a different ``_server`` value must change the hash.
        """
        handler = self.etlhandler
        hash_1 = handler.descriptor_to_hash(descriptor={'a': 1})
        hash_2 = handler.descriptor_to_hash(descriptor={'': ''})
        hash_3 = handler.descriptor_to_hash(descriptor={'a': 1})
        # ``etlhandler`` is a class attribute shared by all tests of this
        # class, so the server override has to be undone afterwards or it
        # would leak into every test that runs later.
        original_server = handler._server
        handler._server = 'localbar'
        try:
            hash_4 = handler.descriptor_to_hash(descriptor={'a': 1})
        finally:
            handler._server = original_server
        assert isinstance(hash_1, int)
        assert isinstance(hash_4, int)
        assert hash_1 == hash_3
        assert hash_1 != hash_2
        assert hash_1 != hash_4

    def test_find_duplicates_finds_duplicates_by_hash(self, redis):
        """A stored task with the same descriptor is reported as duplicate."""
        descriptor = {'a': {'b': 3}, 'c': 4}
        self.etlhandler.create_redis_entry(task_id='123',
                                           file_path='',
                                           descriptor=descriptor,
                                           data_type='')
        duplicates = self.etlhandler.find_duplicates(data_tasks=['123'],
                                                     descriptor=descriptor)
        assert len(duplicates) == 1
        assert duplicates[0] == '123'
Sascha Herzinger's avatar
Sascha Herzinger committed
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145

    def test_finds_all_duplicates(self, redis):
        """All stored tasks sharing the descriptor are reported."""
        descriptor = {'a': {'b': 3}, 'c': 4}
        # (task id, descriptor) pairs; only the first two share a descriptor.
        entries = [('123', descriptor), ('456', descriptor), ('789', {'a': 5})]
        for task_id, entry_descriptor in entries:
            self.etlhandler.create_redis_entry(task_id=task_id,
                                               file_path='',
                                               descriptor=entry_descriptor,
                                               data_type='')
        duplicates = self.etlhandler.find_duplicates(
            descriptor=descriptor, data_tasks=['123', '456', '789'])
        assert len(duplicates) == 2
        for expected_id in ('123', '456'):
            assert expected_id in duplicates

    def test_find_duplicates_only_operates_on_given_list(self, redis):
        """Tasks missing from ``data_tasks`` are not reported as duplicates."""
        descriptor = {'a': {'b': 3}, 'c': 4}
        # '123' matches the descriptor but is left out of the search list.
        entries = [('123', descriptor), ('456', descriptor), ('789', {'a': 5})]
        for task_id, entry_descriptor in entries:
            self.etlhandler.create_redis_entry(task_id=task_id,
                                               file_path='',
                                               descriptor=entry_descriptor,
                                               data_type='')
        searched_tasks = ['456', '789']
        duplicates = self.etlhandler.find_duplicates(
            descriptor=descriptor, data_tasks=searched_tasks)
        assert len(duplicates) == 1
        assert '456' in duplicates

    def test_remove_duplicates_removes_duplicate(self, redis):
        """Removing duplicates deletes the matching task's Redis key."""
        descriptor = {'a': {'b': 3}, 'c': 4}
        handler = self.etlhandler
        redis_key = 'data:456'
        handler.create_redis_entry(task_id='456', file_path='',
                                   descriptor=descriptor, data_type='')
        assert redis.exists(redis_key)
        handler.remove_duplicates(descriptor=descriptor, data_tasks=['456'])
        assert not redis.exists(redis_key)

    def test_remove_duplicates_removes_all_duplicates(self, redis):
        """Every matching task is removed; a non-matching one is kept."""
        descriptor = {'a': {'b': 3}, 'c': 4}
        handler = self.etlhandler
        entries = [('123', descriptor), ('456', descriptor), ('789', {'a': 1})]
        for task_id, entry_descriptor in entries:
            handler.create_redis_entry(task_id=task_id,
                                       file_path='',
                                       descriptor=entry_descriptor,
                                       data_type='')
        # All three entries must be present before anything is removed.
        for redis_key in ('data:123', 'data:456', 'data:789'):
            assert redis.exists(redis_key)
        handler.remove_duplicates(descriptor=descriptor,
                                  data_tasks=['123', '456', '789'])
        assert not redis.exists('data:123')
        assert not redis.exists('data:456')
        assert redis.exists('data:789')

    def test_remove_duplicates_only_operates_on_given_list(self, redis):
        """A matching task outside ``data_tasks`` is left untouched."""
        descriptor = {'a': {'b': 3}, 'c': 4}
        handler = self.etlhandler
        entries = [('123', descriptor), ('456', descriptor), ('789', {'a': 1})]
        for task_id, entry_descriptor in entries:
            handler.create_redis_entry(task_id=task_id,
                                       file_path='',
                                       descriptor=entry_descriptor,
                                       data_type='')
        # All three entries must be present before anything is removed.
        for redis_key in ('data:123', 'data:456', 'data:789'):
            assert redis.exists(redis_key)
        # '456' shares the descriptor but is not part of the given list.
        handler.remove_duplicates(descriptor=descriptor,
                                  data_tasks=['123', '789'])
        assert not redis.exists('data:123')
        assert redis.exists('data:456')
        assert redis.exists('data:789')

    def test_find_duplicate_task_id_returns_task_id_of_SUCCESS(
            self, monkeypatch, redis):
        """A duplicate whose task state is 'SUCCESS' is returned by id."""
        descriptor = {'a': {'b': 3}, 'c': 4}
        self.etlhandler.create_redis_entry(task_id='123',
                                           file_path='',
                                           descriptor=descriptor,
                                           data_type='')

        class FakeAsyncResult:
            """Stand-in result object that always reports 'SUCCESS'."""

            def __init__(self, *args, **kwargs):
                self.state = 'SUCCESS'

            def get(self, *args, **kwargs):
                pass

        # Swap the ``AsyncResult`` attribute of the imported ``celery``
        # object for the stub above for the duration of this test.
        monkeypatch.setattr(celery, 'AsyncResult', FakeAsyncResult)
        task_id = self.etlhandler.find_duplicate_task_id(
            data_tasks=['123'], descriptor=descriptor)
        assert task_id == '123'
Sascha Herzinger's avatar
Sascha Herzinger committed
157
158
159
160
161
162
163
164

    def test_find_duplicate_task_id_returns_task_id_of_SUBMITTED(
            self, monkeypatch, redis):
        """A duplicate whose task state is 'SUBMITTED' is returned by id."""
        descriptor = {'a': {'b': 3}, 'c': 4}
        self.etlhandler.create_redis_entry(task_id='123',
                                           file_path='',
                                           descriptor=descriptor,
                                           data_type='')

        class FakeAsyncResult:
            """Stand-in result object that always reports 'SUBMITTED'."""

            def __init__(self, *args, **kwargs):
                self.state = 'SUBMITTED'

            def get(self, *args, **kwargs):
                pass

        # Swap the ``AsyncResult`` attribute of the imported ``celery``
        # object for the stub above for the duration of this test.
        monkeypatch.setattr(celery, 'AsyncResult', FakeAsyncResult)
        task_id = self.etlhandler.find_duplicate_task_id(
            data_tasks=['123'], descriptor=descriptor)
        assert task_id == '123'
Sascha Herzinger's avatar
Sascha Herzinger committed
176
177
178
179
180
181
182
183

    def test_find_duplicate_task_id_returns_None_for_FAILURE(
            self, monkeypatch, redis):
        """A duplicate whose task state is 'FAILURE' is not returned."""
        descriptor = {'a': {'b': 3}, 'c': 4}
        self.etlhandler.create_redis_entry(task_id='123',
                                           file_path='',
                                           descriptor=descriptor,
                                           data_type='')

        class FakeAsyncResult:
            """Stand-in result object that always reports 'FAILURE'."""

            def __init__(self, *args, **kwargs):
                self.state = 'FAILURE'

            def get(self, *args, **kwargs):
                pass

        # Swap the ``AsyncResult`` attribute of the imported ``celery``
        # object for the stub above for the duration of this test.
        monkeypatch.setattr(celery, 'AsyncResult', FakeAsyncResult)
        task_id = self.etlhandler.find_duplicate_task_id(
            data_tasks=['123'], descriptor=descriptor)
        assert task_id is None

Sascha Herzinger's avatar
Sascha Herzinger committed
196
197
    def test_find_duplicate_limits_search_to_data_tasks(
            self, monkeypatch, redis):
        """A matching task that is not in ``data_tasks`` is not returned."""
        descriptor = {'a': {'b': 3}, 'c': 4}
        self.etlhandler.create_redis_entry(task_id='123',
                                           file_path='',
                                           descriptor=descriptor,
                                           data_type='')

        class FakeAsyncResult:
            """Stand-in result object that always reports 'SUCCESS'."""

            def __init__(self, *args, **kwargs):
                self.state = 'SUCCESS'

            def get(self, *args, **kwargs):
                pass

        # Swap the ``AsyncResult`` attribute of the imported ``celery``
        # object for the stub above for the duration of this test.
        monkeypatch.setattr(celery, 'AsyncResult', FakeAsyncResult)
        # Only '456' is searched, although the stored entry is '123'.
        task_id = self.etlhandler.find_duplicate_task_id(
            data_tasks=['456'], descriptor=descriptor)
        assert task_id is None

    def test_find_duplicate_task_id_returns_None_for_not_existing(
            self, monkeypatch, redis):
        """Looking up a task id that was never stored yields ``None``."""
        descriptor = {'a': {'b': 3}, 'c': 4}

        class FakeAsyncResult:
            """Stand-in result object that always reports 'FAILURE'."""

            def __init__(self, *args, **kwargs):
                self.state = 'FAILURE'

            def get(self, *args, **kwargs):
                pass

        # Swap the ``AsyncResult`` attribute of the imported ``celery``
        # object for the stub above for the duration of this test.
        monkeypatch.setattr(celery, 'AsyncResult', FakeAsyncResult)
        # No entry is created in this test before the lookup.
        task_id = self.etlhandler.find_duplicate_task_id(
            data_tasks=['123'], descriptor=descriptor)
        assert task_id is None

    def test_handle_reuses_existing_task_ids_if_use_existing(
            self, monkeypatch, redis):
        """With ``use_existing=True`` the stored duplicate's id is returned."""
        descriptor = {'data_type': 'default'}
        self.etlhandler.create_redis_entry(task_id='123',
                                           file_path='',
                                           descriptor=descriptor,
                                           data_type='')

        class FakeAsyncResult:
            """Stand-in result object that always reports 'SUBMITTED'."""

            def __init__(self, *args, **kwargs):
                self.state = 'SUBMITTED'

            def get(self, *args, **kwargs):
                pass

        # Swap the ``AsyncResult`` attribute of the imported ``celery``
        # object for the stub above for the duration of this test.
        monkeypatch.setattr(celery, 'AsyncResult', FakeAsyncResult)
        task_ids = self.etlhandler.handle(descriptors=[descriptor],
                                          data_tasks=['123'],
                                          use_existing=True)
        assert len(task_ids) == 1
        assert task_ids[0] == '123'
Sascha Herzinger's avatar
Sascha Herzinger committed
250
251

    def test_handle_limits_search_to_tasks_ids(self, monkeypatch, redis):
        """A stored duplicate is not reused when ``data_tasks`` is empty."""
        descriptor = {'data_type': 'default'}
        self.etlhandler.create_redis_entry(task_id='123',
                                           file_path='',
                                           descriptor=descriptor,
                                           data_type='')
        # '123' matches the descriptor but is not offered in ``data_tasks``.
        task_ids = self.etlhandler.handle(descriptors=[descriptor],
                                          data_tasks=[],
                                          use_existing=True)
        assert len(task_ids) == 1
        assert task_ids[0] != '123'
Sascha Herzinger's avatar
Sascha Herzinger committed
262
263
264

    def test_handle_removes_old_and_returns_new_if_not_use_existing(
            self, monkeypatch, redis):
        """With ``use_existing=False`` a task id other than the old one results."""
        descriptor = {'data_type': 'default'}
        self.etlhandler.create_redis_entry(task_id='123',
                                           file_path='',
                                           descriptor=descriptor,
                                           data_type='')
        task_ids = self.etlhandler.handle(descriptors=[descriptor],
                                          data_tasks=['123'],
                                          use_existing=False)
        assert len(task_ids) == 1
        assert task_ids[0] != '123'
Sascha Herzinger's avatar
Sascha Herzinger committed
275
276
277

    def test_handle_removes_duplicate_of_previous_iteration(
            self, monkeypatch, redis):
        """Two equal descriptors without reuse leave only one data entry.

        Both descriptors yield their own task id, but only a single
        ``data:*`` key may remain in Redis afterwards.
        """
        descriptor = {'data_type': 'default'}
        task_ids = self.etlhandler.handle(descriptors=[descriptor, descriptor],
                                          data_tasks=[],
                                          use_existing=False)
        assert len(task_ids) == 2
        assert task_ids[0] != task_ids[1]
        assert len(redis.keys('data:*')) == 1

    def test_handle_uses_duplicate_of_previous_iteration(
            self, monkeypatch, redis):
        """Two equal descriptors with reuse yield one task id and one entry."""
        descriptor = {'data_type': 'default'}
        task_ids = self.etlhandler.handle(descriptors=[descriptor, descriptor],
                                          data_tasks=[],
                                          use_existing=True)
        assert len(task_ids) == 1
        assert len(redis.keys('data:*')) == 1