dish_xls_exporter.py 20.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
import logging
import pyexcel
import hashlib
from os import fsencode
from .export_utils import get_partners_from_daisy, process_possible_date, process_yes_no_dontknow_answer, get_value_list_from_row, is_data, is_study, is_submission,  process_yes_no_answer, get_names_from_string


class DishXlsExporter:

    def __init__(self):
11
12
13
14
        logging.basicConfig(filename='export_dishxls.log', 
                            level=logging.DEBUG,
                            format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
        institutions = get_partners_from_daisy()
        self.inst_dict = {}
        self.inst_ac_dict = {}
        for inst in institutions:
            self.inst_dict[inst.get('name').lower()] = inst.get('name')
        for inst in institutions:
            if inst.get('acronym'):
                self.inst_ac_dict[inst.get('acronym').lower()] = inst.get('name')

        self.h = hashlib.md5()
        self.predefined_data_types = set([
            "Omics data",
            "Genotype data",
            "Whole genome sequencing",
            "Exome sequencing",
            "Genomics variant array",
            "RNASeq",
            "Genetic and derived genetic data",
            "Transcriptome array",
            "Methylation array",
            "MicroRNA array",
            "Metabolomics",
            "Metagenomics",
            "Proteomics",
            "Other omics data",
            "Clinical Imaging",
            "Cell Imaging",
            "Human subject data",
            "Clinical data",
            "Lifestyle data",
            "Socio Economic Data",
            "Environmental Data",
            "Other Phenotype data",
            "Other"
        ])

    def export_submission(self, full_file_path):
        idx = 1
        logging.info('Processing start for ----> {}'.format(full_file_path))
        book = pyexcel.get_book(file_name=full_file_path)
        is_dish = any("_Help" in elem for elem in book.sheet_names())
        if is_dish:
            dataset_dict = {
                "source":   book.filename,
                "contacts": [],
                "data_declarations": [],
                "studies": [],
                "legal_bases": []
            }

            while idx < book.number_of_sheets():
                sheet = book.sheet_by_index(idx)
                logging.info('Processing sheet ----> {}'.format(book.sheet_names()[idx]))

                if is_study(sheet):
70
                    cohort_dict = {'name':  sheet[1, 1].strip(),
71
                                   'description' : sheet[2, 1] + ' ' + sheet[6, 1] + ' ' + sheet[16,1],
72
73
74
75
76
77
78
79
80
81
82
83
                                   'has_ethics_approval' : process_yes_no_answer(sheet[4, 1]),
                                   "ethics_approval_notes": sheet[5, 1],
                                   "url": sheet[3, 1],
                                   "contacts": [{"first_name": get_names_from_string(sheet[8,1])[0],
                                                 "last_name":get_names_from_string(sheet[8,1])[1],
                                                 "role": sheet[11,1],
                                                 "email":sheet[9,1],
                                                 "affiliations": [self.process_institution(sheet[10,1])]
                                                 }]
                                   }

                    if sheet[12, 1] and sheet[15, 1]:
84
                        cohort_dict["contacts"].append({"first_name": get_names_from_string(sheet[12,1])[0],
85
86
87
88
89
90
91
92
                                                                                  "last_name": get_names_from_string(sheet[12,1])[1],
                                                                                  "role": sheet[15,1],
                                                                                  "email":sheet[13,1],
                                                                                  "affiliations": [self.process_institution(sheet[14,1])]
                                                                                  })
                    dataset_dict["studies"].append(cohort_dict)

                elif is_data(sheet):
93
94
                    datadec_dict = {'title' : sheet[1, 1].strip(),
                                    'source_study' : sheet[2, 1].strip(),
95
96
97
98
99
                                    "data_types":[]}
                    datadec_dict["data_type_notes"] = sheet[7, 1]

                    data_type_info = self.process_data_types(get_value_list_from_row(sheet, 6))
                    datadec_dict["data_types"].extend(data_type_info[0])
Vilem Ded's avatar
Vilem Ded committed
100
101
102
103
                    if data_type_info[1]:
                        datadec_dict["data_type_notes"] += " " + data_type_info[1]
                    if sheet[10, 1]:
                        datadec_dict["data_type_notes"] += " Notes on samples: " + sheet[10, 1]
104
105
106
107
108
109
110
111
112
113
114

                    #if it involves samples add this as a datatype
                    if process_yes_no_answer(sheet[9, 1]):
                        datadec_dict["data_types"].append('Samples')

                    if sheet[8, 1]:
                        datadec_dict["de_identification"] = self.process_deidentification(sheet[8,1])
                    datadec_dict["consent_status"] = self.process_consent_status(sheet[32,1])
                    datadec_dict["consent_status_description"] = sheet[33, 1]

                    if sheet[20, 1]:
115
                        datadec_dict['subjects_category'] = sheet[20, 1].replace(' & ', '_and_')
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152

                    if sheet[12, 1]:
                        lb_code = self.extract_lb_code(sheet[12, 1])
                        dataset_dict["legal_bases"].append({"data_declarations": [sheet[1, 1]],
                                                            "personal_data_codes":["Standard"],
                                                            "legal_basis_codes":[lb_code],
                                                            "legal_basis_notes": sheet[12, 0]
                                                            })

                    if sheet[13, 1]:
                        lb_code = self.extract_lb_code(sheet[13, 1])
                        dataset_dict["legal_bases"].append({"data_declarations": [sheet[1, 1]],
                                                            "personal_data_codes":["Standard"],
                                                            "legal_basis_codes":[lb_code],
                                                            "legal_basis_notes": sheet[13, 0]
                                                            })
                    if sheet[16, 1]:
                        lb_code = self.extract_lb_code(sheet[16, 1])
                        dataset_dict["legal_bases"].append({"data_declarations": [sheet[1, 1]],
                                                            "personal_data_codes":["Special"],
                                                            "legal_basis_codes":[lb_code],
                                                            "legal_basis_notes": sheet[16, 0]
                                                            })
                    if sheet[17, 1]:
                        lb_code = self.extract_lb_code(sheet[17, 1])
                        dataset_dict["legal_bases"].append({"data_declarations": [sheet[1, 1]],
                                                            "personal_data_codes":["Special"],
                                                            "legal_basis_codes":[lb_code],
                                                            "legal_basis_notes": sheet[17, 0]
                                                            })
                    if sheet[21, 1]:
                        datadec_dict['has_special_subjects'] = process_yes_no_dontknow_answer(
                            sheet[21, 1])
                        if datadec_dict.get('has_special_subjects') == True and sheet[22, 1]:
                            datadec_dict['special_subject_notes'] = sheet[22, 1]

                    use_restrictions = []
153
                    # Is data consented to be used only in specific research/disease areas?
154
                    if process_yes_no_dontknow_answer(sheet[24, 1]):
155
                        use_restrictions.append({'use_class': 'RS-[XX]',
156
                                                 'use_restriction_rule': "CONSTRAINED_PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
157
158
                                                 'use_class_note': sheet[24, 0],
                                                 'use_restriction_note': sheet[25,1]})
159
                    elif process_yes_no_dontknow_answer(sheet[24, 1]) is not None:
160
                        use_restrictions.append({'use_class': 'RS-[XX]',
161
                                                 'use_restriction_rule': "PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
162
163
                                                 'use_class_note': sheet[24, 0]})

164
                    # Does consent contain clauses that put geographical restrictions to the sharing of data?
165
                    if process_yes_no_dontknow_answer(sheet[26, 1]):
166
                        use_restrictions.append({'use_class': 'GS-[XX]',
167
                                                 'use_restriction_rule': "PROHIBITION",
Pinar Alper's avatar
Pinar Alper committed
168
169
                                                 'use_class_note': sheet[26, 0],
                                                 'use_restriction_note': sheet[27,1]})
170
                    elif process_yes_no_dontknow_answer(sheet[26, 1]) is not None:
171
                        use_restrictions.append({'use_class': 'GS-[XX]',
172
                                                 'use_restriction_rule': "PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
173
                                                 'use_class_note': sheet[26, 0]})
174

175
                    # Does the consent limit the type of recipients?
176
177
                    if process_yes_no_dontknow_answer(sheet[28, 1]):
                        use_restrictions.append({'use_class': 'IS',
178
                                                 'use_restriction_rule': "CONSTRAINED_PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
179
180
                                                 'use_class_note': sheet[28, 0],
                                                 'use_restriction_note': sheet[29,1]})
181
182
                    elif process_yes_no_dontknow_answer(sheet[28, 1]) is not None:
                        use_restrictions.append({'use_class': 'IS',
183
                                                 'use_restriction_rule': "PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
184
                                                 'use_class_note': sheet[28, 0]})
185

186
                    # Does the consent contain clauses that put time-limits on the use of data?
187
188
                    if process_yes_no_dontknow_answer(sheet[30, 1]):
                        use_restrictions.append({'use_class': 'TS-[XX]',
189
                                                 'use_restriction_rule': "OBLIGATION",
Pinar Alper's avatar
Pinar Alper committed
190
191
                                                 'use_class_note': sheet[30, 0],
                                                 'use_restriction_note': sheet[31,1]})
192
                    elif process_yes_no_dontknow_answer(sheet[30, 1]) is not None:
193
                        use_restrictions.append({'use_class': 'TS-[XX]',
194
                                                 'use_restriction_rule': "PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
195
                                                 'use_class_note': sheet[30, 0]})
196

197
                    # Is the use of data limited to the project named in the Submission sheet?
198
199
                    if process_yes_no_answer(sheet[35, 1]):
                        use_restrictions.append({'use_class': 'PS',
200
                                                 'use_restriction_rule': "CONSTRAINED_PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
201
202
                                                 'use_class_note': sheet[35, 0],
                                                 'use_restriction_note': dataset_dict["project"]})
203
204
                    else:
                        use_restrictions.append({'use_class': 'PS',
205
                                                 'use_restriction_rule': "PERMISSION",
206
207
                                                 'use_class_note': sheet[35, 0]})

208
                    # Is the data being sent to ELIXIR-LU/LCSB for a limited duration?
209
210
211
                    if process_yes_no_answer(sheet[36, 1]):
                        datadec_dict["storage_end_date"] = process_possible_date(sheet[37, 1])
                        use_restrictions.append({'use_class': 'TS-[XX]',
212
                                                 'use_restriction_rule': "OBLIGATION",
Pinar Alper's avatar
Pinar Alper committed
213
214
                                                 'use_class_note':  sheet[36, 0],
                                                 'use_restriction_note':  process_possible_date(sheet[37, 1])})
215
216
                    else:
                        use_restrictions.append({'use_class': 'TS-[XX]',
217
                                                 'use_restriction_rule': "PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
218
                                                 'use_class_note': sheet[36, 0]})
219

220
                    # Are there any requirements in case of publications based on the DATA?
221
222
                    if process_yes_no_answer(sheet[38, 1]):
                        use_restrictions.append({'use_class': 'PUB',
223
                                                 'use_restriction_rule': "OBLIGATION",
Pinar Alper's avatar
Pinar Alper committed
224
225
                                                 'use_class_note': sheet[38, 0],
                                                 'use_restriction_note':  sheet[39, 1]})
226
227
                    else:
                        use_restrictions.append({'use_class': 'PUB',
228
                                                 'use_restriction_rule': "PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
229
230
                                                 'use_class_note': sheet[38, 0]})

231
                    # Is there a requirement to return data or documents to the database/resource?
Pinar Alper's avatar
Pinar Alper committed
232
233
                    if process_yes_no_answer(sheet[40, 1]):
                        use_restrictions.append({'use_class': 'RTN',
234
                                                 'use_restriction_rule': "OBLIGATION",
Pinar Alper's avatar
Pinar Alper committed
235
236
237
238
                                                 'use_class_note': sheet[40, 0],
                                                 'use_restriction_note':  sheet[41, 1]})
                    else:
                        use_restrictions.append({'use_class': 'RTN',
239
                                                 'use_restriction_rule': "PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
240
241
                                                 'use_class_note': sheet[40, 0]})

242

Pinar Alper's avatar
Pinar Alper committed
243
                    if sheet[42, 1]:
244
                        use_restrictions.append({'use_class': 'Other',
245
                                                 'use_restriction_rule': "CONSTRAINED_PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
246
247
                                                 'use_class_note': sheet[42, 0],
                                                 'use_restriction_note':  sheet[42, 1]})
248

249
                    # Are there any IP restrictions/requirements when using the DATA?
250
251
                    if process_yes_no_answer(sheet[47, 1]):
                        use_restrictions.append({'use_class': 'IP',
252
                                                 'use_restriction_rule': "CONSTRAINED_PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
253
254
                                                 'use_class_note': sheet[47, 0],
                                                 'use_restriction_note':  sheet[48, 1]})
255
256
                    else:
                        use_restrictions.append({'use_class': 'IP',
257
                                                 'use_restriction_rule': "PERMISSION",
Pinar Alper's avatar
Pinar Alper committed
258
                                                 'use_class_note': sheet[47, 1]})
259
260
261
262
263
264
                    datadec_dict['use_restrictions'] = use_restrictions

                    datadec_dict["access_procedure"] = ""

                    if sheet[45, 1] and ('not' in sheet[45, 1]):
                        if sheet[44, 1] and ('no' in sheet[44, 1]):
Nene Barry's avatar
Nene Barry committed
265
                            datadec_dict["access_category"] = "open_access" #this is just an initial interpretation and should be further curated in catalog
Pinar Alper's avatar
Pinar Alper committed
266
                            datadec_dict["access_procedure"] = datadec_dict["access_procedure"] + "No additional form is needed to request access."
267
                        else:
Nene Barry's avatar
Nene Barry committed
268
                            datadec_dict["access_category"] = "registered_access" #this is just an initial interpretation and should be further curated in catalog
Pinar Alper's avatar
Pinar Alper committed
269
                            datadec_dict["access_procedure"] = datadec_dict["access_procedure"] + "Additional form is needed to request access."
270
                    else:
Nene Barry's avatar
Nene Barry committed
271
                        datadec_dict["access_category"] = "controlled_access"
272
273
274
275
276
277
                        datadec_dict["access_procedure"] = datadec_dict["access_procedure"] +  sheet[46, 1]


                    dataset_dict["data_declarations"].append(datadec_dict)

                elif is_submission(sheet):
278
279
                    dataset_dict["name"] = sheet[2, 1].strip()
                    dataset_dict["project"] = sheet[5, 1].strip()
280
                    dataset_dict["contacts"].extend(self.get_submission_contacts(sheet))
281
282
283
284
285
286
287
288
289
290
                else:
                    pass

                idx += 1

            logging.info('Processing end for ----> {}'.format(full_file_path))
            return dataset_dict
        else:
            raise ValueError("{} not a valid DISH Excel file".format(full_file_path))

291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
    def get_submission_contacts(self, sheet):
        contacts = []
        affiliations = [self.process_institution(sheet[7,1])]
        if sheet[9, 1]:
            contacts.append({
                "first_name": get_names_from_string(sheet[9, 1])[0],
                "last_name":get_names_from_string(sheet[9, 1])[1],
                "role": sheet[11,1],
                "email":sheet[10,1].strip(),
                "affiliations": affiliations
            })
        else:
            logging.error("Submission contact missing - Main contact!")

        if sheet[12,1]:
            contacts.append({
                "first_name": get_names_from_string(sheet[12, 1])[0],
                "last_name": get_names_from_string(sheet[12, 1])[1],
                "role": "Legal_Representative",
                "email": sheet[13,1].strip(),
                "affiliations": affiliations
                })
        else:
            logging.error("Submission contact missing - Legal Representative!")

        if sheet[14, 1]:
            contacts.append({
                "first_name": get_names_from_string(sheet[14, 1])[0],
                "last_name": get_names_from_string(sheet[14, 1])[1],
                "role": "Data_Protection_Officer",
                "email": sheet[15,1].strip(),
                "affiliations": affiliations
                })
        else:
            logging.error("Submission contact missing - DPO!")

        if sheet[16, 1] and sheet[18, 1]:
            contacts.append({
                "first_name": get_names_from_string(sheet[16, 1])[0],
                "last_name": get_names_from_string(sheet[16, 1])[1],
                "role": sheet[18,1],
                "email": sheet[17,1].strip(),
                "affiliations": affiliations
                })

        return contacts

338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356

    def get_hash_for_path(self, path):
        self.h.update(fsencode(path))
        return str(int(self.h.hexdigest(), 16))

    def process_data_types(self, xls_data_type_list):
        result = []
        data_type_notes = ''
        for type_name in xls_data_type_list:
            type_name = type_name.strip()
            if type_name:
                if type_name in self.predefined_data_types:
                    result.append(type_name.replace(" ", "_"))
                else:
                    data_type_notes += type_name + '\n'
        return (result, data_type_notes)

    def process_deidentification(self, deid_str):
        if 'seu' in deid_str:
Nene Barry's avatar
Nene Barry committed
357
            return 'pseudonymization'
358
        elif 'non' in deid_str:
Nene Barry's avatar
Nene Barry committed
359
            return 'anonymization'
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382

    def process_consent_status(self, consent_str):
        if 'et' in consent_str:
            return 'heterogeneous'
        elif 'om' in consent_str:
            return 'homogeneous'

    def process_institution(self, institution_str):
        if institution_str:
            if self.inst_ac_dict.get(institution_str.lower()):
                return self.inst_ac_dict.get(institution_str.lower())
            elif self.inst_dict.get(institution_str.lower()):
                return self.inst_dict.get(institution_str.lower())
            else:
                logging.error('Unknown institution  -- > {}'.format(institution_str))
                return institution_str
        else:
            return ""

    def extract_lb_code(self, lb_value):
        op = lb_value.index('(')
        cp = lb_value.rindex(')')
        return lb_value[op+1:cp]