dish_xls_exporter.py 19.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
import logging
import pyexcel
import hashlib
from os import fsencode
from .export_utils import get_partners_from_daisy, process_possible_date, process_yes_no_dontknow_answer, get_value_list_from_row, is_data, is_study, is_submission,  process_yes_no_answer, get_names_from_string


class DishXlsExporter:

    def __init__(self):
11
12
13
14
        logging.basicConfig(filename='export_dishxls.log', 
                            level=logging.DEBUG,
                            format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
        institutions = get_partners_from_daisy()
        self.inst_dict = {}
        self.inst_ac_dict = {}
        for inst in institutions:
            self.inst_dict[inst.get('name').lower()] = inst.get('name')
        for inst in institutions:
            if inst.get('acronym'):
                self.inst_ac_dict[inst.get('acronym').lower()] = inst.get('name')

        self.h = hashlib.md5()
        self.predefined_data_types = set([
            "Omics data",
            "Genotype data",
            "Whole genome sequencing",
            "Exome sequencing",
            "Genomics variant array",
            "RNASeq",
            "Genetic and derived genetic data",
            "Transcriptome array",
            "Methylation array",
            "MicroRNA array",
            "Metabolomics",
            "Metagenomics",
            "Proteomics",
            "Other omics data",
            "Clinical Imaging",
            "Cell Imaging",
            "Human subject data",
            "Clinical data",
            "Lifestyle data",
            "Socio Economic Data",
            "Environmental Data",
            "Other Phenotype data",
            "Other"
        ])

    def export_submission(self, full_file_path):
        idx = 1
        logging.info('Processing start for ----> {}'.format(full_file_path))
        book = pyexcel.get_book(file_name=full_file_path)
        is_dish = any("_Help" in elem for elem in book.sheet_names())
        if is_dish:
            dataset_dict = {
                "source":   book.filename,
                "contacts": [],
                "data_declarations": [],
                "studies": [],
                "legal_bases": []
            }

            while idx < book.number_of_sheets():
                sheet = book.sheet_by_index(idx)
                logging.info('Processing sheet ----> {}'.format(book.sheet_names()[idx]))

                if is_study(sheet):
70
                    cohort_dict = {'name':  sheet[1, 1].strip(),
71
                                   'description' : sheet[2, 1] + ' ' + sheet[6, 1] + ' ' + sheet[16,1],
72
73
74
75
76
77
78
79
80
81
82
83
                                   'has_ethics_approval' : process_yes_no_answer(sheet[4, 1]),
                                   "ethics_approval_notes": sheet[5, 1],
                                   "url": sheet[3, 1],
                                   "contacts": [{"first_name": get_names_from_string(sheet[8,1])[0],
                                                 "last_name":get_names_from_string(sheet[8,1])[1],
                                                 "role": sheet[11,1],
                                                 "email":sheet[9,1],
                                                 "affiliations": [self.process_institution(sheet[10,1])]
                                                 }]
                                   }

                    if sheet[12, 1] and sheet[15, 1]:
84
                        cohort_dict["contacts"].append({"first_name": get_names_from_string(sheet[12,1])[0],
85
86
87
88
89
90
91
92
                                                                                  "last_name": get_names_from_string(sheet[12,1])[1],
                                                                                  "role": sheet[15,1],
                                                                                  "email":sheet[13,1],
                                                                                  "affiliations": [self.process_institution(sheet[14,1])]
                                                                                  })
                    dataset_dict["studies"].append(cohort_dict)

                elif is_data(sheet):
93
94
                    datadec_dict = {'title' : sheet[1, 1].strip(),
                                    'source_study' : sheet[2, 1].strip(),
95
96
97
98
99
                                    "data_types":[]}
                    datadec_dict["data_type_notes"] = sheet[7, 1]

                    data_type_info = self.process_data_types(get_value_list_from_row(sheet, 6))
                    datadec_dict["data_types"].extend(data_type_info[0])
Vilem Ded's avatar
Vilem Ded committed
100
101
102
103
                    if data_type_info[1]:
                        datadec_dict["data_type_notes"] += " " + data_type_info[1]
                    if sheet[10, 1]:
                        datadec_dict["data_type_notes"] += " Notes on samples: " + sheet[10, 1]
104
105
106
107
108
109
110
111
112
113
114

                    #if it involves samples add this as a datatype
                    if process_yes_no_answer(sheet[9, 1]):
                        datadec_dict["data_types"].append('Samples')

                    if sheet[8, 1]:
                        datadec_dict["de_identification"] = self.process_deidentification(sheet[8,1])
                    datadec_dict["consent_status"] = self.process_consent_status(sheet[32,1])
                    datadec_dict["consent_status_description"] = sheet[33, 1]

                    if sheet[20, 1]:
115
                        datadec_dict['subjects_category'] = sheet[20, 1].replace(' & ', '_and_')
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

                    if sheet[12, 1]:
                        lb_code = self.extract_lb_code(sheet[12, 1])
                        dataset_dict["legal_bases"].append({"data_declarations": [sheet[1, 1]],
                                                            "personal_data_codes":["Standard"],
                                                            "legal_basis_codes":[lb_code],
                                                            "legal_basis_notes": sheet[12, 0]
                                                            })

                    if sheet[13, 1]:
                        lb_code = self.extract_lb_code(sheet[13, 1])
                        dataset_dict["legal_bases"].append({"data_declarations": [sheet[1, 1]],
                                                            "personal_data_codes":["Standard"],
                                                            "legal_basis_codes":[lb_code],
                                                            "legal_basis_notes": sheet[13, 0]
                                                            })
                    if sheet[16, 1]:
                        lb_code = self.extract_lb_code(sheet[16, 1])
                        dataset_dict["legal_bases"].append({"data_declarations": [sheet[1, 1]],
                                                            "personal_data_codes":["Special"],
                                                            "legal_basis_codes":[lb_code],
                                                            "legal_basis_notes": sheet[16, 0]
                                                            })
                    if sheet[17, 1]:
                        lb_code = self.extract_lb_code(sheet[17, 1])
                        dataset_dict["legal_bases"].append({"data_declarations": [sheet[1, 1]],
                                                            "personal_data_codes":["Special"],
                                                            "legal_basis_codes":[lb_code],
                                                            "legal_basis_notes": sheet[17, 0]
                                                            })
                    if sheet[21, 1]:
                        datadec_dict['has_special_subjects'] = process_yes_no_dontknow_answer(
                            sheet[21, 1])
                        if datadec_dict.get('has_special_subjects') == True and sheet[22, 1]:
                            datadec_dict['special_subject_notes'] = sheet[22, 1]

                    use_restrictions = []
                    if process_yes_no_dontknow_answer(sheet[24, 1]):
154
                        use_restrictions.append({'use_class': 'RS-[XX]',
155
                                                 'use_restriction_rule': "CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
156
157
                                                 'use_class_note': sheet[24, 0],
                                                 'use_restriction_note': sheet[25,1]})
158
                    elif process_yes_no_dontknow_answer(sheet[24, 1]) is not None:
159
                        use_restrictions.append({'use_class': 'RS-[XX]',
160
                                                 'use_restriction_rule': "NO_CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
161
162
                                                 'use_class_note': sheet[24, 0]})

163
                    if process_yes_no_dontknow_answer(sheet[26, 1]):
164
                        use_restrictions.append({'use_class': 'GS-[XX]',
165
                                                 'use_restriction_rule': "CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
166
167
                                                 'use_class_note': sheet[26, 0],
                                                 'use_restriction_note': sheet[27,1]})
168
                    elif process_yes_no_dontknow_answer(sheet[26, 1]) is not None:
169
                        use_restrictions.append({'use_class': 'GS-[XX]',
170
                                                 'use_restriction_rule': "NO_CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
171
                                                 'use_class_note': sheet[26, 0]})
172
173
174
175

                    if process_yes_no_dontknow_answer(sheet[28, 1]):
                        use_restrictions.append({'use_class': 'IS',
                                                 'use_restriction_rule': "CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
176
177
                                                 'use_class_note': sheet[28, 0],
                                                 'use_restriction_note': sheet[29,1]})
178
179
180
                    elif process_yes_no_dontknow_answer(sheet[28, 1]) is not None:
                        use_restrictions.append({'use_class': 'IS',
                                                 'use_restriction_rule': "NO_CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
181
                                                 'use_class_note': sheet[28, 0]})
182
183
184
185

                    if process_yes_no_dontknow_answer(sheet[30, 1]):
                        use_restrictions.append({'use_class': 'TS-[XX]',
                                                 'use_restriction_rule': "CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
186
187
                                                 'use_class_note': sheet[30, 0],
                                                 'use_restriction_note': sheet[31,1]})
188
189
190
                    elif process_yes_no_dontknow_answer(sheet[30, 1]) is not None:
                        use_restrictions.append({'use_class': 'IS',
                                                 'use_restriction_rule': "NO_CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
191
                                                 'use_class_note': sheet[30, 0]})
192
193
194
195

                    if process_yes_no_answer(sheet[35, 1]):
                        use_restrictions.append({'use_class': 'PS',
                                                 'use_restriction_rule': "CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
196
197
                                                 'use_class_note': sheet[35, 0],
                                                 'use_restriction_note': dataset_dict["project"]})
198
199
200
201
202
203
204
205
206
                    else:
                        use_restrictions.append({'use_class': 'PS',
                                                 'use_restriction_rule': "NO_CONSTRAINTS",
                                                 'use_class_note': sheet[35, 0]})

                    if process_yes_no_answer(sheet[36, 1]):
                        datadec_dict["storage_end_date"] = process_possible_date(sheet[37, 1])
                        use_restrictions.append({'use_class': 'TS-[XX]',
                                                 'use_restriction_rule': "CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
207
208
                                                 'use_class_note':  sheet[36, 0],
                                                 'use_restriction_note':  process_possible_date(sheet[37, 1])})
209
210
211
                    else:
                        use_restrictions.append({'use_class': 'TS-[XX]',
                                                 'use_restriction_rule': "NO_CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
212
                                                 'use_class_note': sheet[36, 0]})
213
214
215
216

                    if process_yes_no_answer(sheet[38, 1]):
                        use_restrictions.append({'use_class': 'PUB',
                                                 'use_restriction_rule': "CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
217
218
                                                 'use_class_note': sheet[38, 0],
                                                 'use_restriction_note':  sheet[39, 1]})
219
220
221
                    else:
                        use_restrictions.append({'use_class': 'PUB',
                                                 'use_restriction_rule': "NO_CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
222
223
224
225
226
227
228
229
230
231
232
233
                                                 'use_class_note': sheet[38, 0]})

                    if process_yes_no_answer(sheet[40, 1]):
                        use_restrictions.append({'use_class': 'RTN',
                                                 'use_restriction_rule': "CONSTRAINTS",
                                                 'use_class_note': sheet[40, 0],
                                                 'use_restriction_note':  sheet[41, 1]})
                    else:
                        use_restrictions.append({'use_class': 'RTN',
                                                 'use_restriction_rule': "NO_CONSTRAINTS",
                                                 'use_class_note': sheet[40, 0]})

234

Pinar Alper's avatar
Pinar Alper committed
235
                    if sheet[42, 1]:
236
237
                        use_restrictions.append({'use_class': 'Other',
                                                 'use_restriction_rule': "CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
238
239
                                                 'use_class_note': sheet[42, 0],
                                                 'use_restriction_note':  sheet[42, 1]})
240
241
242
243

                    if process_yes_no_answer(sheet[47, 1]):
                        use_restrictions.append({'use_class': 'IP',
                                                 'use_restriction_rule': "CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
244
245
                                                 'use_class_note': sheet[47, 0],
                                                 'use_restriction_note':  sheet[48, 1]})
246
247
248
                    else:
                        use_restrictions.append({'use_class': 'IP',
                                                 'use_restriction_rule': "NO_CONSTRAINTS",
Pinar Alper's avatar
Pinar Alper committed
249
                                                 'use_class_note': sheet[47, 1]})
250
251
252
253
254
255
                    datadec_dict['use_restrictions'] = use_restrictions

                    datadec_dict["access_procedure"] = ""

                    if sheet[45, 1] and ('not' in sheet[45, 1]):
                        if sheet[44, 1] and ('no' in sheet[44, 1]):
Pinar Alper's avatar
Pinar Alper committed
256
257
                            datadec_dict["access_category"] = "open-access" #this is just an initial interpretation and should be further curated in catalog
                            datadec_dict["access_procedure"] = datadec_dict["access_procedure"] + "No additional form is needed to request access."
258
                        else:
Pinar Alper's avatar
Pinar Alper committed
259
260
                            datadec_dict["access_category"] = "registered-access" #this is just an initial interpretation and should be further curated in catalog
                            datadec_dict["access_procedure"] = datadec_dict["access_procedure"] + "Additional form is needed to request access."
261
262
263
264
265
266
267
268
                    else:
                        datadec_dict["access_category"] = "controlled-access"
                        datadec_dict["access_procedure"] = datadec_dict["access_procedure"] +  sheet[46, 1]


                    dataset_dict["data_declarations"].append(datadec_dict)

                elif is_submission(sheet):
269
270
                    dataset_dict["name"] = sheet[2, 1].strip()
                    dataset_dict["project"] = sheet[5, 1].strip()
271
272
273
                    dataset_dict["contacts"].extend([{"first_name": get_names_from_string(sheet[9, 1])[0],
                                                      "last_name":get_names_from_string(sheet[9, 1])[1],
                                                      "role": sheet[11,1],
274
                                                      "email":sheet[10,1].strip(),
275
276
277
278
279
                                                      "affiliations": [self.process_institution(sheet[7,1])]
                                                      },
                                                     {"first_name": get_names_from_string(sheet[12, 1])[0],
                                                      "last_name":get_names_from_string(sheet[12, 1])[1],
                                                      "role": "Legal_Representative",
280
                                                      "email":sheet[13,1].strip(),
281
282
283
284
285
                                                      "affiliations": [self.process_institution(sheet[7,1])]
                                                      },
                                                     {"first_name": get_names_from_string(sheet[14, 1])[0],
                                                      "last_name":get_names_from_string(sheet[14, 1])[1],
                                                      "role": "Data_Protection_Officer",
286
                                                      "email":sheet[15,1].strip(),
287
288
289
                                                      "affiliations": [self.process_institution(sheet[7,1])]
                                                      }])
                    if sheet[16, 1] and sheet[18, 1]:
290
291
                        dataset_dict["contacts"].append({"first_name": get_names_from_string(sheet[16, 1])[0],
                                                         "last_name": get_names_from_string(sheet[16, 1])[1],
292
                                                         "role": sheet[18,1],
293
                                                         "email": sheet[17,1].strip(),
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
                                                         "affiliations": [self.process_institution(sheet[7,1])]
                                                         })

                else:
                    pass

                idx += 1

            logging.info('Processing end for ----> {}'.format(full_file_path))
            return dataset_dict
        else:
            raise ValueError("{} not a valid DISH Excel file".format(full_file_path))


    def get_hash_for_path(self, path):
        self.h.update(fsencode(path))
        return str(int(self.h.hexdigest(), 16))

    def process_data_types(self, xls_data_type_list):
        result = []
        data_type_notes = ''
        for type_name in xls_data_type_list:
            type_name = type_name.strip()
            if type_name:
                if type_name in self.predefined_data_types:
                    result.append(type_name.replace(" ", "_"))
                else:
                    data_type_notes += type_name + '\n'
        return (result, data_type_notes)

    def process_deidentification(self, deid_str):
        if 'seu' in deid_str:
            return 'pseudonymized'
        elif 'non' in deid_str:
            return 'anonymized'

    def process_consent_status(self, consent_str):
        if 'et' in consent_str:
            return 'heterogeneous'
        elif 'om' in consent_str:
            return 'homogeneous'

    def process_institution(self, institution_str):
        if institution_str:
            if self.inst_ac_dict.get(institution_str.lower()):
                return self.inst_ac_dict.get(institution_str.lower())
            elif self.inst_dict.get(institution_str.lower()):
                return self.inst_dict.get(institution_str.lower())
            else:
                logging.error('Unknown institution  -- > {}'.format(institution_str))
                return institution_str
        else:
            return ""

    def extract_lb_code(self, lb_value):
        op = lb_value.index('(')
        cp = lb_value.rindex(')')
        return lb_value[op+1:cp]