import hashlib
import json
import os
from unittest import TestCase

import pyexcel

from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_dontknow_answer, \
    process_yes_no_answer, add_storage_locations, SHEETS_FOLDER


class TestProjectsParser(TestCase):
    """Exports every 'from-repository*' workbook under SHEETS_FOLDER to a JSON file."""

    def test_export_from_repository(self):
        """Walk SHEETS_FOLDER, parse each 'from-repository*' workbook, and write
        one ``datasets-<submission_id>.json`` file per workbook.

        Sheet 0 of each workbook is the template's cover sheet and is skipped;
        every following sheet describes one dataset. Cell coordinates such as
        ``sheet[5, 2]`` are (row, column) positions fixed by the import
        template layout — presumably column 2 holds the answers; confirm
        against the template if the layout changes.
        """
        h = hashlib.md5()

        for dir_name, _subdirs, file_list in os.walk(SHEETS_FOLDER):
            for fname in file_list:
                # Guard clause: only 'from-repository*' workbooks are exported.
                if not fname.startswith('from-repository'):
                    continue

                full_file_path = os.path.join(dir_name, fname)
                dataset_list = []
                # The digest accumulates every path seen so far, so each
                # workbook gets a distinct (walk-order-dependent) id.
                h.update(os.fsencode(full_file_path))
                submission_id = 'IMP_FR_{}'.format(str(int(h.hexdigest(), 16)))
                book = pyexcel.get_book(file_name=full_file_path)

                # Sheet 0 is the cover sheet — datasets start at index 1.
                for idx in range(1, book.number_of_sheets()):
                    sheet = book.sheet_by_index(idx)
                    dataset_data = {
                        'source_type': 'From_Repository',
                        'submission_id': submission_id,
                        'local_custodian': get_value_list_from_row(sheet, 2),
                        'title': sheet[5, 2],
                    }

                    if sheet[4, 2]:
                        dataset_data['other_external_id'] = sheet[4, 2]

                    # process_data_types returns (types, free-text notes).
                    datatype_info = process_data_types(get_value_list_from_row(sheet, 7))
                    dataset_data['data_types'] = datatype_info[0]
                    if datatype_info[1]:
                        dataset_data['data_type_notes'] = datatype_info[1]

                    if sheet[8, 2]:
                        dataset_data['de_identification'] = sheet[8, 2]

                    if sheet[9, 2]:
                        dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')

                    if sheet[10, 2]:
                        dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
                            sheet[10, 2])

                    # Notes only apply when the answer was an explicit "yes"
                    # (True); "don't know" answers must not attach notes.
                    if dataset_data.get('has_special_subjects') is True and sheet[11, 2]:
                        dataset_data['special_subject_notes'] = sheet[11, 2]

                    if sheet[14, 2]:
                        dataset_data['access_category'] = sheet[14, 2]

                    dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 19)

                    # GA4GH-coded use restrictions derived from yes/no answers.
                    use_restrictions = []
                    if process_yes_no_answer(sheet[17, 2]):
                        use_restrictions.append({'ga4gh_code': 'PS',
                                                 'note': 'Contract restricts data use to projects ' + ', '.join(
                                                     get_value_list_from_row(sheet, 18))})

                    has_time_limit = process_yes_no_dontknow_answer(sheet[27, 2])
                    if has_time_limit and sheet[28, 2]:
                        use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                                 'note': 'Data is obtained for a limited duration.' + sheet[28, 2]})

                    if process_yes_no_answer(sheet[29, 2]):
                        use_restrictions.append({'ga4gh_code': 'PUB',
                                                 'note': 'Acknowledgement required.'})
                    dataset_data['use_restrictions'] = use_restrictions

                    # Master location is mandatory; backup/copy rows are only
                    # read when the preceding yes/no answer says they exist.
                    # Malformed rows are reported but do not abort the export.
                    storage_locations = []

                    master_locations = get_value_list_from_row(sheet, 21)
                    try:
                        add_storage_locations(storage_locations, master_locations, 'master')
                    except ValueError:
                        print('Invalid Master Data Location Row  {} \n'.format(full_file_path))

                    if process_yes_no_answer(sheet[23, 2]):
                        backup_locations = get_value_list_from_row(sheet, 24)
                        try:
                            add_storage_locations(storage_locations, backup_locations, 'backup')
                        except ValueError:
                            print('Uneven Backup Data Location Row {} \n'.format(full_file_path))

                    if process_yes_no_answer(sheet[25, 2]):
                        copy_locations = get_value_list_from_row(sheet, 26)
                        try:
                            add_storage_locations(storage_locations, copy_locations, 'copy')
                        except ValueError:
                            print('Uneven Copy Data Location Row {} \n'.format(full_file_path))

                    acl_list = get_value_list_from_row(sheet, 22)
                    if acl_list:
                        dataset_data['storage_acl_info'] = ', '.join(acl_list)
                    dataset_data['storage_locations'] = storage_locations
                    dataset_list.append(dataset_data)

                # One JSON output file per workbook, named by submission id.
                with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
                    json.dump(dataset_list, outfile, indent=4)