Commit d24c302a authored by Pinar Alper's avatar Pinar Alper
Browse files

Extended storage resource export

parent 0ba1f1d4
...@@ -239,9 +239,9 @@ ...@@ -239,9 +239,9 @@
}, },
{ {
"elu_accession": "ELU_I_44", "elu_accession": "ELU_I_44",
"institution_name": "London School of Hygiene & Tropical Medicine", "institution_name": "London School of Hygiene & Tropical Medicine, Medical Research Council Unit The Gambia",
"geo_category": "EU", "geo_category": "EU",
"acronym": "LSHTM" "acronym": "LSHTM MRU The Gambia"
}, },
{ {
"elu_accession": "ELU_I_45", "elu_accession": "ELU_I_45",
...@@ -495,10 +495,10 @@ ...@@ -495,10 +495,10 @@
"geo_category": "EU" "geo_category": "EU"
}, },
{ {
"elu_accession": "ELU_I_91", "elu_accession": "ELU_I_91",
"institution_name": "Helmholtz Zentrum München", "institution_name": "Helmholtz Zentrum München",
"geo_category": "EU", "geo_category": "EU",
"acronym": "HMGU" "acronym": "HMGU"
}, },
{ {
"elu_accession": "ELU_I_92", "elu_accession": "ELU_I_92",
......
...@@ -6,7 +6,7 @@ from unittest import TestCase ...@@ -6,7 +6,7 @@ from unittest import TestCase
import pyexcel import pyexcel
from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \ from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \
process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER, process_possible_date process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER, process_possible_date, process_share_list
class TestProjectsParser(TestCase): class TestProjectsParser(TestCase):
...@@ -24,7 +24,7 @@ class TestProjectsParser(TestCase): ...@@ -24,7 +24,7 @@ class TestProjectsParser(TestCase):
submission_id = 'IMP_FC_{}'.format(str(int(h.hexdigest(), 16))) submission_id = 'IMP_FC_{}'.format(str(int(h.hexdigest(), 16)))
book = pyexcel.get_book(file_name=full_file_path) book = pyexcel.get_book(file_name=full_file_path)
idx = 1 idx = 1
#print('----> {}'.format(full_file_path)) print('Processing ----> {}'.format(full_file_path))
while idx < book.number_of_sheets(): while idx < book.number_of_sheets():
# dataset_count+=1 # dataset_count+=1
sheet = book.sheet_by_index(idx) sheet = book.sheet_by_index(idx)
...@@ -58,8 +58,6 @@ class TestProjectsParser(TestCase): ...@@ -58,8 +58,6 @@ class TestProjectsParser(TestCase):
if sheet[7, 2]: if sheet[7, 2]:
dataset_data['samples_location'] = sheet[7, 2] dataset_data['samples_location'] = sheet[7, 2]
if dataset_data['involves_samples'] == False:
print('----> {}'.format('Inconsistent samples information' + full_file_path))
if sheet[8, 2]: if sheet[8, 2]:
dataset_data['de_identification'] = sheet[8, 2] dataset_data['de_identification'] = sheet[8, 2]
...@@ -78,7 +76,7 @@ class TestProjectsParser(TestCase): ...@@ -78,7 +76,7 @@ class TestProjectsParser(TestCase):
collab_insts = get_value_list_from_row(sheet, 13) collab_insts = get_value_list_from_row(sheet, 13)
collab_pis = get_value_list_from_row(sheet, 14) collab_pis = get_value_list_from_row(sheet, 14)
if len(collab_insts) == len(collab_pis) and len(collab_insts) > 0: if (len(collab_insts) == len(collab_pis)) and len(collab_insts) > 0:
i = 0 i = 0
src_collab_list = [] src_collab_list = []
while i < len(collab_insts): while i < len(collab_insts):
...@@ -99,6 +97,9 @@ class TestProjectsParser(TestCase): ...@@ -99,6 +97,9 @@ class TestProjectsParser(TestCase):
else: else:
print('Mismatched Collab PI-Institution length {} \n'.format(full_file_path)) print('Mismatched Collab PI-Institution length {} \n'.format(full_file_path))
if len(collab_insts)>1:
print('Multi source collab ----> {}'.format(full_file_path))
if sheet[18, 2]: if sheet[18, 2]:
dataset_data['source_project'] = sheet[18, 2] dataset_data['source_project'] = sheet[18, 2]
...@@ -125,12 +126,8 @@ class TestProjectsParser(TestCase): ...@@ -125,12 +126,8 @@ class TestProjectsParser(TestCase):
dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 33) dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 33)
if process_yes_no_answer(sheet[29, 2]): if process_yes_no_answer(sheet[29, 2]):
shares = get_value_list_from_row(sheet, 30) dataset_data['shares'] = process_share_list(get_value_list_from_row(sheet, 30))
if len(shares) > 0:
share_list = []
for shr in shares:
share_list.append({'share_notes': shr})
dataset_data['shares'] = share_list
storage_locations = [] storage_locations = []
......
...@@ -26,7 +26,7 @@ class TestProjectsParser(TestCase): ...@@ -26,7 +26,7 @@ class TestProjectsParser(TestCase):
submission_id = 'IMP_FR_{}'.format(str(int(h.hexdigest(), 16))) submission_id = 'IMP_FR_{}'.format(str(int(h.hexdigest(), 16)))
book = pyexcel.get_book(file_name=full_file_path) book = pyexcel.get_book(file_name=full_file_path)
idx = 1 idx = 1
# print('----> {}'.format(full_file_path)) print('Processing ----> {}'.format(full_file_path))
while idx < book.number_of_sheets(): while idx < book.number_of_sheets():
# dataset_count+=1 # dataset_count+=1
sheet = book.sheet_by_index(idx) sheet = book.sheet_by_index(idx)
...@@ -45,7 +45,7 @@ class TestProjectsParser(TestCase): ...@@ -45,7 +45,7 @@ class TestProjectsParser(TestCase):
dataset_data['other_external_id'] = sheet[4, 2] dataset_data['other_external_id'] = sheet[4, 2]
if sheet[5, 2]: if sheet[5, 2]:
dataset_data['title'] = sheet[5, 2] dataset_data['title'] = sheet[5, 2].strip()
if not dataset_data['title']: if not dataset_data['title']:
print('Missing dataset title ----> {}'.format(full_file_path)) print('Missing dataset title ----> {}'.format(full_file_path))
...@@ -55,7 +55,8 @@ class TestProjectsParser(TestCase): ...@@ -55,7 +55,8 @@ class TestProjectsParser(TestCase):
dataset_data['data_types'] = datatype_info[0] dataset_data['data_types'] = datatype_info[0]
if datatype_info[1]: if datatype_info[1]:
dataset_data['data_type_notes'] = datatype_info[1] dataset_data['data_type_notes'] = datatype_info[1]
if datatype_info[1].__contains__('..'):
print('INVLAID DATA TYPE NOTES----> {}'.format(full_file_path))
# for dd in datatype_info[0]: # for dd in datatype_info[0]:
# if dd in datatype_count.keys(): # if dd in datatype_count.keys():
# datatype_count[dd] +=1 # datatype_count[dd] +=1
......
...@@ -6,7 +6,7 @@ from unittest import TestCase ...@@ -6,7 +6,7 @@ from unittest import TestCase
import pyexcel import pyexcel
from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \ from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \
process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER, process_possible_date process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER, process_possible_date, process_share_list
class TestProjectsParser(TestCase): class TestProjectsParser(TestCase):
...@@ -14,8 +14,7 @@ class TestProjectsParser(TestCase): ...@@ -14,8 +14,7 @@ class TestProjectsParser(TestCase):
h = hashlib.md5() h = hashlib.md5()
# count = 0 # count = 0
# custodian_count = {}
# datatype_count={}
for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER): for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER):
for fname in fileList: for fname in fileList:
...@@ -34,11 +33,7 @@ class TestProjectsParser(TestCase): ...@@ -34,11 +33,7 @@ class TestProjectsParser(TestCase):
dataset_data['source_type'] = 'Own_Cohort' dataset_data['source_type'] = 'Own_Cohort'
dataset_data['submission_id'] = submission_id dataset_data['submission_id'] = submission_id
dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3) dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
# for cc in dataset_data['local_custodian']:
# if cc in custodian_count.keys():
# custodian_count[cc] +=1
# else:
# custodian_count[cc] =1
dataset_data['title'] = sheet[4, 2] dataset_data['title'] = sheet[4, 2]
if not dataset_data['title']: if not dataset_data['title']:
print('Missing dataset title ----> {}'.format(full_file_path)) print('Missing dataset title ----> {}'.format(full_file_path))
...@@ -50,11 +45,6 @@ class TestProjectsParser(TestCase): ...@@ -50,11 +45,6 @@ class TestProjectsParser(TestCase):
if datatype_info[1]: if datatype_info[1]:
dataset_data['data_type_notes'] = datatype_info[1] dataset_data['data_type_notes'] = datatype_info[1]
# for dd in datatype_info[0]:
# if dd in datatype_count.keys():
# datatype_count[dd] +=1
# else:
# datatype_count[dd] =1
dataset_data['involves_samples'] = process_yes_no_answer(sheet[7, 2]) dataset_data['involves_samples'] = process_yes_no_answer(sheet[7, 2])
...@@ -116,22 +106,13 @@ class TestProjectsParser(TestCase): ...@@ -116,22 +106,13 @@ class TestProjectsParser(TestCase):
share_list = [] share_list = []
if process_yes_no_answer(sheet[27, 2]): if process_yes_no_answer(sheet[27, 2]):
luxembourg_shares = get_value_list_from_row(sheet, 28) share_list += process_share_list(get_value_list_from_row(sheet, 28))
if len(luxembourg_shares) > 0:
for shr in luxembourg_shares:
share_list.append({'share_notes': shr, 'share_location_type': 'National'})
if process_yes_no_answer(sheet[30, 2]): if process_yes_no_answer(sheet[30, 2]):
eu_shares = get_value_list_from_row(sheet, 31) share_list += process_share_list(get_value_list_from_row(sheet, 31))
if len(eu_shares) > 0:
for shr in eu_shares:
share_list.append({'share_notes': shr, 'share_location_type': 'EU'})
if process_yes_no_answer(sheet[33, 2]): if process_yes_no_answer(sheet[33, 2]):
noneu_shares = get_value_list_from_row(sheet, 34) share_list += process_share_list(get_value_list_from_row(sheet, 34))
if len(noneu_shares) > 0:
for shr in noneu_shares:
share_list.append({'share_notes': shr, 'share_location_type': 'Non-EU'})
dataset_data['shares'] = share_list dataset_data['shares'] = share_list
......
...@@ -57,7 +57,6 @@ class TestProjectsParser(TestCase): ...@@ -57,7 +57,6 @@ class TestProjectsParser(TestCase):
def test_duplicate_dataset_title(self): def test_duplicate_dataset_title(self):
titles = set() titles = set()
for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER): for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER):
......
...@@ -103,41 +103,63 @@ def process_data_types(xls_data_type_list): ...@@ -103,41 +103,63 @@ def process_data_types(xls_data_type_list):
data_type_notes += type_name + '\n' data_type_notes += type_name + '\n'
return (result, data_type_notes) return (result, data_type_notes)
# Canonical names of the storage resources recognised by the importer.
# Anything outside this set is reported and treated as 'Other' by callers.
predefined_types = set([
    'hpc_chaos_home',
    'hpc_chaos_project',
    'hpc_gaia_home',
    'hpc_gaia_project',
    'hpc_gaia_work',
    'hpc_iris_home',
    'hpc_iris_project',
    'hpc_scratch_personal',
    'hpc_scratch_project',
    'hpc_isilon',
    'atlas_personal',
    'atlas_project',
    'hpc_backup_chaos',
    'hpc_backup_gaia',
    'bertha',
    'certon_block',
    'lcsb_group_server',
    'lcsb_desktop',
    'lcsb_laptop',
    'personal_laptop',
    'Owncloud',
    'External Storage (e.g. Hard disk, DVD)',
    'Other'
])


def is_storage_resource(resource):
    """Return True if *resource* is one of the predefined storage types.

    Unknown resources are reported on stdout (the importer runs as a test
    suite, so print is the established reporting channel here) and False
    is returned so the caller can fall back to 'Other'.
    """
    if resource in predefined_types:
        return True
    else:
        # Fixed typo in the diagnostic message ('Unknow' -> 'Unknown').
        print('Unknown Storage resource --> {}'.format(resource))
        return False
def get_storage_location(resource, path, category):
    """Build a storage-location record for one (resource, path) pair.

    The storage_resource field is resolved in priority order: a path that
    points at a hosted application wins, then an exact match against the
    module-level ``predefined_types`` set, and anything else collapses to
    'Other'.  The path itself is wrapped in a nested ``location`` dict,
    matching the export schema consumed downstream.
    """
    if is_application(path):
        storage_resource = 'application'
    elif resource in predefined_types:
        storage_resource = resource
    else:
        storage_resource = 'Other'

    return {
        'storage_resource': storage_resource,
        'location': {'location': path},
        'category': category,
    }
def is_application(path):
    """Return True when *path* names a hosted application (TranSMART or REDCap)
    rather than a plain filesystem location.  Matching is case-insensitive.
    """
    lowered = path.lower()
    return ('transmart' in lowered) or ('redcap' in lowered)
def process_yes_no_answer(answer): def process_yes_no_answer(answer):
""" """
convert yes/no answers to boolean we take empty answers as no convert yes/no answers to boolean we take empty answers as no
...@@ -167,18 +189,26 @@ def process_yes_no_dontknow_answer(answer): ...@@ -167,18 +189,26 @@ def process_yes_no_dontknow_answer(answer):
return None return None
def process_share_list(shares):
    """Convert raw share cell values into share record dicts.

    Each entry is either a bare note, or an ``institution; notes`` pair
    separated by the first semicolon.  Splitting on only the FIRST ';'
    (maxsplit=1) fixes silent data loss in the original, which discarded
    everything after a second semicolon in the notes text.
    """
    share_list = []
    for shr in shares:
        if ';' not in shr:
            share_list.append({'share_notes': shr})
        else:
            inst, notes = shr.split(';', 1)
            share_list.append({'share_inst': inst.strip(),
                               'share_notes': notes.strip()})
    return share_list
def add_storage_locations(storage_dict, locations_list, category): def add_storage_locations(storage_dict, locations_list, category):
if len(locations_list) % 2 != 0 and len(locations_list) > 0: if len(locations_list) % 2 != 0 and len(locations_list) > 0:
if len(locations_list) == 1: if len(locations_list) == 1:
if is_storage_resource(locations_list[0]): if is_storage_resource(locations_list[0]):
storage_dict.append( storage_dict.append(get_storage_location(locations_list[0],'<missing_info>',category))
{'storage_resource': locations_list[0], 'location': '<missing_info>',
'category': category})
else: else:
for line in get_lines_from_string(locations_list[0]): for line in get_lines_from_string(locations_list[0]):
storage_dict.append( storage_dict.append(get_storage_location('Other',line,category))
{'storage_resource': 'Other', 'location': line,
'category': category})
else: else:
raise ValueError('Uneven Master Data Location Row') raise ValueError('Uneven Master Data Location Row')
elif len(locations_list) % 2 == 0 and len(locations_list) > 0: elif len(locations_list) % 2 == 0 and len(locations_list) > 0:
...@@ -187,21 +217,10 @@ def add_storage_locations(storage_dict, locations_list, category): ...@@ -187,21 +217,10 @@ def add_storage_locations(storage_dict, locations_list, category):
while s < e: while s < e:
if is_storage_resource(locations_list[s * 2]): if is_storage_resource(locations_list[s * 2]):
for line in get_lines_from_string(locations_list[s * 2 + 1]): for line in get_lines_from_string(locations_list[s * 2 + 1]):
storage_dict.append( storage_dict.append(get_storage_location(locations_list[s * 2], line, category))
{'storage_resource': locations_list[s * 2], 'location': line,
'category': category})
else: else:
for line in get_lines_from_string(locations_list[s * 2]): for line in get_lines_from_string(locations_list[s * 2]):
storage_dict.append( storage_dict.append(get_storage_location('Other', line, category))
{'storage_resource': 'Other', 'location': line,
'category': category})
# res = locations_list[s * 2] if locations_list[s * 2] else 'Other'
#
# storage_dict.append({'storage_resource': res,
# 'location': locations_list[s * 2 + 1],
# 'category': category})
s += 1 s += 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment