Commit d24c302a authored by Pinar Alper's avatar Pinar Alper
Browse files

Extended storage resource export

parent 0ba1f1d4
......@@ -239,9 +239,9 @@
},
{
"elu_accession": "ELU_I_44",
"institution_name": "London School of Hygiene & Tropical Medicine",
"institution_name": "London School of Hygiene & Tropical Medicine, Medical Research Council Unit The Gambia",
"geo_category": "EU",
"acronym": "LSHTM"
"acronym": "LSHTM MRU The Gambia"
},
{
"elu_accession": "ELU_I_45",
......@@ -495,10 +495,10 @@
"geo_category": "EU"
},
{
"elu_accession": "ELU_I_91",
"institution_name": "Helmholtz Zentrum München",
"geo_category": "EU",
"acronym": "HMGU"
"elu_accession": "ELU_I_91",
"institution_name": "Helmholtz Zentrum München",
"geo_category": "EU",
"acronym": "HMGU"
},
{
"elu_accession": "ELU_I_92",
......
......@@ -6,7 +6,7 @@ from unittest import TestCase
import pyexcel
from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \
process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER, process_possible_date
process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER, process_possible_date, process_share_list
class TestProjectsParser(TestCase):
......@@ -24,7 +24,7 @@ class TestProjectsParser(TestCase):
submission_id = 'IMP_FC_{}'.format(str(int(h.hexdigest(), 16)))
book = pyexcel.get_book(file_name=full_file_path)
idx = 1
#print('----> {}'.format(full_file_path))
print('Processing ----> {}'.format(full_file_path))
while idx < book.number_of_sheets():
# dataset_count+=1
sheet = book.sheet_by_index(idx)
......@@ -58,8 +58,6 @@ class TestProjectsParser(TestCase):
if sheet[7, 2]:
dataset_data['samples_location'] = sheet[7, 2]
if dataset_data['involves_samples'] == False:
print('----> {}'.format('Inconsistent samples information' + full_file_path))
if sheet[8, 2]:
dataset_data['de_identification'] = sheet[8, 2]
......@@ -78,7 +76,7 @@ class TestProjectsParser(TestCase):
collab_insts = get_value_list_from_row(sheet, 13)
collab_pis = get_value_list_from_row(sheet, 14)
if len(collab_insts) == len(collab_pis) and len(collab_insts) > 0:
if (len(collab_insts) == len(collab_pis)) and len(collab_insts) > 0:
i = 0
src_collab_list = []
while i < len(collab_insts):
......@@ -99,6 +97,9 @@ class TestProjectsParser(TestCase):
else:
print('Mismatched Collab PI-Institution length {} \n'.format(full_file_path))
if len(collab_insts)>1:
print('Multi source collab ----> {}'.format(full_file_path))
if sheet[18, 2]:
dataset_data['source_project'] = sheet[18, 2]
......@@ -125,12 +126,8 @@ class TestProjectsParser(TestCase):
dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 33)
if process_yes_no_answer(sheet[29, 2]):
shares = get_value_list_from_row(sheet, 30)
if len(shares) > 0:
share_list = []
for shr in shares:
share_list.append({'share_notes': shr})
dataset_data['shares'] = share_list
dataset_data['shares'] = process_share_list(get_value_list_from_row(sheet, 30))
storage_locations = []
......
......@@ -26,7 +26,7 @@ class TestProjectsParser(TestCase):
submission_id = 'IMP_FR_{}'.format(str(int(h.hexdigest(), 16)))
book = pyexcel.get_book(file_name=full_file_path)
idx = 1
# print('----> {}'.format(full_file_path))
print('Processing ----> {}'.format(full_file_path))
while idx < book.number_of_sheets():
# dataset_count+=1
sheet = book.sheet_by_index(idx)
......@@ -45,7 +45,7 @@ class TestProjectsParser(TestCase):
dataset_data['other_external_id'] = sheet[4, 2]
if sheet[5, 2]:
dataset_data['title'] = sheet[5, 2]
dataset_data['title'] = sheet[5, 2].strip()
if not dataset_data['title']:
print('Missing dataset title ----> {}'.format(full_file_path))
......@@ -55,7 +55,8 @@ class TestProjectsParser(TestCase):
dataset_data['data_types'] = datatype_info[0]
if datatype_info[1]:
dataset_data['data_type_notes'] = datatype_info[1]
if datatype_info[1].__contains__('..'):
print('INVLAID DATA TYPE NOTES----> {}'.format(full_file_path))
# for dd in datatype_info[0]:
# if dd in datatype_count.keys():
# datatype_count[dd] +=1
......
......@@ -6,7 +6,7 @@ from unittest import TestCase
import pyexcel
from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \
process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER, process_possible_date
process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER, process_possible_date, process_share_list
class TestProjectsParser(TestCase):
......@@ -14,8 +14,7 @@ class TestProjectsParser(TestCase):
h = hashlib.md5()
# count = 0
# custodian_count = {}
# datatype_count={}
for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER):
for fname in fileList:
......@@ -34,11 +33,7 @@ class TestProjectsParser(TestCase):
dataset_data['source_type'] = 'Own_Cohort'
dataset_data['submission_id'] = submission_id
dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
# for cc in dataset_data['local_custodian']:
# if cc in custodian_count.keys():
# custodian_count[cc] +=1
# else:
# custodian_count[cc] =1
dataset_data['title'] = sheet[4, 2]
if not dataset_data['title']:
print('Missing dataset title ----> {}'.format(full_file_path))
......@@ -50,11 +45,6 @@ class TestProjectsParser(TestCase):
if datatype_info[1]:
dataset_data['data_type_notes'] = datatype_info[1]
# for dd in datatype_info[0]:
# if dd in datatype_count.keys():
# datatype_count[dd] +=1
# else:
# datatype_count[dd] =1
dataset_data['involves_samples'] = process_yes_no_answer(sheet[7, 2])
......@@ -116,22 +106,13 @@ class TestProjectsParser(TestCase):
share_list = []
if process_yes_no_answer(sheet[27, 2]):
luxembourg_shares = get_value_list_from_row(sheet, 28)
if len(luxembourg_shares) > 0:
for shr in luxembourg_shares:
share_list.append({'share_notes': shr, 'share_location_type': 'National'})
share_list += process_share_list(get_value_list_from_row(sheet, 28))
if process_yes_no_answer(sheet[30, 2]):
eu_shares = get_value_list_from_row(sheet, 31)
if len(eu_shares) > 0:
for shr in eu_shares:
share_list.append({'share_notes': shr, 'share_location_type': 'EU'})
share_list += process_share_list(get_value_list_from_row(sheet, 31))
if process_yes_no_answer(sheet[33, 2]):
noneu_shares = get_value_list_from_row(sheet, 34)
if len(noneu_shares) > 0:
for shr in noneu_shares:
share_list.append({'share_notes': shr, 'share_location_type': 'Non-EU'})
share_list += process_share_list(get_value_list_from_row(sheet, 34))
dataset_data['shares'] = share_list
......
......@@ -57,7 +57,6 @@ class TestProjectsParser(TestCase):
def test_duplicate_dataset_title(self):
titles = set()
for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER):
......
......@@ -103,41 +103,63 @@ def process_data_types(xls_data_type_list):
data_type_notes += type_name + '\n'
return (result, data_type_notes)
# Canonical names of the storage platforms recognised by the importer.
predefined_types = {
    'hpc_chaos_home',
    'hpc_chaos_project',
    'hpc_gaia_home',
    'hpc_gaia_project',
    'hpc_gaia_work',
    'hpc_iris_home',
    'hpc_iris_project',
    'hpc_scratch_personal',
    'hpc_scratch_project',
    'hpc_isilon',
    'atlas_personal',
    'atlas_project',
    'hpc_backup_chaos',
    'hpc_backup_gaia',
    'bertha',
    'certon_block',
    'lcsb_group_server',
    'lcsb_desktop',
    'lcsb_laptop',
    'personal_laptop',
    'Owncloud',
    'External Storage (e.g. Hard disk, DVD)',
    'Other',
}


def is_storage_resource(resource):
    """Return True if *resource* names one of the predefined storage types.

    Unrecognised resources are reported on stdout and rejected so the
    caller can fall back to the generic 'Other' category.
    """
    if resource in predefined_types:
        return True
    # Fixed typo in the diagnostic message ("Unknow" -> "Unknown").
    print('Unknown Storage resource --> {}'.format(resource))
    return False
# NOTE(review): diff-garbled fragment — the scrape has fused a replacement
# `is_storage_resource` header, a stray `result = []`, and a module-level
# lookup set (note 'OTHER' here vs 'Other' in the earlier set); the lines
# below are not valid Python as-is and must be reconstructed from the
# repository before use.
def is_storage_resource(location):
result = []
predefined_types = set([
'hpc_chaos_home',
'hpc_chaos_project',
'hpc_gaia_home',
'hpc_gaia_project',
'hpc_gaia_work',
'hpc_iris_home',
'hpc_iris_project',
'hpc_scratch_personal',
'hpc_scratch_project',
'hpc_isilon',
'atlas_personal',
'atlas_project',
'hpc_backup_chaos',
'hpc_backup_gaia',
'bertha',
'certon_block',
'lcsb_group_server',
'lcsb_desktop',
'lcsb_laptop',
'personal_laptop',
'Owncloud',
'External Storage (e.g. Hard disk, DVD)',
'OTHER'
])
def get_storage_location(resource, path, category):
    """Build a storage-location record for *resource* holding data at *path*.

    The resource is normalised to 'application' when the path points at a
    hosted application, kept as-is when it is a predefined storage type,
    and downgraded to 'Other' otherwise.

    Returns a dict with 'storage_resource', 'location' and 'category' keys.
    """
    result = {}
    if is_application(path):
        result['storage_resource'] = 'application'
    elif resource in predefined_types:
        result['storage_resource'] = resource
    else:
        result['storage_resource'] = 'Other'
    result['location'] = {'location': path}
    result['category'] = category
    # Fixed: the record is now returned unconditionally.  The original
    # guarded the return with `if location in predefined_types:` where
    # `location` is undefined (NameError), and would otherwise have
    # returned None on the common path.
    return result
def is_application(path):
    """Return True when *path* refers to a hosted application.

    Matches case-insensitively against the known application names
    (transMART, REDCap) anywhere in the path string.
    """
    lowered = path.lower()
    # Return the boolean expression directly instead of the redundant
    # `if ...: return True else: return False` form.
    return ("transmart" in lowered) or ("redcap" in lowered)
def process_yes_no_answer(answer):
"""
convert yes/no answers to boolean we take empty answers as no
......@@ -167,18 +189,26 @@ def process_yes_no_dontknow_answer(answer):
return None
def process_share_list(shares):
    """Convert raw share strings into share record dicts.

    An entry without a ';' becomes ``{'share_notes': entry}``.  An entry
    of the form ``institution;notes`` becomes
    ``{'share_inst': institution, 'share_notes': notes}`` with both
    fields stripped of surrounding whitespace.
    """
    share_list = []
    for shr in shares:
        if ";" not in shr:
            share_list.append({'share_notes': shr})
        else:
            # split(';', 1): keep any further semicolons inside the notes
            # field instead of silently discarding everything past the
            # second segment (the original unbounded split dropped it).
            inst, notes = shr.split(";", 1)
            share_list.append({'share_inst': inst.strip(),
                               'share_notes': notes.strip()})
    return share_list
def add_storage_locations(storage_dict, locations_list, category):
if len(locations_list) % 2 != 0 and len(locations_list) > 0:
if len(locations_list) == 1:
if is_storage_resource(locations_list[0]):
storage_dict.append(
{'storage_resource': locations_list[0], 'location': '<missing_info>',
'category': category})
storage_dict.append(get_storage_location(locations_list[0],'<missing_info>',category))
else:
for line in get_lines_from_string(locations_list[0]):
storage_dict.append(
{'storage_resource': 'Other', 'location': line,
'category': category})
storage_dict.append(get_storage_location('Other',line,category))
else:
raise ValueError('Uneven Master Data Location Row')
elif len(locations_list) % 2 == 0 and len(locations_list) > 0:
......@@ -187,21 +217,10 @@ def add_storage_locations(storage_dict, locations_list, category):
while s < e:
if is_storage_resource(locations_list[s * 2]):
for line in get_lines_from_string(locations_list[s * 2 + 1]):
storage_dict.append(
{'storage_resource': locations_list[s * 2], 'location': line,
'category': category})
storage_dict.append(get_storage_location(locations_list[s * 2], line, category))
else:
for line in get_lines_from_string(locations_list[s * 2]):
storage_dict.append(
{'storage_resource': 'Other', 'location': line,
'category': category})
# res = locations_list[s * 2] if locations_list[s * 2] else 'Other'
#
# storage_dict.append({'storage_resource': res,
# 'location': locations_list[s * 2 + 1],
# 'category': category})
storage_dict.append(get_storage_location('Other', line, category))
s += 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment