Commit 13742219 authored by Pinar Alper

XLS to JSON exporters adapted to the new DAISY model

parent 95d87fe4
@@ -103,20 +103,25 @@ class DatasetExporter:
if self.is_application(path):
result['storage_resource'] = 'application'
elif resource in self.predefined_data_types:
elif resource in self.predefined_storage_types:
result['storage_resource'] = resource
else:
result['storage_resource'] = 'Other'
result['location'] = {'location':path}
path_lines = []
path_lines.extend(get_lines_from_string(path))
result['locations'] = path_lines
result['category'] = category
return result
def get_samples_storage(self, sample_location):
return [{'storage_resource':'sample-storage', 'locations':[sample_location], 'category':'master'}]
def is_application(self, path):
if ("transmart" in path.lower()) or ( "redcap" in path.lower()):
if ("transmart" in path.lower()) or ("redcap" in path.lower()):
return True
else:
return False
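As a quick orientation aid, a sketch of how the classifier above is meant to behave, assuming the get_storage_location(resource, path, category) signature visible at the call sites; the exporter instance and all values are illustrative:

exporter = DatasetExporter()  # hypothetical: any constructor arguments are omitted
exporter.is_application('https://redcap.example/surveys/')  # True
exporter.is_application('/work/projects/x/rawdata')         # False
# A recognised application path overrides the declared resource:
loc = exporter.get_storage_location('atlas', 'https://transmart.example/study1', 'master')
# loc['storage_resource'] == 'application'; loc['locations'] holds the path line(s)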
@@ -137,15 +142,14 @@ class DatasetExporter:
'share_notes': infos[1].strip()})
return share_list
def add_storage_locations(self, storage_dict, locations_list, category):
def build_storage_locations(self, locations_list, category):
result = []
if len(locations_list) % 2 != 0 and len(locations_list) > 0:
if len(locations_list) == 1:
if self.is_storage_resource(locations_list[0]):
storage_dict.append(self.get_storage_location(locations_list[0],'<missing_info>',category))
result.append(self.get_storage_location(locations_list[0],'<missing_info>',category))
else:
for line in get_lines_from_string(locations_list[0]):
storage_dict.append(self.get_storage_location('Other',line,category))
result.append(self.get_storage_location('Other',locations_list[0],category))
else:
raise ValueError('Uneven Master Data Location Row')
elif len(locations_list) % 2 == 0 and len(locations_list) > 0:
@@ -153,11 +157,9 @@ class DatasetExporter:
e = len(locations_list) // 2
while s < e:
if self.is_storage_resource(locations_list[s * 2]):
for line in get_lines_from_string(locations_list[s * 2 + 1]):
storage_dict.append(self.get_storage_location(locations_list[s * 2], line, category))
result.append(self.get_storage_location(locations_list[s * 2], locations_list[s*2+1], category))
else:
for line in get_lines_from_string(locations_list[s * 2]):
storage_dict.append(self.get_storage_location('Other', line, category))
result.append(self.get_storage_location('Other', locations_list[s * 2], category))
s += 1
return result
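The reworked helper returns the built list instead of appending into a caller-supplied storage_dict. A minimal sketch of the expected row shapes, with invented values:

locations = ['atlas', '/work/projects/x/master', 'owncloud', 'https://owncloud.example/s/abc']
exporter.build_storage_locations(locations, 'master')
# -> [{'storage_resource': 'atlas', 'locations': [...], 'category': 'master'},
#     {'storage_resource': 'owncloud', 'locations': [...], 'category': 'master'}]
# A lone cell is tolerated ('<missing_info>' stands in for the path); any other
# odd-length row raises ValueError('Uneven Master Data Location Row').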
import datetime
from datetime import datetime
import datetime as dt
def process_yes_no_answer(answer):
"""
Convert yes/no answers to boolean; empty answers are treated as no.
@@ -7,7 +7,7 @@ def process_yes_no_answer(answer):
"""
result = False
if answer:
if answer == 'Yes':
if answer.lower() == 'yes':
result = True
return result
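A short sketch of the case handling this hunk relaxes (inputs illustrative):

process_yes_no_answer('Yes')  # True
process_yes_no_answer('yes')  # True after this change; previously False
process_yes_no_answer('')     # False: empty answers count as no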
@@ -19,9 +19,9 @@ def process_yes_no_dontknow_answer(answer):
:param answer:
"""
if answer:
if answer == 'Yes':
if answer.lower() == 'yes':
return True
elif answer == 'No':
elif answer.lower() == 'no':
return False
else:
return None
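And the tri-state variant, which separates an explicit no from a missing or unknown answer:

process_yes_no_dontknow_answer('no')          # False (now case-insensitive)
process_yes_no_dontknow_answer("Don't know")  # None
process_yes_no_dontknow_answer('')            # None: falls through the if-answer guard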
@@ -45,10 +45,16 @@ def get_value_list_from_row(sheet, row_idx):
def process_possible_date(possible_date):
if isinstance(possible_date, datetime.date):
return possible_date.strftime("%Y/%m/%d")
if isinstance(possible_date, dt.date):
return possible_date.strftime("%Y-%m-%d")
elif isinstance(possible_date, int):
return ""
else:
return str(possible_date).replace('.', '/')
try:
d = datetime.strptime(possible_date.replace('/','.'), '%d.%m.%Y')
return d.strftime("%Y-%m-%d")
except ValueError as e:
return ""
@@ -6,75 +6,62 @@ from metadata_tools.importxls.export_utils import get_value_list_from_row, proce
class FromCollabXlsExporter(DatasetExporter):
def export(self, full_file_path):
submission_id = 'IMP_FC_{}'.format(self.get_hash_for_path(full_file_path))
def export_datadecs(self, full_file_path):
result = []
idx = 1
print('Processing ----> {}'.format(full_file_path))
#print('Processing ----> {}'.format(full_file_path))
book = pyexcel.get_book(file_name=full_file_path)
while idx < book.number_of_sheets():
sheet = book.sheet_by_index(idx)
dataset_data = {}
dataset_data['source_type'] = 'From_Collaborator'
dataset_data['submission_id'] = submission_id
dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
dataset_data['title'] = sheet[4, 2]
if not dataset_data['title']:
print('Missing dataset title ----> {}'.format(full_file_path))
datadec_data = {}
datadec_data['dataset'] = sheet[1, 2]
datadec_data['title'] = sheet[4, 2]
if not datadec_data['title']:
print('Missing data title ----> {}'.format(full_file_path))
datatype_info = self.process_data_types(get_value_list_from_row(sheet, 5))
dataset_data['data_types'] = datatype_info[0]
if datatype_info[1]:
dataset_data['data_type_notes'] = datatype_info[1]
dataset_data['involves_samples'] = process_yes_no_answer(sheet[6, 2])
if datatype_info[1]:
datadec_data['data_type_notes'] = datatype_info[1]
# if it involves samples, add it as a data type
if process_yes_no_answer(sheet[6, 2]):
datatype_info[0].append('Samples')
datadec_data['data_types'] = datatype_info[0]
if sheet[7, 2]:
dataset_data['samples_location'] = sheet[7, 2]
if sheet[8, 2]:
dataset_data['de_identification'] = sheet[8, 2]
datadec_data['de_identification'] = sheet[8, 2]
if sheet[9, 2]:
dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
datadec_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
if sheet[10, 2]:
dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
datadec_data['has_special_subjects'] = process_yes_no_dontknow_answer(
sheet[10, 2])
if dataset_data.get('has_special_subjects'):
if dataset_data.get('has_special_subjects') == True and sheet[11, 2]:
dataset_data['special_subject_notes'] = sheet[11, 2]
collab_insts = get_value_list_from_row(sheet, 13)
collab_pis = get_value_list_from_row(sheet, 14)
if datadec_data.get('has_special_subjects'):
if datadec_data.get('has_special_subjects') == True and sheet[11, 2]:
datadec_data['special_subject_notes'] = sheet[11, 2]
if (len(collab_insts) == len(collab_pis)) and len(collab_insts) > 0:
i = 0
src_collab_list = []
while i < len(collab_insts):
collab_inst = sheet[13, 2]
collab_pi = sheet[14, 2]
collab_data = {'collab_inst': self.lookup_institution_accession(collab_insts[i]),
'collab_pi': collab_pis[i],
'collab_project': sheet[18, 2]}
if collab_inst and collab_pi:
collab_dict = {}
collab_dict['collab_inst'] = self.lookup_institution_accession(collab_inst)
collab_dict['collab_pi'] = collab_pi
if sheet[18, 2]:
collab_dict['collab_project'] = sheet[18, 2]
if process_yes_no_dontknow_answer(sheet[17, 2]) == False:
collab_data['collab_role'] = 'controller'
if process_yes_no_dontknow_answer(sheet[17, 2]) == False:
collab_dict['collab_role'] = 'controller'
elif process_yes_no_dontknow_answer(sheet[17, 2]) == True:
collab_data['collab_role'] = 'joint-controller'
src_collab_list.append(collab_data)
i += 1
dataset_data['source_collaborations'] = src_collab_list
elif process_yes_no_dontknow_answer(sheet[17, 2]) == True:
collab_dict['collab_role'] = 'joint_controller'
datadec_data['source_collaboration'] = collab_dict
datadec_data['source_notes'] = 'Data is from collaborator.'
else:
print('Mismatched Collab PI-Institution length {} \n'.format(full_file_path))
if len(collab_insts)>1:
print('Multi source collab ----> {}'.format(full_file_path))
if sheet[18, 2]:
dataset_data['source_project'] = sheet[18, 2]
print('Missing collaborator information {}\n'.format(full_file_path))
use_restrictions = []
if process_yes_no_answer(sheet[25, 2]):
@@ -94,41 +81,62 @@ class FromCollabXlsExporter(DatasetExporter):
use_restrictions.append({'ga4gh_code': 'TS-[XX]',
'note': 'Data is obtained for a limited duration. ' + process_possible_date(sheet[42, 2])})
dataset_data['use_restrictions'] = use_restrictions
datadec_data['use_restrictions'] = use_restrictions
idx += 1
result.append(datadec_data)
return result
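For context, each element of the returned list is a data-declaration dict shaped roughly as follows; field values are illustrative, not taken from a real sheet:

# {'dataset': <sheet[1, 2]>,
#  'title': <sheet[4, 2]>,
#  'data_types': ['Omics_data', 'Samples'],
#  'de_identification': ..., 'subject_categories': ...,
#  'source_collaboration': {'collab_inst': ..., 'collab_pi': ...,
#                           'collab_role': 'controller' or 'joint_controller'},
#  'source_notes': 'Data is from collaborator.',
#  'use_restrictions': [{'ga4gh_code': ..., 'note': ...}]}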
def export_datasets(self, full_file_path):
result = []
idx = 1
print('Processing ----> {}'.format(full_file_path))
book = pyexcel.get_book(file_name=full_file_path)
while idx < book.number_of_sheets():
sheet = book.sheet_by_index(idx)
dataset_data = {}
dataset_data['title'] = sheet[1, 2]
dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
if not dataset_data['title']:
print('Missing dataset title ----> {}'.format(full_file_path))
dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 33)
if sheet[33, 2]:
dataset_data['project'] = sheet[33, 2]
if process_yes_no_answer(sheet[29, 2]):
dataset_data['shares'] = self.process_share_list(get_value_list_from_row(sheet, 30))
storage_locations = []
master_locations = get_value_list_from_row(sheet, 35)
try:
self.add_storage_locations(storage_locations, master_locations, 'master')
storage_locations.extend(self.build_storage_locations(master_locations, 'master'))
except ValueError as e:
print('Invalid Master Data Location Row {} \n'.format(full_file_path))
master_acl_list = get_value_list_from_row(sheet, 36)
if len(master_acl_list) > 0:
for loc in storage_locations:
loc['storage_acl_info'] = ', '.join(master_acl_list)
if process_yes_no_answer(sheet[37, 2]):
backup_locations = get_value_list_from_row(sheet, 38)
try:
self.add_storage_locations(storage_locations, backup_locations, 'backup')
storage_locations.extend(self.build_storage_locations(backup_locations, 'backup'))
except ValueError as e:
print('Uneven Backup Data Location Row {} \n'.format(full_file_path))
if process_yes_no_answer(sheet[39, 2]):
copy_locations = get_value_list_from_row(sheet, 40)
try:
self.add_storage_locations(storage_locations, copy_locations, 'copy')
storage_locations.extend(self.build_storage_locations(copy_locations, 'copy'))
except ValueError as e:
print('Uneven Copy Data Location Row {} \n'.format(full_file_path))
acl_list = get_value_list_from_row(sheet, 36)
if len(acl_list) > 0:
dataset_data['storage_acl_info'] = ', '.join(acl_list)
if process_yes_no_answer(sheet[6, 2]):
storage_locations.extend(self.get_samples_storage(sheet[7, 2]))
dataset_data['storage_locations'] = storage_locations
idx += 1
with open('{}_.json'.format(submission_id), 'w') as outfile:
json.dump(dataset_data, outfile, indent=4)
result.append(dataset_data)
return result
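With the per-submission JSON dump gone from the exporter, serialisation is now the caller's responsibility; a hypothetical driver (file names assumed, not part of the commit):

exporter = FromCollabXlsExporter()
datasets = exporter.export_datasets('from_collaborator.xlsx')
datadecs = exporter.export_datadecs('from_collaborator.xlsx')
with open('datasets.json', 'w') as outfile:
    json.dump(datasets, outfile, indent=4)
with open('datadecs.json', 'w') as outfile:
    json.dump(datadecs, outfile, indent=4)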
@@ -6,56 +6,51 @@ from metadata_tools.importxls.export_utils import get_value_list_from_row, proce
class FromOwncohortXlsExporter(DatasetExporter):
def export(self, full_file_path):
#submission_id = 'IMP_OC_{}'.format(self.get_hash_for_path(full_file_path))
def export_datadecs(self, full_file_path):
result = []
book = pyexcel.get_book(file_name=full_file_path)
idx = 1
print('----> {}'.format(full_file_path))
#print('----> {}'.format(full_file_path))
while idx < book.number_of_sheets():
sheet = book.sheet_by_index(idx)
dataset_data = {}
#dataset_data['source_type'] = 'Own_Cohort'
dataset_data['dataset'] = submission_id
dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
dataset_data['title'] = sheet[4, 2]
if not dataset_data['title']:
datadec_data = {}
datadec_data['dataset'] = sheet[1, 2]
datadec_data['title'] = sheet[4, 2]
if not datadec_data['title']:
print('Missing dataset title ----> {}'.format(full_file_path))
if sheet[5, 2]:
dataset_data['source_project'] = sheet[5, 2]
datadec_data['source_study'] = sheet[5, 2]
datatype_info = self.process_data_types(get_value_list_from_row(sheet, 6))
dataset_data['data_types'] = datatype_info[0]
if datatype_info[1]:
dataset_data['data_type_notes'] = datatype_info[1]
datadec_data['data_type_notes'] = datatype_info[1]
dataset_data['involves_samples'] = process_yes_no_answer(sheet[7, 2])
datadec_data['source_notes'] = 'Data is from own cohort.'
# if it involves samples, add it as a data type
if process_yes_no_answer(sheet[7, 2]):
datatype_info[0].append('Samples')
datadec_data['data_types'] = datatype_info[0]
if sheet[7, 2]:
dataset_data['samples_location'] = sheet[8, 2]
if sheet[9, 2]:
dataset_data['de_identification'] = sheet[9, 2]
datadec_data['de_identification'] = sheet[9, 2]
if sheet[10, 2]:
dataset_data['ombudsman'] = sheet[10, 2]
datadec_data['ombudsman'] = sheet[10, 2]
if sheet[11, 2]:
dataset_data['subject_categories'] = sheet[11, 2].replace(' & ', '_and_')
datadec_data['subject_categories'] = sheet[11, 2].replace(' & ', '_and_')
if sheet[12, 2]:
dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
datadec_data['has_special_subjects'] = process_yes_no_dontknow_answer(
sheet[12, 2])
if dataset_data.get('has_special_subjects'):
if dataset_data.get('has_special_subjects') == True and sheet[13, 2]:
dataset_data['special_subject_notes'] = sheet[13, 2]
if datadec_data.get('has_special_subjects'):
if datadec_data.get('has_special_subjects') == True and sheet[13, 2]:
datadec_data['special_subject_notes'] = sheet[13, 2]
if sheet[19, 2]:
dataset_data['consent_status'] = sheet[19, 2]
dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 22)
datadec_data['consent_status'] = sheet[19, 2].lower()
use_restrictions = []
if process_yes_no_answer(sheet[21, 2]):
@@ -84,11 +79,28 @@ class FromOwncohortXlsExporter(DatasetExporter):
if has_time_limis and sheet[43, 2]:
use_restrictions.append({'ga4gh_code': 'TS-[XX]',
'note': 'Data is obtained for a limited duration. ' + process_possible_date(sheet[43, 2])})
datadec_data['use_restrictions'] = use_restrictions
idx += 1
result.append(datadec_data)
dataset_data['use_restrictions'] = use_restrictions
return result
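The own-cohort declaration mirrors the collaborator one, with the provenance fields swapped; an illustrative excerpt:

# {'source_study': <sheet[5, 2]>,
#  'ombudsman': <sheet[10, 2]>,
#  'consent_status': <sheet[19, 2]>,  # lower-cased by this hunk
#  'source_notes': 'Data is from own cohort.', ...}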
share_list = []
def export_datasets(self, full_file_path):
result = []
book = pyexcel.get_book(file_name=full_file_path)
idx = 1
print('----> {}'.format(full_file_path))
while idx < book.number_of_sheets():
sheet = book.sheet_by_index(idx)
dataset_data = {}
dataset_data['title'] = sheet[1, 2]
dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
if not dataset_data['title']:
print('Missing dataset title ----> {}'.format(full_file_path))
if sheet[22, 2]:
dataset_data['project'] = sheet[22, 2]
share_list = []
if process_yes_no_answer(sheet[27, 2]):
share_list += self.process_share_list(get_value_list_from_row(sheet, 28))
@@ -101,33 +113,38 @@ class FromOwncohortXlsExporter(DatasetExporter):
dataset_data['shares'] = share_list
storage_locations = []
master_locations = get_value_list_from_row(sheet, 36)
try:
self.add_storage_locations(storage_locations, master_locations, 'master')
storage_locations.extend(self.build_storage_locations(master_locations, 'master'))
except ValueError as e:
print('Invalid Master Data Location Row {} \n'.format(full_file_path))
master_acl_list = get_value_list_from_row(sheet, 37)
if len(master_acl_list) > 0:
for loc in storage_locations:
loc['storage_acl_info'] = ', '.join(master_acl_list)
if process_yes_no_answer(sheet[38, 2]):
backup_locations = get_value_list_from_row(sheet, 39)
try:
self.add_storage_locations(storage_locations, backup_locations, 'backup')
storage_locations.extend(self.build_storage_locations(backup_locations, 'backup'))
except ValueError as e:
print('Uneven Backup Data Location Row {} \n'.format(full_file_path))
if process_yes_no_answer(sheet[40, 2]):
copy_locations = get_value_list_from_row(sheet, 41)
try:
self.add_storage_locations(storage_locations, copy_locations, 'copy')
storage_locations.extend(self.build_storage_locations(copy_locations, 'copy'))
except ValueError as e:
print('Uneven Copy Data Location Row {} \n'.format(full_file_path))
acl_list = get_value_list_from_row(sheet, 37)
if len(acl_list) > 0:
dataset_data['storage_acl_info'] = ', '.join(acl_list)
if process_yes_no_answer(sheet[7, 2]):
storage_locations.extend(self.get_samples_storage(sheet[8, 2]))
dataset_data['storage_locations'] = storage_locations
idx += 1
with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
json.dump(dataset_data, outfile, indent=4)
result.append(dataset_data)
return result
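One subtlety above: the ACL row is read twice. The joined string lands on each master location dict (the per-location loop runs before the backup and copy entries are appended) and again on the dataset itself; with a hypothetical row:

master_acl_list = ['alice.smith', 'bob.jones']
# each master location: loc['storage_acl_info'] == 'alice.smith, bob.jones'
# and, from the second read:
# dataset_data['storage_acl_info'] == 'alice.smith, bob.jones'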
@@ -7,53 +7,55 @@ from metadata_tools.importxls.export_utils import get_value_list_from_row, proce
class FromRepoXlsExporter(DatasetExporter):
def export(self, full_file_path):
submission_id = 'IMP_FR_{}'.format(self.get_hash_for_path(full_file_path))
def export_datadecs(self, full_file_path):
result = []
idx = 1
print('Processing ----> {}'.format(full_file_path))
#print('Processing ----> {}'.format(full_file_path))
book = pyexcel.get_book(file_name=full_file_path)
while idx < book.number_of_sheets():
sheet = book.sheet_by_index(idx)
dataset_data = {}
dataset_data['source_type'] = 'From_Repository'
dataset_data['submission_id'] = submission_id
dataset_data['local_custodian'] = get_value_list_from_row(sheet, 2)
dataset_data['source_repository'] = self.lookup_institution_accession(sheet[6, 2].strip())
datadec_data = {}
datadec_data['dataset'] = sheet[3, 2]
if sheet[5, 2]:
datadec_data['title'] = sheet[5, 2].strip()
if not datadec_data.get('title'):
print('Missing dataset title ----> {}'.format(full_file_path))
if sheet[4, 2]:
dataset_data['other_external_id'] = sheet[4, 2]
if sheet[5, 2]:
dataset_data['title'] = sheet[5, 2].strip()
collab_dict = {}
collab_dict['collab_inst'] = self.lookup_institution_accession(sheet[6, 2].strip())
if sheet[19, 2]:
collab_dict['collab_project'] = sheet[19, 2].strip()
datadec_data['source_collaboration'] = collab_dict
datadec_data['source_notes'] = 'Data is obtained from repository.'
if sheet[4, 2]:
datadec_data['other_external_id'] = sheet[4, 2]
if not dataset_data['title']:
print('Missing dataset title ----> {}'.format(full_file_path))
datatype_info = self.process_data_types(get_value_list_from_row(sheet, 7))
dataset_data['data_types'] = datatype_info[0]
datadec_data['data_types'] = datatype_info[0]
if datatype_info[1]:
dataset_data['data_type_notes'] = datatype_info[1]
if '..' in datatype_info[1]:
print('INVALID DATA TYPE NOTES----> {}'.format(full_file_path))
datadec_data['data_type_notes'] = datatype_info[1]
if sheet[8, 2]:
dataset_data['de_identification'] = sheet[8, 2]
datadec_data['de_identification'] = sheet[8, 2]
if sheet[9, 2]:
dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
datadec_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
if sheet[10, 2]:
dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
datadec_data['has_special_subjects'] = process_yes_no_dontknow_answer(
sheet[10, 2])
if dataset_data.get('has_special_subjects'):
if dataset_data.get('has_special_subjects') == True and sheet[11, 2]:
dataset_data['special_subject_notes'] = sheet[11, 2]
if datadec_data.get('has_special_subjects'):
if datadec_data.get('has_special_subjects') == True and sheet[11, 2]:
datadec_data['special_subject_notes'] = sheet[11, 2]
if sheet[14, 2]:
dataset_data['access_category'] = sheet[14, 2]
dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 19)
datadec_data['access_category'] = sheet[14, 2].replace('-', '_')
use_restrictions = []
if process_yes_no_answer(sheet[17, 2]):
@@ -69,35 +71,53 @@ class FromRepoXlsExporter(DatasetExporter):
if process_yes_no_answer(sheet[29, 2]):
use_restrictions.append({'ga4gh_code': 'PUB',
'note': 'Acknowledgement required.'})
dataset_data['use_restrictions'] = use_restrictions
datadec_data['use_restrictions'] = use_restrictions
idx += 1
result.append(datadec_data)
return result
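A note on the new access-category handling: DAISY-style identifiers use underscores, so hyphens are rewritten on the way in (value invented):

'controlled-access'.replace('-', '_')  # -> 'controlled_access', as stored above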
def export_datasets(self, full_file_path):
result = []
idx = 1
print('Processing ----> {}'.format(full_file_path))
book = pyexcel.get_book(file_name=full_file_path)
while idx < book.number_of_sheets():
sheet = book.sheet_by_index(idx)
dataset_data = {}
dataset_data['title'] = sheet[3, 2]
dataset_data['local_custodian'] = get_value_list_from_row(sheet, 2)
if sheet[19, 2]:
dataset_data['project'] = sheet[19, 2]
storage_locations = []
master_locations = get_value_list_from_row(sheet, 21)
try:
self.add_storage_locations(storage_locations, master_locations, 'master')
storage_locations.extend(self.build_storage_locations(master_locations, 'master'))
except ValueError as e:
print('Invalid Master Data Location Row {} \n'.format(full_file_path))
master_acl_list = get_value_list_from_row(sheet, 22)
if len(master_acl_list) > 0:
for loc in storage_locations:
loc['storage_acl_info'] = ', '.join(master_acl_list)
if process_yes_no_answer(sheet[23, 2]):