Commit fbd3c4e1 authored by Pinar Alper

Refactored code from tests to classes

parent d24c302a
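# --- dataset_exporter.py (metadata_tools/importxls) ---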
import pkg_resources
import json
import hashlib
from os import fsencode
from metadata_tools.importxls.export_utils import get_lines_from_string
class DatasetExporter:
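    """Shared lookup tables and helpers for the XLS dataset exporters.

    Subclasses (see the exporter modules below) implement export(full_file_path).
    """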
def __init__(self):
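        # Map institution names to their elu accessions, loaded from the bundled JSON resource.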
with open(pkg_resources.resource_filename('metadata_tools', 'resources/elu_institutions.json'), encoding='utf-8') as institutions_file:
institutions = json.loads(institutions_file.read())
self.institution_dict = {}
for inst in institutions:
self.institution_dict[inst.get('institution_name')] = inst.get('elu_accession')
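        # Data types recognised verbatim; anything else ends up in the free-text data_type_notes.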
self.predefined_data_types = set([
"Omics data",
"Genotype data",
"Whole genome sequencing",
"Exome sequencing",
"Genomics variant array",
"RNASeq",
"Genetic and derived genetic data",
"Transcriptome array",
"Methylation array",
"MicroRNA array",
"Metabolomics",
"Metagenomics",
"Proteomics",
"Other omics data",
"Clinical Imaging",
"Cell Imaging",
"Human subject data",
"Clinical data",
"Lifestyle data",
"Socio Economic Data",
"Environmental Data",
"Other Phenotype data",
"Other"
])
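        # Known storage resource identifiers; anything else is mapped to 'Other'.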
self.predefined_storage_types = set([
'hpc_chaos_home',
'hpc_chaos_project',
'hpc_gaia_home',
'hpc_gaia_project',
'hpc_gaia_work',
'hpc_iris_home',
'hpc_iris_project',
'hpc_scratch_personal',
'hpc_scratch_project',
'hpc_isilon',
'atlas_personal',
'atlas_project',
'hpc_backup_chaos',
'hpc_backup_gaia',
'bertha',
'certon_block',
'lcsb_group_server',
'lcsb_desktop',
'lcsb_laptop',
'personal_laptop',
'Owncloud',
'External Storage (e.g. Hard disk, DVD)',
'Other'
])
    def get_hash_for_path(self, path):
        # Use a fresh md5 per call so the hash depends only on the path,
        # not on whatever was hashed before it.
        h = hashlib.md5()
        h.update(fsencode(path))
        return str(int(h.hexdigest(), 16))
    def lookup_institution_accession(self, institution_name):
        if institution_name not in self.institution_dict:
            print('Undefined institution --> {}'.format(institution_name))
            return None
        return self.institution_dict[institution_name]
def process_data_types(self, xls_data_type_list):
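        # Split the raw XLS values into (recognised data types, free-text notes).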
result = []
data_type_notes = ''
for type_name in xls_data_type_list:
type_name = type_name.strip()
if type_name:
if type_name in self.predefined_data_types:
result.append(type_name.replace(" ", "_"))
else:
data_type_notes += type_name + '\n'
return (result, data_type_notes)
    def is_storage_resource(self, resource):
        if resource in self.predefined_storage_types:
            return True
        print('Unknown storage resource --> {}'.format(resource))
        return False
    def get_storage_location(self, resource, path, category):
        result = {}
        if self.is_application(path):
            result['storage_resource'] = 'application'
        elif resource in self.predefined_storage_types:
            result['storage_resource'] = resource
        else:
            result['storage_resource'] = 'Other'
        result['location'] = {'location': path}
        result['category'] = category
        return result
def is_application(self, path):
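        # Paths mentioning transmart or redcap are treated as application-managed storage.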
if ("transmart" in path.lower()) or ( "redcap" in path.lower()):
return True
else:
return False
    def process_share_list(self, shares):
        # Entries are either a bare institution name or 'institution;free-text note'.
        share_list = []
        for shr in shares:
            if ";" not in shr:
                accession = self.lookup_institution_accession(shr.strip())
                if accession:
                    share_list.append({'share_inst': accession})
                else:
                    share_list.append({'share_notes': shr})
            else:
                infos = shr.split(";")
                share_list.append({'share_inst': self.lookup_institution_accession(infos[0].strip()),
                                   'share_notes': infos[1].strip()})
        return share_list
def add_storage_locations(self, storage_dict, locations_list, category):
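        # locations_list is either a single free-text cell, or alternating
        # (storage resource, path) pairs; path cells may span multiple lines.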
if len(locations_list) % 2 != 0 and len(locations_list) > 0:
if len(locations_list) == 1:
if self.is_storage_resource(locations_list[0]):
                    storage_dict.append(self.get_storage_location(locations_list[0], '<missing_info>', category))
else:
for line in get_lines_from_string(locations_list[0]):
storage_dict.append(self.get_storage_location('Other',line,category))
else:
raise ValueError('Uneven Master Data Location Row')
elif len(locations_list) % 2 == 0 and len(locations_list) > 0:
s = 0
e = len(locations_list) // 2
while s < e:
if self.is_storage_resource(locations_list[s * 2]):
for line in get_lines_from_string(locations_list[s * 2 + 1]):
storage_dict.append(self.get_storage_location(locations_list[s * 2], line, category))
else:
for line in get_lines_from_string(locations_list[s * 2]):
storage_dict.append(self.get_storage_location('Other', line, category))
s += 1
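# --- export_utils.py (metadata_tools/importxls) ---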
import datetime
def process_yes_no_answer(answer):
    """
    Convert a yes/no answer to a boolean; empty answers are treated as no.
    :param answer: the answer string from the sheet cell
    """
    return answer == 'Yes'
def process_yes_no_dontknow_answer(answer):
    """
    Convert a yes/no/don't know answer to a boolean;
    empty and don't know answers are returned as None.
    :param answer: the answer string from the sheet cell
    """
    if answer == 'Yes':
        return True
    if answer == 'No':
        return False
    return None
def is_data_sheet(fname):
    return fname.startswith(('from-repository', 'from-collaborator', 'own-cohort'))
def get_value_list_from_row(sheet, row_idx):
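    # Answers start in the third column; blank cells are skipped.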
result = []
vals = sheet.row[row_idx]
data_vals = vals[2:]
for val in data_vals:
if val:
result.append(val)
return result
def process_possible_date(possible_date):
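    # Dates are formatted as YYYY/MM/DD; other values (e.g. dotted date strings) get slashes instead of dots.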
if isinstance(possible_date, datetime.date):
return possible_date.strftime("%Y/%m/%d")
else:
return str(possible_date).replace('.', '/')
def get_names_from_string(full_name):
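    # Split a full name into [first, last]; a third token is folded into the last name.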
    result = ['', '']
    name = full_name.strip()
    if name.endswith(','):
        name = name[:-1]
    if " " in name:
        name_list = name.split(" ")
        result[0] = name_list[0]
        if len(name_list) > 1:
            result[1] = name_list[1]
        if len(name_list) == 3:
            result[1] = result[1] + ' ' + name_list[2]
    else:
        result[0] = name
    return result
def get_lines_from_string(a_string):
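    # Return the non-empty lines of a multi-line cell value.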
result = []
stripped = a_string.strip()
line_list = stripped.splitlines()
for line in line_list:
if line:
result.append(line)
return result
from .dataset_exporter import DatasetExporter
import pyexcel
import json
from metadata_tools.importxls.export_utils import get_value_list_from_row, process_yes_no_answer, \
process_yes_no_dontknow_answer, process_possible_date
class FromCollabXlsExporter(DatasetExporter):
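    """Exports 'from-collaborator' XLS submissions to JSON."""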
def export(self, full_file_path):
submission_id = 'IMP_FC_{}'.format(self.get_hash_for_path(full_file_path))
idx = 1
print('Processing ----> {}'.format(full_file_path))
book = pyexcel.get_book(file_name=full_file_path)
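        # Sheet 0 (presumably instructions) is skipped; each subsequent sheet describes one dataset.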
while idx < book.number_of_sheets():
sheet = book.sheet_by_index(idx)
dataset_data = {}
dataset_data['source_type'] = 'From_Collaborator'
dataset_data['submission_id'] = submission_id
dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
dataset_data['title'] = sheet[4, 2]
if not dataset_data['title']:
print('Missing dataset title ----> {}'.format(full_file_path))
datatype_info = self.process_data_types(get_value_list_from_row(sheet, 5))
dataset_data['data_types'] = datatype_info[0]
if datatype_info[1]:
dataset_data['data_type_notes'] = datatype_info[1]
dataset_data['involves_samples'] = process_yes_no_answer(sheet[6, 2])
if sheet[7, 2]:
dataset_data['samples_location'] = sheet[7, 2]
if sheet[8, 2]:
dataset_data['de_identification'] = sheet[8, 2]
if sheet[9, 2]:
dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
if sheet[10, 2]:
dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
sheet[10, 2])
            if dataset_data.get('has_special_subjects') and sheet[11, 2]:
                dataset_data['special_subject_notes'] = sheet[11, 2]
collab_insts = get_value_list_from_row(sheet, 13)
collab_pis = get_value_list_from_row(sheet, 14)
if (len(collab_insts) == len(collab_pis)) and len(collab_insts) > 0:
i = 0
src_collab_list = []
while i < len(collab_insts):
collab_data = {'collab_inst': self.lookup_institution_accession(collab_insts[i]),
'collab_pi': collab_pis[i],
'collab_project': sheet[18, 2]}
                    if process_yes_no_dontknow_answer(sheet[17, 2]) is False:
                        collab_data['collab_role'] = 'controller'
                    elif process_yes_no_dontknow_answer(sheet[17, 2]) is True:
                        collab_data['collab_role'] = 'joint-controller'
src_collab_list.append(collab_data)
i += 1
dataset_data['source_collaborations'] = src_collab_list
else:
print('Mismatched Collab PI-Institution length {} \n'.format(full_file_path))
                if len(collab_insts) > 1:
print('Multi source collab ----> {}'.format(full_file_path))
if sheet[18, 2]:
dataset_data['source_project'] = sheet[18, 2]
use_restrictions = []
if process_yes_no_answer(sheet[25, 2]):
use_restrictions.append({'ga4gh_code': 'PS',
'note': 'Use is restricted to projects: ' + ', '.join(
get_value_list_from_row(sheet, 26))})
if process_yes_no_answer(sheet[27, 2]):
use_restrictions.append({'ga4gh_code': 'RS-[XX]',
'note': 'Use is restricted to research areas: ' + ', '.join(
get_value_list_from_row(sheet, 28))})
if process_yes_no_answer(sheet[43, 2]):
use_restrictions.append({'ga4gh_code': 'PUB',
'note': 'Acknowledgement required.'})
            has_time_limits = process_yes_no_dontknow_answer(sheet[41, 2])
            if has_time_limits and sheet[42, 2]:
                use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                         'note': 'Data is obtained for a limited duration. ' + process_possible_date(sheet[42, 2])})
dataset_data['use_restrictions'] = use_restrictions
dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 33)
if process_yes_no_answer(sheet[29, 2]):
dataset_data['shares'] = self.process_share_list(get_value_list_from_row(sheet, 30))
storage_locations = []
master_locations = get_value_list_from_row(sheet, 35)
try:
self.add_storage_locations(storage_locations, master_locations, 'master')
            except ValueError:
print('Invalid Master Data Location Row {} \n'.format(full_file_path))
if process_yes_no_answer(sheet[37, 2]):
backup_locations = get_value_list_from_row(sheet, 38)
try:
self.add_storage_locations(storage_locations, backup_locations, 'backup')
                except ValueError:
print('Uneven Backup Data Location Row {} \n'.format(full_file_path))
if process_yes_no_answer(sheet[39, 2]):
copy_locations = get_value_list_from_row(sheet, 40)
try:
self.add_storage_locations(storage_locations, copy_locations, 'copy')
                except ValueError:
print('Uneven Copy Data Location Row {} \n'.format(full_file_path))
acl_list = get_value_list_from_row(sheet, 36)
if len(acl_list) > 0:
dataset_data['storage_acl_info'] = ', '.join(acl_list)
dataset_data['storage_locations'] = storage_locations
idx += 1
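        # Note: a single JSON file is written per workbook, so only the last sheet's dataset is kept.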
with open('{}_.json'.format(submission_id), 'w') as outfile:
json.dump(dataset_data, outfile, indent=4)
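# --- FromOwncohortXlsExporter module (file name not shown in the diff) ---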
import pyexcel
import json
from .dataset_exporter import DatasetExporter
from metadata_tools.importxls.export_utils import get_value_list_from_row, process_yes_no_answer, \
process_yes_no_dontknow_answer, process_possible_date
class FromOwncohortXlsExporter(DatasetExporter):
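    """Exports 'own-cohort' XLS submissions to JSON."""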
def export(self, full_file_path):
submission_id = 'IMP_OC_{}'.format(self.get_hash_for_path(full_file_path))
book = pyexcel.get_book(file_name=full_file_path)
idx = 1
        print('Processing ----> {}'.format(full_file_path))
while idx < book.number_of_sheets():
sheet = book.sheet_by_index(idx)
dataset_data = {}
dataset_data['source_type'] = 'Own_Cohort'
dataset_data['submission_id'] = submission_id
dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
dataset_data['title'] = sheet[4, 2]
if not dataset_data['title']:
print('Missing dataset title ----> {}'.format(full_file_path))
if sheet[5, 2]:
dataset_data['source_project'] = sheet[5, 2]
datatype_info = self.process_data_types(get_value_list_from_row(sheet, 6))
dataset_data['data_types'] = datatype_info[0]
if datatype_info[1]:
dataset_data['data_type_notes'] = datatype_info[1]
dataset_data['involves_samples'] = process_yes_no_answer(sheet[7, 2])
            if sheet[8, 2]:
                dataset_data['samples_location'] = sheet[8, 2]
if sheet[9, 2]:
dataset_data['de_identification'] = sheet[9, 2]
if sheet[10, 2]:
dataset_data['ombudsman'] = sheet[10, 2]
if sheet[11, 2]:
dataset_data['subject_categories'] = sheet[11, 2].replace(' & ', '_and_')
if sheet[12, 2]:
dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
sheet[12, 2])
            if dataset_data.get('has_special_subjects') and sheet[13, 2]:
                dataset_data['special_subject_notes'] = sheet[13, 2]
if sheet[19, 2]:
dataset_data['consent_status'] = sheet[19, 2]
dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 22)
use_restrictions = []
if process_yes_no_answer(sheet[21, 2]):
use_restrictions.append({'ga4gh_code': 'PS',
'note': 'Consent form restricts data use to projects ' + ', '.join(
get_value_list_from_row(sheet, 23))})
            if process_yes_no_answer(sheet[24, 2]):
                use_restrictions.append({'ga4gh_code': 'RS-[XX]',
                                         'note': 'Data is consented for research on ' + ', '.join(
                                             get_value_list_from_row(sheet, 25))})
if process_yes_no_answer(sheet[26, 2]):
use_restrictions.append({'ga4gh_code': 'GS-[XX]',
'note': 'Data is consented for sharing outside LCSB (Within Luxembourg)'})
if process_yes_no_answer(sheet[29, 2]):
use_restrictions.append({'ga4gh_code': 'GS-[XX]',
'note': 'Data is consented for sharing outside Luxembourg (within EU)'})
if process_yes_no_answer(sheet[32, 2]):
use_restrictions.append({'ga4gh_code': 'GS-[XX]',
'note': 'Data is consented for sharing outside EU'})
            has_time_limits = process_yes_no_dontknow_answer(sheet[42, 2])
            if has_time_limits and sheet[43, 2]:
                use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                         'note': 'Data is obtained for a limited duration. ' + process_possible_date(sheet[43, 2])})
dataset_data['use_restrictions'] = use_restrictions
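            # Shares may be declared at three scopes: within Luxembourg, within the EU, and outside the EU.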
share_list = []
if process_yes_no_answer(sheet[27, 2]):
share_list += self.process_share_list(get_value_list_from_row(sheet, 28))
if process_yes_no_answer(sheet[30, 2]):
share_list += self.process_share_list(get_value_list_from_row(sheet, 31))
if process_yes_no_answer(sheet[33, 2]):
share_list += self.process_share_list(get_value_list_from_row(sheet, 34))
dataset_data['shares'] = share_list
storage_locations = []
master_locations = get_value_list_from_row(sheet, 36)
try:
self.add_storage_locations(storage_locations, master_locations, 'master')
            except ValueError:
print('Invalid Master Data Location Row {} \n'.format(full_file_path))
if process_yes_no_answer(sheet[38, 2]):
backup_locations = get_value_list_from_row(sheet, 39)
try:
self.add_storage_locations(storage_locations, backup_locations, 'backup')
                except ValueError:
print('Uneven Backup Data Location Row {} \n'.format(full_file_path))
if process_yes_no_answer(sheet[40, 2]):
copy_locations = get_value_list_from_row(sheet, 41)
try:
self.add_storage_locations(storage_locations, copy_locations, 'copy')
                except ValueError:
print('Uneven Copy Data Location Row {} \n'.format(full_file_path))
acl_list = get_value_list_from_row(sheet, 37)
if len(acl_list) > 0:
dataset_data['storage_acl_info'] = ', '.join(acl_list)
dataset_data['storage_locations'] = storage_locations
idx += 1
with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
json.dump(dataset_data, outfile, indent=4)
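# --- FromRepoXlsExporter module (file name not shown in the diff) ---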
import pyexcel
import json
from .dataset_exporter import DatasetExporter
from metadata_tools.importxls.export_utils import get_value_list_from_row, process_yes_no_answer, \
process_yes_no_dontknow_answer, process_possible_date
class FromRepoXlsExporter(DatasetExporter):
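    """Exports 'from-repository' XLS submissions to JSON."""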
def export(self, full_file_path):
submission_id = 'IMP_FR_{}'.format(self.get_hash_for_path(full_file_path))
idx = 1
print('Processing ----> {}'.format(full_file_path))
book = pyexcel.get_book(file_name=full_file_path)
while idx < book.number_of_sheets():
sheet = book.sheet_by_index(idx)
dataset_data = {}
dataset_data['source_type'] = 'From_Repository'
dataset_data['submission_id'] = submission_id
dataset_data['local_custodian'] = get_value_list_from_row(sheet, 2)
dataset_data['source_repository'] = self.lookup_institution_accession(sheet[6, 2].strip())
if sheet[4, 2]:
dataset_data['other_external_id'] = sheet[4, 2]
if sheet[5, 2]:
dataset_data['title'] = sheet[5, 2].strip()
            if not dataset_data.get('title'):
print('Missing dataset title ----> {}'.format(full_file_path))
datatype_info = self.process_data_types(get_value_list_from_row(sheet, 7))
dataset_data['data_types'] = datatype_info[0]
if datatype_info[1]:
dataset_data['data_type_notes'] = datatype_info[1]
                if '..' in datatype_info[1]:
print('INVALID DATA TYPE NOTES----> {}'.format(full_file_path))
if sheet[8, 2]:
dataset_data['de_identification'] = sheet[8, 2]
if sheet[9, 2]:
dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
if sheet[10, 2]:
dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
sheet[10, 2])
            if dataset_data.get('has_special_subjects') and sheet[11, 2]:
                dataset_data['special_subject_notes'] = sheet[11, 2]
if sheet[14, 2]:
dataset_data['access_category'] = sheet[14, 2]
dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 19)
use_restrictions = []
if process_yes_no_answer(sheet[17, 2]):
use_restrictions.append({'ga4gh_code': 'PS',
'note': 'Contract restricts data use to projects ' + ', '.join(
get_value_list_from_row(sheet, 18))})
has_time_limis = process_yes_no_dontknow_answer(sheet[27, 2])