Commit c206bff7 authored by Pinar Alper

Data extractor methods for projects and from-collaborator sheets added

parent 027a9ff6
{
    "description": "Dataset Schema",
    "schema": {
        "title": "A JSON Schema for describing Datasets within the ELIXIR Data Submission System.",
        "$schema": "http://json-schema.org/draft-04/schema#",
        "type": "object",
        "required": [
            "title",
            "source_type",
            "data_types"
        ],
        "properties": {
            "title": {
                "type": "string"
            },
            "submission_id": {
                "type": "string"
            },
            "source_type": {
                "type": "string",
                "enum": [
                    "From_Collaborator",
                    "From_Repository",
                    "Own_Cohort",
                    "From_Elixir_Data_Submitter"
                ]
            },
            "local_custodian": {
                "type": "array",
                "minItems": 1,
                "items": {
                    "type": "string"
                }
            },
            "data_types": {
                "type": "array",
                "items": {
                    "type": "string",
                    "enum": [
                        "Omics_data",
                        "Genotype_data",
                        "Whole_genome_sequencing",
                        "Exome_sequencing",
                        "Genomics_variant_array",
                        "RNASeq",
                        "Genetic_and_derived_genetic_data",
                        "Transcriptome_array",
                        "Methylation_array",
                        "MicroRNA_array",
                        "Metabolomics",
                        "Metagenomics",
                        "Proteomics",
                        "Other_omics_data",
                        "Clinical_Imaging",
                        "Cell_Imaging",
                        "Human_subject_data",
                        "Clinical_data",
                        "Lifestyle_data",
                        "Socio_Economic_Data",
                        "Environmental_Data",
                        "Other_Phenotype_data",
                        "Other"
                    ]
                }
            },
            "data_type_notes": {
                "type": "string"
            },
            "subject_categories": {
                "type": "array",
                "items": {
                    "type": "string",
                    "enum": [
                        "cases",
                        "controls",
                        "cases_and_controls"
                    ]
                }
            },
            "source_project_acronym": {
                "type": "string"
            },
            "de_identification": {
                "type": "string",
                "enum": [
                    "anonymization",
                    "pseudonymization"
                ]
            },
            "generated_inhouse": {
                "type": "boolean"
            },
            "has_special_subjects": {
                "type": "boolean"
            },
            "special_subject_notes": {
                "type": "string"
            },
            "samples_location": {
                "type": "string"
            }
        }
    }
}
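
For illustration, a minimal dataset instance that this schema accepts (all field values are hypothetical):

{
    "title": "Example RNASeq dataset",
    "source_type": "From_Collaborator",
    "data_types": ["RNASeq", "Clinical_data"],
    "local_custodian": ["Jane Doe"],
    "de_identification": "pseudonymization",
    "has_special_subjects": false
}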
@@ -5,13 +5,18 @@
     "$schema": "http://json-schema.org/draft-04/schema#",
     "type": "object",
     "required": [
-        "acronym"
+        "acronym",
+        "title",
+        "description",
+        "descriptors",
+        "funding_sources",
+        "contacts"
     ],
     "properties": {
         "acronym": {
             "type": "string"
         },
-        "name": {
+        "title": {
             "type": "string"
         },
         "description": {
from unittest import TestCase
import os
import json
import datetime
from .test_utils import collect_prj_info, get_value_list_from_row, process_data_types, process_yes_no_answer, \
    process_yes_no_dontknow_answer, is_data_sheet, is_storage_resource, add_storage_locations
import hashlib
import pyexcel


class TestProjectsParser(TestCase):
    SHEETS_FOLDER = '/Users/pinar_alper/desktop/test-ANSWERS'
    def get_names_from_string(self, full_name):
        result = ['', '']
        name = full_name.strip()
        if name.endswith(','):
            name = name[:-1]
        if name:
            if " " in name:
                name_list = name.split(" ")
                result[0] = name_list[0]
                if len(name_list) > 1:
                    result[1] = name_list[1]
            else:
                result[0] = name
        return result
    # def test_prj_refs_validity(self):
    #
    #     defined_projects = set()
    #     for prj, path, title, description, pi, start, end, personnel, HasUniLUERP, ERPNotes, HasCNER, CNERNotes, publications in collect_prj_info(
    #             self.SHEETS_FOLDER):
    #         defined_projects.add(prj)
    #
    #     for dirName, subdirList, fileList in os.walk(self.SHEETS_FOLDER):
    #         for fname in fileList:
    #             if is_data_sheet(fname):
    #                 full_file_path = os.path.join(dirName, fname)
    #                 book = pyexcel.get_book(file_name=full_file_path)
    #                 idx = 0
    #                 while idx < book.number_of_sheets():
    #                     if idx > 0 and fname.startswith('from-repository'):
    #                         # print('---->{}'.format(fname))
    #                         prj_refs = set(get_value_list_from_row(book.sheet_by_index(idx), 18) +
    #                                        get_value_list_from_row(book.sheet_by_index(idx), 19))
    #                         undefined_refs = prj_refs.difference(defined_projects)
    #                         if len(undefined_refs) > 0:
    #                             print('Reference to undefined project(s): {} in file {}'.format(undefined_refs,
    #                                                                                             full_file_path))
    #                     if idx > 0 and fname.startswith('from-collaborator'):
    #                         # print('---->{}'.format(full_file_path))
    #                         prj_refs = set(get_value_list_from_row(book.sheet_by_index(idx), 18) +
    #                                        get_value_list_from_row(book.sheet_by_index(idx), 26) +
    #                                        get_value_list_from_row(book.sheet_by_index(idx), 33))
    #                         undefined_refs = prj_refs.difference(defined_projects)
    #                         if len(undefined_refs) > 0:
    #                             print('Reference to undefined project(s): {} in file {} '.format(undefined_refs,
    #                                                                                              full_file_path))
    #                     if idx > 0 and fname.startswith('own-cohort'):
    #                         prj_refs = set(get_value_list_from_row(book.sheet_by_index(idx), 5) +
    #                                        get_value_list_from_row(book.sheet_by_index(idx), 22))
    #                         undefined_refs = prj_refs.difference(defined_projects)
    #                         if len(undefined_refs) > 0:
    #                             print('Reference to undefined project(s): {} in file {} '.format(undefined_refs,
    #                                                                                              full_file_path))
    #                     idx += 1
    #     return
    def test_export_projects(self):
        projects_list = []
        for acr, path, title, description, pi, start, end, personnel, HasUniLUERP, ERPNotes, HasCNER, CNERNotes, publications in collect_prj_info(
                self.SHEETS_FOLDER):
            prj_data = {}
            prj_data['acronym'] = acr
            prj_data['title'] = title
            prj_data['description'] = description
            if type(start) is datetime.date:
                prj_data['start_date'] = start.strftime('%m/%d/%Y')
            elif type(start) is str:
                prj_data['start_date'] = start.replace('.', '/')
            if type(end) is datetime.date:
                prj_data['end_date'] = end.strftime('%m/%d/%Y')
            elif type(end) is str:
                prj_data['end_date'] = end.replace('.', '/')
            contacts_list = []
            delimiter = ','
            if ';' in pi:
                delimiter = ';'
            if pi:
                for pp in pi.split(delimiter):
                    pp_data = {}
                    name_list = self.get_names_from_string(pp)
                    pp_data['first_name'] = name_list[0]
                    pp_data['last_name'] = name_list[1]
                    pp_data['role'] = 'Principal_Investigator'
                    pp_data['institution'] = 'Luxembourg Center for Systems Biomedicine (LCSB)'
                    contacts_list.append(pp_data)
            delimiter = ','
            if ';' in personnel:
                delimiter = ';'
            if personnel:
                for prs in personnel.split(delimiter):
                    prs_data = {}
                    name_list = self.get_names_from_string(prs)
                    prs_data['first_name'] = name_list[0]
                    prs_data['last_name'] = name_list[1]
                    prs_data['role'] = 'Researcher'
                    prs_data['institution'] = 'Luxembourg Center for Systems Biomedicine (LCSB)'
                    contacts_list.append(prs_data)
            prj_data['contacts'] = contacts_list
            # 'Yes' maps to True; empty or any other answer maps to False
            prj_data['has_institutional_ethics_approval'] = HasUniLUERP == 'Yes'
            if ERPNotes:
                prj_data['institutional_ethics_approval_notes'] = ERPNotes
            prj_data['has_national_ethics_approval'] = HasCNER == 'Yes'
            if CNERNotes:
                prj_data['national_ethics_approval_notes'] = CNERNotes
            if publications:
                publication_list = []
                for pub in publications.split('#'):
                    pub_data = {}
                    pub_data['citation_string'] = pub
                    publication_list.append(pub_data)
                prj_data['publications'] = publication_list
            projects_list.append(prj_data)
        with open('projects.json', 'w') as outfile:
            json.dump(projects_list, outfile, indent=4)
        # print(json.dumps(projects_list, indent=4))
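    # The exported projects.json entries then have this shape (hypothetical values):
    #   {"acronym": "PRJX", "title": "...", "start_date": "01/31/2020",
    #    "contacts": [{"first_name": "Jane", "last_name": "Doe",
    #                  "role": "Principal_Investigator", ...}], ...}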
    def test_export_from_collaborator(self):
        h = hashlib.md5()
        for dirName, subdirList, fileList in os.walk(self.SHEETS_FOLDER):
            for fname in fileList:
                if fname.startswith('from-collaborator'):
                    full_file_path = os.path.join(dirName, fname)
                    dataset_list = []
                    # note: the md5 accumulates across files, so each submission_id
                    # also depends on the previously visited paths
                    h.update(os.fsencode(full_file_path))
                    submission_id = 'IMPORT_{}'.format(str(int(h.hexdigest(), 16)))
                    book = pyexcel.get_book(file_name=full_file_path)
                    idx = 1
                    # print('----> {}'.format(full_file_path))
                    while idx < book.number_of_sheets():
                        sheet = book.sheet_by_index(idx)
                        dataset_data = {}
                        dataset_data['source_type'] = 'From_Collaborator'
                        dataset_data['submission_id'] = submission_id
                        dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
                        dataset_data['title'] = sheet[4, 2]
                        datatype_info = process_data_types(get_value_list_from_row(sheet, 5))
                        dataset_data['data_types'] = datatype_info[0]
                        if datatype_info[1]:
                            dataset_data['data_type_notes'] = datatype_info[1]
                        dataset_data['generated_inhouse'] = process_yes_no_answer(sheet[6, 2])
                        if sheet[7, 2]:
                            dataset_data['samples_location'] = sheet[7, 2]
                        if sheet[8, 2]:
                            dataset_data['de_identification'] = sheet[8, 2]
                        if sheet[9, 2]:
                            dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
                        if sheet[10, 2]:
                            dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(sheet[10, 2])
                        if dataset_data.get('has_special_subjects') and sheet[11, 2]:
                            dataset_data['special_subject_notes'] = sheet[11, 2]
                        collab_insts = get_value_list_from_row(sheet, 13)
                        collab_pis = get_value_list_from_row(sheet, 14)
                        if len(collab_insts) == len(collab_pis) and len(collab_insts) > 0:
                            i = 0
                            src_collab_list = []
                            while i < len(collab_insts):
                                collab_data = {'collab_inst': collab_insts[i],
                                               'collab_pi': collab_pis[i],
                                               'collab_project': sheet[18, 2]}
                                src_collab_list.append(collab_data)
                                i += 1
                            dataset_data['source_collaborations'] = src_collab_list
                        else:
                            print('Mismatched Collab PI-Institution length --------> {} \n'.format(full_file_path))
                        if sheet[18, 2]:
                            dataset_data['source_project'] = sheet[18, 2]
                        use_restrictions = []
                        if process_yes_no_answer(sheet[25, 2]):
                            use_restrictions.append({'ga4gh_code': 'PS',
                                                     'note': 'Use is restricted to projects ' + ', '.join(
                                                         get_value_list_from_row(sheet, 26))})
                        if process_yes_no_answer(sheet[27, 2]):
                            use_restrictions.append({'ga4gh_code': 'RS-[XX]',
                                                     'note': 'Use is restricted to projects ' + ', '.join(
                                                         get_value_list_from_row(sheet, 28))})
                        dataset_data['use_restrictions'] = use_restrictions
                        if process_yes_no_answer(sheet[29, 2]):
                            shares = get_value_list_from_row(sheet, 30)
                            if len(shares) > 0:
                                share_list = []
                                for shr in shares:
                                    share_list.append({'share_notes': shr})
                                dataset_data['shares'] = share_list
                        storage_locations = []
                        master_locations = get_value_list_from_row(sheet, 35)
                        try:
                            add_storage_locations(storage_locations, master_locations, 'master')
                        except ValueError:
                            print('Uneven master data location row {}\n'.format(full_file_path))
                        if process_yes_no_answer(sheet[37, 2]):
                            backup_locations = get_value_list_from_row(sheet, 38)
                            try:
                                add_storage_locations(storage_locations, backup_locations, 'backup')
                            except ValueError:
                                print('Uneven backup data location row {}\n'.format(full_file_path))
                        if process_yes_no_answer(sheet[39, 2]):
                            copy_locations = get_value_list_from_row(sheet, 40)
                            try:
                                add_storage_locations(storage_locations, copy_locations, 'copy')
                            except ValueError:
                                print('Uneven copy data location row {}\n'.format(full_file_path))
                        acl_list = get_value_list_from_row(sheet, 36)
                        if len(acl_list) > 0:
                            dataset_data['storage_acl_info'] = ', '.join(acl_list)
                        dataset_data['storage_locations'] = storage_locations
                        dataset_list.append(dataset_data)
                        idx += 1
                    with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
                        json.dump(dataset_list, outfile, indent=4)
                    # print(json.dumps(dataset_list, indent=4))
# test_utils.py (imported above as .test_utils)
import os

import pyexcel


def is_data_sheet(fname):
    return fname.startswith('from-repository') or fname.startswith('from-collaborator') or fname.startswith(
        'own-cohort')
def get_value_list_from_row(sheet, row_idx):
    # answers start in the third column; empty cells are skipped
    result = []
    vals = sheet.row[row_idx]
    data_vals = vals[2:]
    for val in data_vals:
        if val:
            result.append(val)
    return result
def get_names_from_string(full_name):
    result = ['', '']
    name = full_name.strip()
    if name.endswith(','):
        name = name[:-1]
    if name:
        if " " in name:
            name_list = name.split(" ")
            result[0] = name_list[0]
            if len(name_list) > 1:
                result[1] = name_list[1]
        else:
            result[0] = name
    return result
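# Illustrative split with a hypothetical name: get_names_from_string('Jane Doe')
# returns ['Jane', 'Doe']; a single token gives ['Jane', '']; only the first two
# whitespace-separated tokens are kept.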
def collect_prj_info(sheets_folder):
    # result = open(os.path.join(os.path.dirname(__file__), 'result.txt'), encoding='utf-8', mode="w")
    projects = []
    for dirName, subdirList, fileList in os.walk(sheets_folder):
        # print('Found directory: %s' % dirName)
        for fname in fileList:
            # result.write('\t%s' % fname)
            # print('\t%s' % fname)
            if fname.startswith('projects'):
                full_file_path = os.path.join(dirName, fname)
                book = pyexcel.get_book(file_name=full_file_path)
                sheet = book.sheet_by_name('projects')
                prj_acronyms = sheet.column[1]
                # the first two rows are assumed to be header rows
                numprojects = len(prj_acronyms) - 2
                # print('{}---> {} ----> {}'.format(fname, len(prj_acronyms), prj_acronyms))
                if numprojects > 0:
                    for row in range(2, 2 + numprojects):
                        projects.append((sheet[row, 1], full_file_path, sheet[row, 2], sheet[row, 3], sheet[row, 4],
                                         sheet[row, 5], sheet[row, 6], sheet[row, 7], sheet[row, 8], sheet[row, 9],
                                         sheet[row, 10], sheet[row, 11], sheet[row, 12]))
    return projects
def process_data_types(xls_data_type_list):
    result = []
    predefined_types = set([
        "Omics data",
        "Genotype data",
        "Whole genome sequencing",
        "Exome sequencing",
        "Genomics variant array",
        "RNASeq",
        "Genetic and derived genetic data",
        "Transcriptome array",
        "Methylation array",
        "MicroRNA array",
        "Metabolomics",
        "Metagenomics",
        "Proteomics",
        "Other omics data",
        "Clinical Imaging",
        "Cell Imaging",
        "Human subject data",
        "Clinical data",
        "Lifestyle data",
        "Socio Economic Data",
        "Environmental Data",
        "Other Phenotype data",
        "Other"
    ])
    data_type_notes = ''
    for type_name in xls_data_type_list:
        type_name = type_name.strip()
        if type_name:
            if type_name in predefined_types:
                result.append(type_name.replace(" ", "_"))
            else:
                data_type_notes += type_name + '\n'
    return (result, data_type_notes)
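# Illustrative call with hypothetical cell values: entries matching the predefined
# list are normalized to the schema's underscore form, anything else becomes a note:
#   process_data_types(['Omics data', 'in-house assay'])
#   -> (['Omics_data'], 'in-house assay\n')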
def is_storage_resource(location):
    predefined_types = set([
        'hpc_chaos_home',
        'hpc_chaos_project',
        'hpc_gaia_home',
        'hpc_gaia_project',
        'hpc_gaia_work',
        'hpc_iris_home',
        'hpc_iris_project',
        'hpc_scratch_personal',
        'hpc_scratch_project',
        'hpc_isilon',
        'atlas_personal',
        'atlas_project',
        'hpc_backup_chaos',
        'hpc_backup_gaia',
        'bertha',
        'certon_block',
        'lcsb_group_server',
        'lcsb_desktop',
        'lcsb_laptop',
        'personal_laptop',
        'Owncloud',
        'External Storage (e.g. Hard disk, DVD)',
        'Other'
    ])
    return location in predefined_types
def process_yes_no_answer(answer):
    """
    Convert a yes/no answer to a boolean; empty answers count as no.
    :param answer: the answer string from the sheet
    """
    return answer == 'Yes'
def process_yes_no_dontknow_answer(answer):
    """
    Convert a yes/no/don't-know answer to a boolean;
    empty and don't-know answers are returned as None.
    :param answer: the answer string from the sheet
    """
    if answer == 'Yes':
        return True
    if answer == 'No':
        return False
    return None
def add_storage_locations(storage_list, locations_list, category):
    if len(locations_list) % 2 != 0 and len(locations_list) > 0:
        if len(locations_list) == 1:
            if is_storage_resource(locations_list[0]):
                storage_list.append(
                    {'storage_resource': locations_list[0], 'location': '<missing_info>',
                     'category': category})
            else:
                storage_list.append(
                    {'storage_resource': 'Other', 'location': locations_list[0],
                     'category': category})
        else:
            raise ValueError('Uneven data location row')
    elif len(locations_list) % 2 == 0 and len(locations_list) > 0:
        # locations come as (resource, location) pairs; a blank resource defaults to 'Other'
        s = 0
        e = len(locations_list) // 2
        while s < e:
            res = locations_list[s * 2] if locations_list[s * 2] else 'Other'
            storage_list.append({'storage_resource': res,
                                 'location': locations_list[s * 2 + 1],
                                 'category': category})
            s += 1
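# Illustrative call with hypothetical values: an even-length row yields one entry
# per (resource, location) pair:
#   locations = []
#   add_storage_locations(locations, ['atlas_project', '/work/projX', 'Owncloud', 'projX-share'], 'master')
#   -> [{'storage_resource': 'atlas_project', 'location': '/work/projX', 'category': 'master'},
#       {'storage_resource': 'Owncloud', 'location': 'projX-share', 'category': 'master'}]
# A lone value is kept with '<missing_info>' or filed under 'Other'; any longer
# odd-length row raises ValueError.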