Commit 4e393f4c authored by Pinar Alper

Added data extractor for from-repository sheets.
parent 377b5ee1
@@ -34,6 +34,12 @@
}
]
},
"other_external_id": {
"type": "string"
},
"access_category": {
"type": "string"
},
"use_restrictions": {
"type": "array",
"items": {
@@ -69,7 +75,8 @@
}
},
"required": [
"ga4gh_code"
"ga4gh_code",
"note"
]
}
},
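Note: with this hunk a use_restrictions entry must carry a note alongside its ga4gh_code, and a dataset may now record other_external_id and access_category. The extractors below build restriction entries as plain dicts before dumping them to JSON; a purely illustrative entry (values invented) would be:

# Illustrative only: one use_restrictions entry in the shape the updated schema requires.
restriction = {'ga4gh_code': 'PS',
               'note': 'Use is restricted to projects: PROJECT-A, PROJECT-B'}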
......
@@ -7,7 +7,7 @@ except ImportError:
from distutils.core import setup
requirements = [
'jsonschema'
'jsonschema', 'pyexcel', 'pyexcel-xls'
]
test_requirements = [
......
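The pyexcel and pyexcel-xls additions are what the new extractor tests below rely on to read the .xls answer sheets; the access pattern they use is roughly the following (the file name is a placeholder):

# Sketch of the pyexcel calls used by the extractors below.
import pyexcel

book = pyexcel.get_book(file_name='from-collaborator-answers.xls')  # reading .xls needs pyexcel-xls
sheet = book.sheet_by_index(1)  # sheet 0 is skipped; each later sheet describes one dataset
value = sheet[4, 2]             # cells are addressed as sheet[row, column]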
import hashlib
import json
import os
from unittest import TestCase

import pyexcel

from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \
    process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER


class TestProjectsParser(TestCase):

    def test_export_from_collaborator(self):
        h = hashlib.md5()
        for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER):
            for fname in fileList:
                if fname.startswith('from-collaborator'):
                    full_file_path = os.path.join(dirName, fname)
                    dataset_list = []
                    h.update(os.fsencode(full_file_path))
                    submission_id = 'IMP_FC_{}'.format(str(int(h.hexdigest(), 16)))
                    book = pyexcel.get_book(file_name=full_file_path)
                    idx = 1
                    # print('----> {}'.format(full_file_path))
                    while idx < book.number_of_sheets():
                        sheet = book.sheet_by_index(idx)
                        dataset_data = {}
                        dataset_data['source_type'] = 'From_Collaborator'
                        dataset_data['submission_id'] = submission_id
                        dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
                        dataset_data['title'] = sheet[4, 2]
                        datatype_info = process_data_types(get_value_list_from_row(sheet, 5))
                        dataset_data['data_types'] = datatype_info[0]
                        if datatype_info[1]:
                            dataset_data['data_type_notes'] = datatype_info[1]
                        dataset_data['involves_samples'] = process_yes_no_answer(sheet[6, 2])
                        if sheet[7, 2]:
                            dataset_data['samples_location'] = sheet[7, 2]
                        if sheet[8, 2]:
                            dataset_data['de_identification'] = sheet[8, 2]
                        if sheet[9, 2]:
                            dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
                        if sheet[10, 2]:
                            dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
                                sheet[10, 2])
                        if dataset_data.get('has_special_subjects'):
                            if dataset_data.get('has_special_subjects') == True and sheet[11, 2]:
                                dataset_data['special_subject_notes'] = sheet[11, 2]
                        collab_insts = get_value_list_from_row(sheet, 13)
                        collab_pis = get_value_list_from_row(sheet, 14)
                        if len(collab_insts) == len(collab_pis) and len(collab_insts) > 0:
                            i = 0
                            src_collab_list = []
                            while i < len(collab_insts):
                                collab_data = {'collab_inst': collab_insts[i],
                                               'collab_pi': collab_pis[i],
                                               'collab_project': sheet[18, 2]}
                                if process_yes_no_dontknow_answer(sheet[17, 2]) == False:
                                    collab_data['collab_role'] = 'controller'
                                elif process_yes_no_dontknow_answer(sheet[17, 2]) == True:
                                    collab_data['collab_role'] = 'joint-controller'
                                src_collab_list.append(collab_data)
                                i += 1
                            dataset_data['source_collaborations'] = src_collab_list
                        else:
                            print('Mismatched Collab PI-Institution length {} \n'.format(full_file_path))
                        if sheet[18, 2]:
                            dataset_data['source_project'] = sheet[18, 2]
                        use_restrictions = []
                        if process_yes_no_answer(sheet[25, 2]):
                            use_restrictions.append({'ga4gh_code': 'PS',
                                                     'note': 'Use is restricted to projects: ' + ', '.join(
                                                         get_value_list_from_row(sheet, 26))})
                        if process_yes_no_answer(sheet[27, 2]):
                            use_restrictions.append({'ga4gh_code': 'RS-[XX]',
                                                     'note': 'Use is restricted to research areas: ' + ', '.join(
                                                         get_value_list_from_row(sheet, 28))})
                        if process_yes_no_answer(sheet[43, 2]):
                            use_restrictions.append({'ga4gh_code': 'PUB',
                                                     'note': 'Acknowledgement required.'})
                        has_time_limits = process_yes_no_dontknow_answer(sheet[41, 2])
                        if has_time_limits and sheet[42, 2]:
                            use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                                     'note': 'Data is obtained for a limited duration. ' + sheet[42, 2]})
                        dataset_data['use_restrictions'] = use_restrictions
                        dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 33)
                        if process_yes_no_answer(sheet[29, 2]):
                            shares = get_value_list_from_row(sheet, 30)
                            if len(shares) > 0:
                                share_list = []
                                for shr in shares:
                                    share_list.append({'share_notes': shr})
                                dataset_data['shares'] = share_list
                        storage_locations = []
                        master_locations = get_value_list_from_row(sheet, 35)
                        try:
                            add_storage_locations(storage_locations, master_locations, 'master')
                        except ValueError as e:
                            print('Invalid Master Data Location Row {} \n'.format(full_file_path))
                        if process_yes_no_answer(sheet[37, 2]):
                            backup_locations = get_value_list_from_row(sheet, 38)
                            try:
                                add_storage_locations(storage_locations, backup_locations, 'backup')
                            except ValueError as e:
                                print('Uneven Backup Data Location Row {} \n'.format(full_file_path))
                        if process_yes_no_answer(sheet[39, 2]):
                            copy_locations = get_value_list_from_row(sheet, 40)
                            try:
                                add_storage_locations(storage_locations, copy_locations, 'copy')
                            except ValueError as e:
                                print('Uneven Copy Data Location Row {} \n'.format(full_file_path))
                        acl_list = get_value_list_from_row(sheet, 36)
                        if len(acl_list) > 0:
                            dataset_data['storage_acl_info'] = ', '.join(acl_list)
                        dataset_data['storage_locations'] = storage_locations
                        dataset_list.append(dataset_data)
                        idx += 1
                    with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
                        json.dump(dataset_list, outfile, indent=4)
                    # print(json.dumps(dataset_list, indent=4))
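Each record appended to the datasets-IMP_FC_<hash>.json file above is a flat dict keyed as in the extractor; a trimmed, purely illustrative record (invented values, optional keys omitted) looks roughly like this:

# Illustrative only: approximate shape of one exported from-collaborator record.
example_dataset = {
    'source_type': 'From_Collaborator',
    'submission_id': 'IMP_FC_123456789',  # md5 of the sheet path rendered as an integer
    'local_custodian': ['Jane Doe'],
    'title': 'Example dataset received from a collaborator',
    'data_types': ['Example_data_type'],
    'involves_samples': False,
    'use_restrictions': [{'ga4gh_code': 'PUB', 'note': 'Acknowledgement required.'}],
    'used_by_projects': ['PROJECT-A'],
    'storage_locations': [],  # entries are filled in by add_storage_locations (see the sketch after the next file)
}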
import hashlib
import json
import os
from unittest import TestCase

import pyexcel

from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_dontknow_answer, \
    process_yes_no_answer, add_storage_locations, SHEETS_FOLDER


class TestProjectsParser(TestCase):

    def test_export_from_repository(self):
        h = hashlib.md5()
        for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER):
            for fname in fileList:
                if fname.startswith('from-repository'):
                    full_file_path = os.path.join(dirName, fname)
                    dataset_list = []
                    h.update(os.fsencode(full_file_path))
                    submission_id = 'IMP_FR_{}'.format(str(int(h.hexdigest(), 16)))
                    book = pyexcel.get_book(file_name=full_file_path)
                    idx = 1
                    # print('----> {}'.format(full_file_path))
                    while idx < book.number_of_sheets():
                        sheet = book.sheet_by_index(idx)
                        dataset_data = {}
                        dataset_data['source_type'] = 'From_Repository'
                        dataset_data['submission_id'] = submission_id
                        dataset_data['local_custodian'] = get_value_list_from_row(sheet, 2)
                        dataset_data['title'] = sheet[5, 2]
                        if sheet[4, 2]:
                            dataset_data['other_external_id'] = sheet[4, 2]
                        datatype_info = process_data_types(get_value_list_from_row(sheet, 7))
                        dataset_data['data_types'] = datatype_info[0]
                        if datatype_info[1]:
                            dataset_data['data_type_notes'] = datatype_info[1]
                        if sheet[8, 2]:
                            dataset_data['de_identification'] = sheet[8, 2]
                        if sheet[9, 2]:
                            dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
                        if sheet[10, 2]:
                            dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
                                sheet[10, 2])
                        if dataset_data.get('has_special_subjects'):
                            if dataset_data.get('has_special_subjects') == True and sheet[11, 2]:
                                dataset_data['special_subject_notes'] = sheet[11, 2]
                        if sheet[14, 2]:
                            dataset_data['access_category'] = sheet[14, 2]
                        dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 19)
                        use_restrictions = []
                        if process_yes_no_answer(sheet[17, 2]):
                            use_restrictions.append({'ga4gh_code': 'PS',
                                                     'note': 'Contract restricts data use to projects ' + ', '.join(
                                                         get_value_list_from_row(sheet, 18))})
                        has_time_limits = process_yes_no_dontknow_answer(sheet[27, 2])
                        if has_time_limits and sheet[28, 2]:
                            use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                                     'note': 'Data is obtained for a limited duration. ' + sheet[28, 2]})
                        if process_yes_no_answer(sheet[29, 2]):
                            use_restrictions.append({'ga4gh_code': 'PUB',
                                                     'note': 'Acknowledgement required.'})
                        dataset_data['use_restrictions'] = use_restrictions
                        storage_locations = []
                        master_locations = get_value_list_from_row(sheet, 21)
                        try:
                            add_storage_locations(storage_locations, master_locations, 'master')
                        except ValueError as e:
                            print('Invalid Master Data Location Row {} \n'.format(full_file_path))
                        if process_yes_no_answer(sheet[23, 2]):
                            backup_locations = get_value_list_from_row(sheet, 24)
                            try:
                                add_storage_locations(storage_locations, backup_locations, 'backup')
                            except ValueError as e:
                                print('Uneven Backup Data Location Row {} \n'.format(full_file_path))
                        if process_yes_no_answer(sheet[25, 2]):
                            copy_locations = get_value_list_from_row(sheet, 26)
                            try:
                                add_storage_locations(storage_locations, copy_locations, 'copy')
                            except ValueError as e:
                                print('Uneven Copy Data Location Row {} \n'.format(full_file_path))
                        acl_list = get_value_list_from_row(sheet, 22)
                        if len(acl_list) > 0:
                            dataset_data['storage_acl_info'] = ', '.join(acl_list)
                        dataset_data['storage_locations'] = storage_locations
                        dataset_list.append(dataset_data)
                        idx += 1
                    with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
                        json.dump(dataset_list, outfile, indent=4)
                    # print(json.dumps(dataset_list, indent=4))
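add_storage_locations is imported from tests/importxls/test_utils.py and is not part of this diff; judging only from how it is called above (it appends to the passed list and raises ValueError on malformed location rows), a hypothetical sketch could look like the following — the real helper may differ:

# Hypothetical sketch of the test_utils helper used above; the field names and the
# (resource, path) pairing assumption are guesses, not the actual implementation.
def add_storage_locations(storage_locations, location_values, storage_type):
    if not location_values or len(location_values) % 2 != 0:
        # Callers report this as an invalid/uneven data-location row.
        raise ValueError('Uneven storage location row')
    for resource, path in zip(location_values[0::2], location_values[1::2]):
        storage_locations.append({'storage_resource': resource,
                                  'location': path,
                                  'storage_type': storage_type})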
from unittest import TestCase
import os
import json
import datetime
from .test_utils import collect_prj_info, get_value_list_from_row, process_data_types, process_yes_no_answer, \
process_yes_no_dontknow_answer, is_data_sheet, add_storage_locations, get_names_from_string
import hashlib
import pyexcel
import json
import os
from unittest import TestCase
class TestProjectsParser(TestCase):
import pyexcel
SHEETS_FOLDER = '/Users/pinar_alper/desktop/test-ANSWERS'
# def test_prj_refs_validity(self):
#
# defined_projects = set()
# for prj, path, title, description, pi, start, end, personnel, HasUniLUERP, ERPNotes, HasCNER, CNERNotes, publications in collect_prj_info(
# self.SHEETS_FOLDER):
# defined_projects.add(prj)
#
# for dirName, subdirList, fileList in os.walk(self.SHEETS_FOLDER):
# for fname in fileList:
# if is_data_sheet(fname):
# full_file_path = os.path.join(dirName, fname)
# book = pyexcel.get_book(file_name=full_file_path)
# idx = 0
# while idx < book.number_of_sheets():
# if idx > 0 and fname.startswith('from-repository'):
# # print('---->{}'.format(fname))
# prj_refs = set(get_value_list_from_row(book.sheet_by_index(idx),
# 18) + get_value_list_from_row(
# book.sheet_by_index(idx), 19))
# undefined_refs = prj_refs.difference(defined_projects)
# if len(undefined_refs) > 0:
# print('Reference to undefined project(s): {} in file {}'.format(undefined_refs,
# full_file_path))
# if idx > 0 and fname.startswith('from-collaborator'):
# # print('---->{}'.format(full_file_path))
# prj_refs = set(get_value_list_from_row(book.sheet_by_index(idx),
# 18) + get_value_list_from_row(
# book.sheet_by_index(idx), 26) + get_value_list_from_row(book.sheet_by_index(idx),
# 33))
# undefined_refs = prj_refs.difference(defined_projects)
# if len(undefined_refs) > 0:
# print('Reference to undefined project(s): {} in file {} '.format(undefined_refs,
# full_file_path))
# if idx > 0 and fname.startswith('own-cohort'):
# prj_refs = set(get_value_list_from_row(book.sheet_by_index(idx),
# 5) + get_value_list_from_row(
# book.sheet_by_index(idx), 22))
# undefined_refs = prj_refs.difference(defined_projects)
# if len(undefined_refs) > 0:
# print('Reference to undefined project(s): {} in file {} '.format(undefined_refs,
# full_file_path))
# idx += 1
# return
#
# def test_export_projects(self):
#
# projects_list = []
# for acr, path, title, description, pi, start, end, personnel, HasUniLUERP, ERPNotes, HasCNER, CNERNotes, publications in collect_prj_info(
# self.SHEETS_FOLDER):
# prj_data = {}
# prj_data['acronym'] = acr
# prj_data['title'] = title
# prj_data['description'] = description
# if type(start) is datetime.date:
# prj_data['start_date'] = start.strftime('%m/%d/%Y')
# elif type(start) is str:
# prj_data['start_date'] = start.replace('.', '/')
#
# if type(end) is datetime.date:
# prj_data['end_date'] = end.strftime('%m/%d/%Y')
# elif type(end) is str:
# prj_data['end_date'] = end.replace('.', '/')
# contacts_list = []
# delimeter = ','
# if ';' in pi:
# delimeter = ';'
# if pi:
# for pp in pi.split(delimeter):
# pp_data = {}
# name_list = get_names_from_string(pp)
# pp_data['first_name'] = name_list[0]
# pp_data['last_name'] = name_list[1]
# pp_data['role'] = 'Principal_Investigator'
# pp_data['institution'] = 'Luxembourg Center for Systems Biomedicine (LCSB)'
# contacts_list.append(pp_data)
# delimeter = ','
# if ';' in personnel:
# delimeter = ';'
# if personnel:
# for prs in personnel.split(delimeter):
# prs_data = {}
# name_list = get_names_from_string(prs)
# prs_data['first_name'] = name_list[0]
# prs_data['last_name'] = name_list[1]
# prs_data['role'] = 'Researcher'
# prs_data['institution'] = 'Luxembourg Center for Systems Biomedicine (LCSB)'
# contacts_list.append(prs_data)
# prj_data['contacts'] = contacts_list
# if HasUniLUERP:
# prj_data[
# 'has_institutional_ethics_approval'] = True if HasUniLUERP == 'Yes' else False
# else:
# prj_data['has_institutional_ethics_approval'] = False
#
# if ERPNotes:
# prj_data['institutional_ethics_approval_notes'] = ERPNotes
#
# if HasCNER:
# prj_data['has_national_ethics_approval'] = True if HasUniLUERP == 'Yes' else False
# else:
# pp_data['has_national_ethics_approval'] = False
# if CNERNotes:
# prj_data['national_ethics_approval_notes'] = CNERNotes
#
# if publications:
# publication_list = []
# for pub in publications.split('#'):
# pub_data = {}
# pub_data['citation_string'] = pub
# publication_list.append(pub_data)
# prj_data['publications'] = publication_list
# projects_list.append(prj_data)
# with open('projects.json', 'w') as outfile:
# json.dump(projects_list, outfile, indent=4)
# # print(json.dumps(projects_list, indent=4))
#
#
#
#
# def test_export_from_collaborator(self):
# h = hashlib.md5()
#
# for dirName, subdirList, fileList in os.walk(self.SHEETS_FOLDER):
# for fname in fileList:
# if fname.startswith('from-collaborator'):
# full_file_path = os.path.join(dirName, fname)
# dataset_list = []
# h.update(os.fsencode(full_file_path))
# submission_id = 'IMP_FC_{}'.format(str(int(h.hexdigest(), 16)))
# book = pyexcel.get_book(file_name=full_file_path)
# idx = 1
# # print('----> {}'.format(full_file_path))
# while idx < book.number_of_sheets():
# sheet = book.sheet_by_index(idx)
# dataset_data = {}
# dataset_data['source_type'] = 'From_Collaborator'
# dataset_data['submission_id'] = submission_id
# dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
# dataset_data['title'] = sheet[4, 2]
# datatype_info = process_data_types(get_value_list_from_row(sheet, 5))
# dataset_data['data_types'] = datatype_info[0]
# if datatype_info[1]:
# dataset_data['data_type_notes'] = datatype_info[1]
#
# dataset_data['involves_samples'] = process_yes_no_answer(sheet[6, 2])
#
# if sheet[7, 2]:
# dataset_data['samples_location'] = sheet[7, 2]
#
# if sheet[8, 2]:
# dataset_data['de_identification'] = sheet[8, 2]
#
# if sheet[9, 2]:
# dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
#
# if sheet[10, 2]:
# dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
# sheet[10, 2])
#
# if dataset_data.get('has_special_subjects'):
# if dataset_data.get('has_special_subjects') == True and sheet[11, 2]:
# dataset_data['special_subject_notes'] = sheet[11, 2]
#
# collab_insts = get_value_list_from_row(sheet, 13)
# collab_pis = get_value_list_from_row(sheet, 14)
#
# if len(collab_insts) == len(collab_pis) and len(collab_insts) > 0:
# i = 0
# src_collab_list = []
# while i < len(collab_insts):
#
# collab_data = {'collab_inst': collab_insts[i],
# 'collab_pi': collab_pis[i],
# 'collab_project': sheet[18, 2]}
#
# if process_yes_no_dontknow_answer(sheet[17, 2]) == False:
# collab_data['collab_role'] = 'controller'
#
# elif process_yes_no_dontknow_answer(sheet[17, 2]) == True:
# collab_data['collab_role'] = 'joint-controller'
#
# src_collab_list.append(collab_data)
# i += 1
# dataset_data['source_collaborations'] = src_collab_list
# else:
# print('Mismatched Collab PI-Institution length {} \n'.format(full_file_path))
#
# if sheet[18, 2]:
# dataset_data['source_project'] = sheet[18, 2]
#
# use_restrictions = []
# if process_yes_no_answer(sheet[25, 2]):
# use_restrictions.append({'ga4gh_code': 'PS',
# 'note': 'Use is restricted to projects: ' + ', '.join(
# get_value_list_from_row(sheet, 26))})
# if process_yes_no_answer(sheet[27, 2]):
# use_restrictions.append({'ga4gh_code': 'RS-[XX]',
# 'note': 'Use is restricted to research areas: ' + ', '.join(
# get_value_list_from_row(sheet, 28))})
#
# dataset_data['use_restrictions'] = use_restrictions
#
# dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 33)
#
# if process_yes_no_answer(sheet[29, 2]):
# shares = get_value_list_from_row(sheet, 30)
# if len(shares) > 0:
# share_list = []
# for shr in shares:
# share_list.append({'share_notes': shr})
# dataset_data['shares'] = share_list
#
# storage_locations = []
#
# master_locations = get_value_list_from_row(sheet, 35)
# try:
# add_storage_locations(storage_locations,master_locations, 'master')
# except ValueError as e:
# print('Invalid Master Data Location Row {} \n'.format(full_file_path))
#
# if process_yes_no_answer(sheet[37, 2]):
# backup_locations = get_value_list_from_row(sheet, 38)
# try:
# add_storage_locations(storage_locations,backup_locations, 'backup')
# except ValueError as e:
# print('Uneven Backup Data Location Row {} \n'.format(full_file_path))
#
# if process_yes_no_answer(sheet[39, 2]):
# copy_locations = get_value_list_from_row(sheet, 40)
# try:
# add_storage_locations(storage_locations,copy_locations, 'copy')
# except ValueError as e:
# print('Uneven Copy Data Location Row {} \n'.format(full_file_path))
#
# acl_list = get_value_list_from_row(sheet, 36)
# if len(acl_list)>0:
# dataset_data['storage_acl_info'] = ', '.join(acl_list)
# dataset_data['storage_locations'] = storage_locations
# dataset_list.append(dataset_data)
# idx += 1
#
# with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
# json.dump(dataset_list, outfile, indent=4)
# # print(json.dumps(dataset_list, indent=4))
from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \
process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER
class TestProjectsParser(TestCase):
def test_export_own_cohort(self):
h = hashlib.md5()
for dirName, subdirList, fileList in os.walk(self.SHEETS_FOLDER):
for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER):
for fname in fileList:
if fname.startswith('own-cohort'):
full_file_path = os.path.join(dirName, fname)
@@ -309,8 +61,8 @@ class TestProjectsParser(TestCase):
if dataset_data.get('has_special_subjects') == True and sheet[13, 2]:
dataset_data['special_subject_notes'] = sheet[13, 2]
if sheet[19,2]:
dataset_data['consent_status'] = sheet[19,2]
if sheet[19, 2]:
dataset_data['consent_status'] = sheet[19, 2]
dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 22)
@@ -337,8 +89,12 @@ class TestProjectsParser(TestCase):
use_restrictions.append({'ga4gh_code': 'GS-[XX]',
'note': 'Data is consented for sharing outside EU'})
dataset_data['use_restrictions'] = use_restrictions
has_time_limits = process_yes_no_dontknow_answer(sheet[42, 2])
if has_time_limits and sheet[43, 2]:
use_restrictions.append({'ga4gh_code': 'TS-[XX]',
'note': 'Data is obtained for a limited duration. ' + sheet[43, 2]})
dataset_data['use_restrictions'] = use_restrictions
share_list = []
@@ -346,20 +102,19 @@ class TestProjectsParser(TestCase):
luxembourg_shares = get_value_list_from_row(sheet, 28)
if len(luxembourg_shares) > 0:
for shr in luxembourg_shares:
share_list.append({'share_notes': shr, 'share_location_type':'National'})
share_list.append({'share_notes': shr, 'share_location_type': 'National'})
if process_yes_no_answer(sheet[30, 2]):
eu_shares = get_value_list_from_row(sheet, 31)
if len(eu_shares) > 0:
for shr in eu_shares:
share_list.append({'share_notes': shr, 'share_location_type':'EU'})
share_list.append({'share_notes': shr, 'share_location_type': 'EU'})
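The yes/no parsing helpers are likewise defined in test_utils and not shown in this commit; the way their results are used above (plain truthiness for process_yes_no_answer, explicit == True / == False for process_yes_no_dontknow_answer) suggests contracts along these lines (assumed sketches, not the actual code):

# Assumed sketches of the answer-parsing helpers, inferred from their call sites.
def process_yes_no_answer(cell_value):
    # 'Yes' -> True, anything else -> False
    return str(cell_value).strip().lower() == 'yes'


def process_yes_no_dontknow_answer(cell_value):
    # 'Yes' -> True, 'No' -> False, anything else (e.g. "Don't know") -> None
    answer = str(cell_value).strip().lower()
    if answer == 'yes':
        return True
    if answer == 'no':
        return False
    return None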