elixir / metadata-tools · Commits

Commit 4e393f4c
authored Apr 23, 2018 by Pinar Alper

    Added data extractor for from-repository sheets.

parent 377b5ee1
Changes: 9 files
metadata-tools/resources/elx-dataset.json

@@ -34,6 +34,12 @@
                 }
             ]
         },
+        "other_external_id": {
+            "type": "string"
+        },
+        "access_category": {
+            "type": "string"
+        },
         "use_restrictions": {
             "type": "array",
             "items": {
@@ -69,7 +75,8 @@
                 }
             },
             "required": [
-                "ga4gh_code"
+                "ga4gh_code",
+                "note"
             ]
         }
     },
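To make the schema change concrete, here is a minimal sketch (not from the repository) of a record exercising the new fields, checked with the jsonschema package that setup.py below requires. The schema path and all field values are illustrative assumptions, and a complete record would also need whatever other required properties the schema declares.

# Minimal sketch, not from the repository: exercises the fields added above.
import json

import jsonschema  # declared in setup.py's requirements (see below)

with open('resources/elx-dataset.json') as f:  # path assumed
    schema = json.load(f)

record = {
    'other_external_id': 'EGAD00000000001',   # hypothetical external accession
    'access_category': 'controlled-access',   # hypothetical value
    'use_restrictions': [
        # per the second hunk, both 'ga4gh_code' and 'note' are now required
        {'ga4gh_code': 'PS', 'note': 'Use is restricted to projects: X'},
    ],
}

# Raises jsonschema.ValidationError if, e.g., a restriction lacked 'note'.
jsonschema.validate(record, schema)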
setup.py

@@ -7,7 +7,7 @@ except ImportError:
     from distutils.core import setup

-requirements = ['jsonschema']
+requirements = ['jsonschema', 'pyexcel', 'pyexcel-xls']

 test_requirements = [
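The two new requirements back the tests added below: pyexcel provides the workbook API the tests call, and pyexcel-xls is the plugin that lets it open .xls files. A sketch of the calls involved (the file name is invented):

# Sketch of the pyexcel calls the new tests rely on; file name invented.
import pyexcel  # pyexcel-xls must be installed for .xls support

book = pyexcel.get_book(file_name='from-repository-example.xls')
print(book.number_of_sheets())  # the tests below iterate from index 1, skipping sheet 0
sheet = book.sheet_by_index(1)
print(sheet[5, 2])              # cell access by (row, column), zero-based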
tests/importxls/__init__.py  (new file, 0 → 100644)
tests/importxls/test_from_collab.py  (new file, 0 → 100644)

import hashlib
import json
import os
from unittest import TestCase

import pyexcel

from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \
    process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER


class TestProjectsParser(TestCase):

    def test_export_from_collaborator(self):
        # Convert every 'from-collaborator' workbook under SHEETS_FOLDER into
        # a list of dataset records, written to one JSON file per workbook.
        h = hashlib.md5()

        for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER):
            for fname in fileList:
                if fname.startswith('from-collaborator'):
                    full_file_path = os.path.join(dirName, fname)
                    dataset_list = []
                    h.update(os.fsencode(full_file_path))
                    submission_id = 'IMP_FC_{}'.format(str(int(h.hexdigest(), 16)))
                    book = pyexcel.get_book(file_name=full_file_path)
                    idx = 1  # sheet 0 is skipped; each later sheet is one dataset
                    # print('----> {}'.format(full_file_path))
                    while idx < book.number_of_sheets():
                        sheet = book.sheet_by_index(idx)
                        dataset_data = {}
                        dataset_data['source_type'] = 'From_Collaborator'
                        dataset_data['submission_id'] = submission_id
                        dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
                        dataset_data['title'] = sheet[4, 2]
                        datatype_info = process_data_types(get_value_list_from_row(sheet, 5))
                        dataset_data['data_types'] = datatype_info[0]
                        if datatype_info[1]:
                            dataset_data['data_type_notes'] = datatype_info[1]

                        dataset_data['involves_samples'] = process_yes_no_answer(sheet[6, 2])

                        if sheet[7, 2]:
                            dataset_data['samples_location'] = sheet[7, 2]

                        if sheet[8, 2]:
                            dataset_data['de_identification'] = sheet[8, 2]

                        if sheet[9, 2]:
                            dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')

                        if sheet[10, 2]:
                            dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(sheet[10, 2])

                        if dataset_data.get('has_special_subjects'):
                            if dataset_data.get('has_special_subjects') == True and sheet[11, 2]:
                                dataset_data['special_subject_notes'] = sheet[11, 2]

                        # Collaborator institutions and PIs come from two parallel rows.
                        collab_insts = get_value_list_from_row(sheet, 13)
                        collab_pis = get_value_list_from_row(sheet, 14)

                        if len(collab_insts) == len(collab_pis) and len(collab_insts) > 0:
                            i = 0
                            src_collab_list = []
                            while i < len(collab_insts):
                                collab_data = {'collab_inst': collab_insts[i],
                                               'collab_pi': collab_pis[i],
                                               'collab_project': sheet[18, 2]}

                                if process_yes_no_dontknow_answer(sheet[17, 2]) == False:
                                    collab_data['collab_role'] = 'controller'
                                elif process_yes_no_dontknow_answer(sheet[17, 2]) == True:
                                    collab_data['collab_role'] = 'joint-controller'

                                src_collab_list.append(collab_data)
                                i += 1
                            dataset_data['source_collaborations'] = src_collab_list
                        else:
                            print('Mismatched Collab PI-Institution length {} \n'.format(full_file_path))

                        if sheet[18, 2]:
                            dataset_data['source_project'] = sheet[18, 2]

                        # GA4GH-coded use restrictions.
                        use_restrictions = []
                        if process_yes_no_answer(sheet[25, 2]):
                            use_restrictions.append({'ga4gh_code': 'PS',
                                                     'note': 'Use is restricted to projects: ' + ', '.join(
                                                         get_value_list_from_row(sheet, 26))})
                        if process_yes_no_answer(sheet[27, 2]):
                            use_restrictions.append({'ga4gh_code': 'RS-[XX]',
                                                     'note': 'Use is restricted to research areas: ' + ', '.join(
                                                         get_value_list_from_row(sheet, 28))})
                        if process_yes_no_answer(sheet[43, 2]):
                            use_restrictions.append({'ga4gh_code': 'PUB',
                                                     'note': 'Acknowledgement required.'})

                        has_time_limits = process_yes_no_dontknow_answer(sheet[41, 2])
                        if has_time_limits and sheet[42, 2]:
                            use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                                     'note': 'Data is obtained for a limited duration. ' + sheet[42, 2]})

                        dataset_data['use_restrictions'] = use_restrictions

                        dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 33)

                        if process_yes_no_answer(sheet[29, 2]):
                            shares = get_value_list_from_row(sheet, 30)
                            if len(shares) > 0:
                                share_list = []
                                for shr in shares:
                                    share_list.append({'share_notes': shr})
                                dataset_data['shares'] = share_list

                        # Master/backup/copy storage locations.
                        storage_locations = []

                        master_locations = get_value_list_from_row(sheet, 35)
                        try:
                            add_storage_locations(storage_locations, master_locations, 'master')
                        except ValueError as e:
                            print('Invalid Master Data Location Row {} \n'.format(full_file_path))

                        if process_yes_no_answer(sheet[37, 2]):
                            backup_locations = get_value_list_from_row(sheet, 38)
                            try:
                                add_storage_locations(storage_locations, backup_locations, 'backup')
                            except ValueError as e:
                                print('Uneven Backup Data Location Row {} \n'.format(full_file_path))

                        if process_yes_no_answer(sheet[39, 2]):
                            copy_locations = get_value_list_from_row(sheet, 40)
                            try:
                                add_storage_locations(storage_locations, copy_locations, 'copy')
                            except ValueError as e:
                                print('Uneven Copy Data Location Row {} \n'.format(full_file_path))

                        acl_list = get_value_list_from_row(sheet, 36)
                        if len(acl_list) > 0:
                            dataset_data['storage_acl_info'] = ', '.join(acl_list)
                        dataset_data['storage_locations'] = storage_locations
                        dataset_list.append(dataset_data)
                        idx += 1

                    with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
                        json.dump(dataset_list, outfile, indent=4)
                    # print(json.dumps(dataset_list, indent=4))
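For orientation, each answer sheet yields one record roughly like the following (all values invented for illustration) before the per-file list is dumped to datasets-IMP_FC_<hash>.json:

# Hypothetical shape of a single extracted record; every value is invented.
example_record = {
    'source_type': 'From_Collaborator',
    'submission_id': 'IMP_FC_1234567890',   # decimal rendering of the MD5 digest
    'local_custodian': ['Jane Doe'],
    'title': 'Example cohort transfer',
    'data_types': ['RNASeq'],
    'involves_samples': False,
    'use_restrictions': [
        {'ga4gh_code': 'PUB', 'note': 'Acknowledgement required.'},
    ],
    'used_by_projects': ['PROJ-X'],
    'storage_acl_info': 'jane.doe',
    'storage_locations': [],                # filled in by add_storage_locations
}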
tests/importxls/test_from_repo.py  (new file, 0 → 100644)

import hashlib
import json
import os
from unittest import TestCase

import pyexcel

from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_dontknow_answer, \
    process_yes_no_answer, add_storage_locations, SHEETS_FOLDER


class TestProjectsParser(TestCase):

    def test_export_from_repository(self):
        # Same structure as the from-collaborator extractor, but for
        # 'from-repository' workbooks, whose sheets use different row indices.
        h = hashlib.md5()

        for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER):
            for fname in fileList:
                if fname.startswith('from-repository'):
                    full_file_path = os.path.join(dirName, fname)
                    dataset_list = []
                    h.update(os.fsencode(full_file_path))
                    submission_id = 'IMP_FR_{}'.format(str(int(h.hexdigest(), 16)))
                    book = pyexcel.get_book(file_name=full_file_path)
                    idx = 1  # sheet 0 is skipped; each later sheet is one dataset
                    # print('----> {}'.format(full_file_path))
                    while idx < book.number_of_sheets():
                        sheet = book.sheet_by_index(idx)
                        dataset_data = {}
                        dataset_data['source_type'] = 'From_Repository'
                        dataset_data['submission_id'] = submission_id
                        dataset_data['local_custodian'] = get_value_list_from_row(sheet, 2)
                        dataset_data['title'] = sheet[5, 2]

                        if sheet[4, 2]:
                            dataset_data['other_external_id'] = sheet[4, 2]

                        datatype_info = process_data_types(get_value_list_from_row(sheet, 7))
                        dataset_data['data_types'] = datatype_info[0]
                        if datatype_info[1]:
                            dataset_data['data_type_notes'] = datatype_info[1]

                        if sheet[8, 2]:
                            dataset_data['de_identification'] = sheet[8, 2]

                        if sheet[9, 2]:
                            dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')

                        if sheet[10, 2]:
                            dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(sheet[10, 2])

                        if dataset_data.get('has_special_subjects'):
                            if dataset_data.get('has_special_subjects') == True and sheet[11, 2]:
                                dataset_data['special_subject_notes'] = sheet[11, 2]

                        if sheet[14, 2]:
                            dataset_data['access_category'] = sheet[14, 2]

                        dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 19)

                        # GA4GH-coded use restrictions.
                        use_restrictions = []
                        if process_yes_no_answer(sheet[17, 2]):
                            use_restrictions.append({'ga4gh_code': 'PS',
                                                     'note': 'Contract restricts data use to projects ' + ', '.join(
                                                         get_value_list_from_row(sheet, 18))})

                        has_time_limits = process_yes_no_dontknow_answer(sheet[27, 2])
                        if has_time_limits and sheet[28, 2]:
                            use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                                     'note': 'Data is obtained for a limited duration. ' + sheet[28, 2]})

                        if process_yes_no_answer(sheet[29, 2]):
                            use_restrictions.append({'ga4gh_code': 'PUB',
                                                     'note': 'Acknowledgement required.'})

                        dataset_data['use_restrictions'] = use_restrictions

                        # Master/backup/copy storage locations.
                        storage_locations = []

                        master_locations = get_value_list_from_row(sheet, 21)
                        try:
                            add_storage_locations(storage_locations, master_locations, 'master')
                        except ValueError as e:
                            print('Invalid Master Data Location Row {} \n'.format(full_file_path))

                        if process_yes_no_answer(sheet[23, 2]):
                            backup_locations = get_value_list_from_row(sheet, 24)
                            try:
                                add_storage_locations(storage_locations, backup_locations, 'backup')
                            except ValueError as e:
                                print('Uneven Backup Data Location Row {} \n'.format(full_file_path))

                        if process_yes_no_answer(sheet[25, 2]):
                            copy_locations = get_value_list_from_row(sheet, 26)
                            try:
                                add_storage_locations(storage_locations, copy_locations, 'copy')
                            except ValueError as e:
                                print('Uneven Copy Data Location Row {} \n'.format(full_file_path))

                        acl_list = get_value_list_from_row(sheet, 22)
                        if len(acl_list) > 0:
                            dataset_data['storage_acl_info'] = ', '.join(acl_list)
                        dataset_data['storage_locations'] = storage_locations
                        dataset_list.append(dataset_data)
                        idx += 1

                    with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
                        json.dump(dataset_list, outfile, indent=4)
                    # print(json.dumps(dataset_list, indent=4))
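Both extractors derive their submission ids the same way, mirroring the code above (the path here is invented):

# Sketch of the shared submission-id scheme; the path is invented.
import hashlib
import os

h = hashlib.md5()
h.update(os.fsencode('/sheets/from-repository-example.xls'))
submission_id = 'IMP_FR_{}'.format(str(int(h.hexdigest(), 16)))  # digest as decimal

# Note: the tests reuse one md5 object across the os.walk loop, so each id
# also depends on every previously visited file path and on traversal order.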
tests/importxls/test_survey_xls_parsers.py → tests/importxls/test_own_cohort.py
-from unittest import TestCase
-import os
-import json
-import datetime
-from .test_utils import collect_prj_info, get_value_list_from_row, process_data_types, process_yes_no_answer, \
-    process_yes_no_dontknow_answer, is_data_sheet, add_storage_locations, get_names_from_string
-import hashlib
-import pyexcel
+import hashlib
+import json
+import os
+from unittest import TestCase
+import pyexcel
+from tests.importxls.test_utils import get_value_list_from_row, process_data_types, process_yes_no_answer, \
+    process_yes_no_dontknow_answer, add_storage_locations, SHEETS_FOLDER

 class TestProjectsParser(TestCase):
-    SHEETS_FOLDER = '/Users/pinar_alper/desktop/test-ANSWERS'

The rename also drops the following block of commented-out legacy tests (removed lines):
# def test_prj_refs_validity(self):
#
# defined_projects = set()
# for prj, path, title, description, pi, start, end, personnel, HasUniLUERP, ERPNotes, HasCNER, CNERNotes, publications in collect_prj_info(
# self.SHEETS_FOLDER):
# defined_projects.add(prj)
#
# for dirName, subdirList, fileList in os.walk(self.SHEETS_FOLDER):
# for fname in fileList:
# if is_data_sheet(fname):
# full_file_path = os.path.join(dirName, fname)
# book = pyexcel.get_book(file_name=full_file_path)
# idx = 0
# while idx < book.number_of_sheets():
# if idx > 0 and fname.startswith('from-repository'):
# # print('---->{}'.format(fname))
# prj_refs = set(get_value_list_from_row(book.sheet_by_index(idx),
# 18) + get_value_list_from_row(
# book.sheet_by_index(idx), 19))
# undefined_refs = prj_refs.difference(defined_projects)
# if len(undefined_refs) > 0:
# print('Reference to undefined project(s): {} in file {}'.format(undefined_refs,
# full_file_path))
# if idx > 0 and fname.startswith('from-collaborator'):
# # print('---->{}'.format(full_file_path))
# prj_refs = set(get_value_list_from_row(book.sheet_by_index(idx),
# 18) + get_value_list_from_row(
# book.sheet_by_index(idx), 26) + get_value_list_from_row(book.sheet_by_index(idx),
# 33))
# undefined_refs = prj_refs.difference(defined_projects)
# if len(undefined_refs) > 0:
# print('Reference to undefined project(s): {} in file {} '.format(undefined_refs,
# full_file_path))
# if idx > 0 and fname.startswith('own-cohort'):
# prj_refs = set(get_value_list_from_row(book.sheet_by_index(idx),
# 5) + get_value_list_from_row(
# book.sheet_by_index(idx), 22))
# undefined_refs = prj_refs.difference(defined_projects)
# if len(undefined_refs) > 0:
# print('Reference to undefined project(s): {} in file {} '.format(undefined_refs,
# full_file_path))
# idx += 1
# return
#
# def test_export_projects(self):
#
# projects_list = []
# for acr, path, title, description, pi, start, end, personnel, HasUniLUERP, ERPNotes, HasCNER, CNERNotes, publications in collect_prj_info(
# self.SHEETS_FOLDER):
# prj_data = {}
# prj_data['acronym'] = acr
# prj_data['title'] = title
# prj_data['description'] = description
# if type(start) is datetime.date:
# prj_data['start_date'] = start.strftime('%m/%d/%Y')
# elif type(start) is str:
# prj_data['start_date'] = start.replace('.', '/')
#
# if type(end) is datetime.date:
# prj_data['end_date'] = end.strftime('%m/%d/%Y')
# elif type(end) is str:
# prj_data['end_date'] = end.replace('.', '/')
# contacts_list = []
# delimeter = ','
# if ';' in pi:
# delimeter = ';'
# if pi:
# for pp in pi.split(delimeter):
# pp_data = {}
# name_list = get_names_from_string(pp)
# pp_data['first_name'] = name_list[0]
# pp_data['last_name'] = name_list[1]
# pp_data['role'] = 'Principal_Investigator'
# pp_data['institution'] = 'Luxembourg Center for Systems Biomedicine (LCSB)'
# contacts_list.append(pp_data)
# delimeter = ','
# if ';' in personnel:
# delimeter = ';'
# if personnel:
# for prs in personnel.split(delimeter):
# prs_data = {}
# name_list = get_names_from_string(prs)
# prs_data['first_name'] = name_list[0]
# prs_data['last_name'] = name_list[1]
# prs_data['role'] = 'Researcher'
# prs_data['institution'] = 'Luxembourg Center for Systems Biomedicine (LCSB)'
# contacts_list.append(prs_data)
# prj_data['contacts'] = contacts_list
# if HasUniLUERP:
# prj_data[
# 'has_institutional_ethics_approval'] = True if HasUniLUERP == 'Yes' else False
# else:
# prj_data['has_institutional_ethics_approval'] = False
#
# if ERPNotes:
# prj_data['institutional_ethics_approval_notes'] = ERPNotes
#
# if HasCNER:
# prj_data['has_national_ethics_approval'] = True if HasUniLUERP == 'Yes' else False
# else:
# pp_data['has_national_ethics_approval'] = False
# if CNERNotes:
# prj_data['national_ethics_approval_notes'] = CNERNotes
#
# if publications:
# publication_list = []
# for pub in publications.split('#'):
# pub_data = {}
# pub_data['citation_string'] = pub
# publication_list.append(pub_data)
# prj_data['publications'] = publication_list
# projects_list.append(prj_data)
# with open('projects.json', 'w') as outfile:
# json.dump(projects_list, outfile, indent=4)
# # print(json.dumps(projects_list, indent=4))
# ... (remainder of the removed commented-out block: an earlier draft of
#      test_export_from_collaborator, superseded by the version in
#      tests/importxls/test_from_collab.py above)
     def test_export_own_cohort(self):
         h = hashlib.md5()
-        for dirName, subdirList, fileList in os.walk(self.SHEETS_FOLDER):
+        for dirName, subdirList, fileList in os.walk(SHEETS_FOLDER):
             for fname in fileList:
                 if fname.startswith('own-cohort'):
                     full_file_path = os.path.join(dirName, fname)
...
@@ -309,8 +61,8 @@ class TestProjectsParser(TestCase):
                     if dataset_data.get('has_special_subjects') == True and sheet[13, 2]:
                         dataset_data['special_subject_notes'] = sheet[13, 2]
                     if sheet[19, 2]:
                         dataset_data['consent_status'] = sheet[19, 2]
                     dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 22)
...
@@ -337,8 +89,12 @@ class TestProjectsParser(TestCase):
                     use_restrictions.append({'ga4gh_code': 'GS-[XX]',
                                              'note': 'Data is consented for sharing outside EU'})
                     dataset_data['use_restrictions'] = use_restrictions
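As a quick reference, the ga4gh_code values attached throughout these extractors come from the GA4GH consent-code vocabulary; the glosses below paraphrase the note strings in the tests above, not the GA4GH specification itself:

# Consent codes as used in these tests; glosses paraphrased from the notes above.
GA4GH_CODES = {
    'PS': 'use restricted to named projects',
    'RS-[XX]': 'use restricted to named research areas',
    'TS-[XX]': 'data obtained for a limited duration',
    'PUB': 'acknowledgement required',
    'GS-[XX]': 'consented for sharing outside the EU',
}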