elixir / metadata-tools
Commit fbd3c4e1, authored Oct 05, 2018 by Pinar Alper
Refactored code from tests to classes
Parent: d24c302a
Changes: 19
metadata-tools/__init__.py → metadata_tools/__init__.py (file moved)
metadata_tools/importxls/__init__.py (new file, 100644)
metadata_tools/importxls/dataset_exporter.py (new file, 100644)
import pkg_resources
import json
import hashlib
from os import fsencode

from metadata_tools.importxls.export_utils import get_lines_from_string


class DatasetExporter:

    def __init__(self):
        # Build an institution-name -> ELU accession lookup from the bundled resource file.
        with open(pkg_resources.resource_filename('metadata_tools', 'resources/elu_institutions.json'),
                  encoding='utf-8') as institutions_file:
            institutions = json.loads(institutions_file.read())
        self.institution_dict = {}
        for inst in institutions:
            self.institution_dict[inst.get('institution_name')] = inst.get('elu_accession')
        self.h = hashlib.md5()
        self.predefined_data_types = set([
            "Omics data", "Genotype data", "Whole genome sequencing", "Exome sequencing",
            "Genomics variant array", "RNASeq", "Genetic and derived genetic data",
            "Transcriptome array", "Methylation array", "MicroRNA array", "Metabolomics",
            "Metagenomics", "Proteomics", "Other omics data", "Clinical Imaging",
            "Cell Imaging", "Human subject data", "Clinical data", "Lifestyle data",
            "Socio Economic Data", "Environmental Data", "Other Phenotype data", "Other"])
        self.predefined_storage_types = set([
            'hpc_chaos_home', 'hpc_chaos_project', 'hpc_gaia_home', 'hpc_gaia_project',
            'hpc_gaia_work', 'hpc_iris_home', 'hpc_iris_project', 'hpc_scratch_personal',
            'hpc_scratch_project', 'hpc_isilon', 'atlas_personal', 'atlas_project',
            'hpc_backup_chaos', 'hpc_backup_gaia', 'bertha', 'certon_block',
            'lcsb_group_server', 'lcsb_desktop', 'lcsb_laptop', 'personal_laptop',
            'Owncloud', 'External Storage (e.g. Hard disk, DVD)', 'Other'])

    def get_hash_for_path(self, path):
        # Note: the md5 object is shared, so the digest accumulates over successive calls.
        self.h.update(fsencode(path))
        return str(int(self.h.hexdigest(), 16))

    def lookup_institution_accession(self, institution_name):
        if institution_name not in self.institution_dict.keys():
            print('Undefined institution --> {}'.format(institution_name))
            return None
        else:
            return self.institution_dict[institution_name]

    def process_data_types(self, xls_data_type_list):
        # Split spreadsheet values into known data types and free-text notes.
        result = []
        data_type_notes = ''
        for type_name in xls_data_type_list:
            type_name = type_name.strip()
            if type_name:
                if type_name in self.predefined_data_types:
                    result.append(type_name.replace(" ", "_"))
                else:
                    data_type_notes += type_name + '\n'
        return (result, data_type_notes)

    def is_storage_resource(self, resource):
        if resource in self.predefined_storage_types:
            return True
        else:
            print('Unknown Storage resource --> {}'.format(resource))
            return False

    def get_storage_location(self, resource, path, category):
        result = {}
        if self.is_application(path):
            result['storage_resource'] = 'application'
        elif resource in self.predefined_storage_types:  # storage (not data) types are the valid resources here
            result['storage_resource'] = resource
        else:
            result['storage_resource'] = 'Other'
        result['location'] = {'location': path}
        result['category'] = category
        return result

    def is_application(self, path):
        # Paths under transMART or REDCap deployments count as application storage.
        return ("transmart" in path.lower()) or ("redcap" in path.lower())

    def process_share_list(self, shares):
        # Entries are either 'Institution' or 'Institution;free-text note'.
        share_list = []
        for shr in shares:
            if ";" not in shr:
                if self.lookup_institution_accession(shr.strip()):
                    share_list.append({'share_inst': self.lookup_institution_accession(shr.strip())})
                else:
                    share_list.append({'share_notes': shr})
            else:
                infos = shr.split(";")
                share_list.append({'share_inst': self.lookup_institution_accession(infos[0].strip()),
                                   'share_notes': infos[1].strip()})
        return share_list

    def add_storage_locations(self, storage_dict, locations_list, category):
        # Location rows normally alternate resource/path cells; a single cell is
        # tolerated, any other odd-length row is rejected.
        if len(locations_list) % 2 != 0 and len(locations_list) > 0:
            if len(locations_list) == 1:
                if self.is_storage_resource(locations_list[0]):
                    storage_dict.append(self.get_storage_location(locations_list[0], '<missing_info>', category))
                else:
                    for line in get_lines_from_string(locations_list[0]):
                        storage_dict.append(self.get_storage_location('Other', line, category))
            else:
                raise ValueError('Uneven Master Data Location Row')
        elif len(locations_list) % 2 == 0 and len(locations_list) > 0:
            s = 0
            e = len(locations_list) // 2
            while s < e:
                if self.is_storage_resource(locations_list[s * 2]):
                    for line in get_lines_from_string(locations_list[s * 2 + 1]):
                        storage_dict.append(self.get_storage_location(locations_list[s * 2], line, category))
                else:
                    for line in get_lines_from_string(locations_list[s * 2]):
                        storage_dict.append(self.get_storage_location('Other', line, category))
                s += 1
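The add_storage_locations contract is easiest to see with a concrete call. A minimal sketch, not part of the commit; the resource and path values are illustrative, and it assumes the packaged elu_institutions.json resource is installed so DatasetExporter() can be constructed:

exporter = DatasetExporter()
locations = []

# Even-length rows alternate resource and path cells.
exporter.add_storage_locations(locations, ['atlas_project', '/work/projects/demo'], 'master')

# A single free-text cell is accepted and filed under the 'Other' resource.
exporter.add_storage_locations(locations, ['an unlisted server'], 'backup')

# locations now contains entries such as:
# {'storage_resource': 'atlas_project', 'location': {'location': '/work/projects/demo'}, 'category': 'master'}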
metadata_tools/importxls/export_utils.py (new file, 100644)
import datetime


def process_yes_no_answer(answer):
    """
    Convert a yes/no answer to a boolean; empty answers count as no.
    :param answer: cell value from the spreadsheet
    """
    result = False
    if answer:
        if answer == 'Yes':
            result = True
    return result


def process_yes_no_dontknow_answer(answer):
    """
    Convert a yes/no/don't-know answer to a boolean;
    empty and don't-know answers are returned as None.
    :param answer: cell value from the spreadsheet
    """
    if answer:
        if answer == 'Yes':
            return True
        elif answer == 'No':
            return False
        else:
            return None
    else:
        return None


def is_data_sheet(fname):
    return (fname.startswith('from-repository')
            or fname.startswith('from-collaborator')
            or fname.startswith('own-cohort'))


def get_value_list_from_row(sheet, row_idx):
    # Collect the non-empty cells of a row, skipping the first two (label) columns.
    result = []
    vals = sheet.row[row_idx]
    data_vals = vals[2:]
    for val in data_vals:
        if val:
            result.append(val)
    return result


def process_possible_date(possible_date):
    if isinstance(possible_date, datetime.date):
        return possible_date.strftime("%Y/%m/%d")
    else:
        return str(possible_date).replace('.', '/')


def get_names_from_string(full_name):
    # Split a full name into [first, last]; a third token is folded into the last name.
    result = ['', '']
    name = full_name.strip()
    if name.endswith(','):
        name = name[:-1]
    if name is not None:
        if " " in name:
            name_list = name.split(" ")
            len_name = len(name_list)
            result[0] = name_list[0]
            if len_name > 1:
                result[1] = name_list[1]
            if len_name == 3:
                result[1] = result[1] + ' ' + name_list[2]
        else:
            result[0] = name
    return result


def get_lines_from_string(a_string):
    result = []
    stripped = a_string.strip()
    line_list = stripped.splitlines()
    for line in line_list:
        if line:
            result.append(line)
    return result
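For reference, a few illustrative calls (not from the commit) showing how these helpers behave on typical cell values:

import datetime

process_yes_no_answer('Yes')                        # True
process_yes_no_answer('')                           # False: empty counts as no
process_yes_no_dontknow_answer("Don't know")        # None
process_possible_date(datetime.date(2018, 10, 5))   # '2018/10/05'
process_possible_date('05.10.2018')                 # '05/10/2018'
get_names_from_string('Jane van Doe')               # ['Jane', 'van Doe']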
metadata_tools/importxls/from_collab_exporter.py (new file, 100644)
from .dataset_exporter import DatasetExporter
import pyexcel
import json
from metadata_tools.importxls.export_utils import get_value_list_from_row, process_yes_no_answer, \
    process_yes_no_dontknow_answer, process_possible_date


class FromCollabXlsExporter(DatasetExporter):

    def export(self, full_file_path):
        submission_id = 'IMP_FC_{}'.format(self.get_hash_for_path(full_file_path))
        idx = 1
        print('Processing ----> {}'.format(full_file_path))
        book = pyexcel.get_book(file_name=full_file_path)
        # Sheet 0 is the instructions sheet; each further sheet describes one dataset.
        while idx < book.number_of_sheets():
            sheet = book.sheet_by_index(idx)
            dataset_data = {}
            dataset_data['source_type'] = 'From_Collaborator'
            dataset_data['submission_id'] = submission_id
            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
            dataset_data['title'] = sheet[4, 2]
            if not dataset_data['title']:
                print('Missing dataset title ----> {}'.format(full_file_path))
            datatype_info = self.process_data_types(get_value_list_from_row(sheet, 5))
            dataset_data['data_types'] = datatype_info[0]
            if datatype_info[1]:
                dataset_data['data_type_notes'] = datatype_info[1]
            dataset_data['involves_samples'] = process_yes_no_answer(sheet[6, 2])
            if sheet[7, 2]:
                dataset_data['samples_location'] = sheet[7, 2]
            if sheet[8, 2]:
                dataset_data['de_identification'] = sheet[8, 2]
            if sheet[9, 2]:
                dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
            if sheet[10, 2]:
                dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(sheet[10, 2])
            if dataset_data.get('has_special_subjects') and sheet[11, 2]:
                dataset_data['special_subject_notes'] = sheet[11, 2]
            collab_insts = get_value_list_from_row(sheet, 13)
            collab_pis = get_value_list_from_row(sheet, 14)
            if (len(collab_insts) == len(collab_pis)) and len(collab_insts) > 0:
                i = 0
                src_collab_list = []
                while i < len(collab_insts):
                    collab_data = {'collab_inst': self.lookup_institution_accession(collab_insts[i]),
                                   'collab_pi': collab_pis[i],
                                   'collab_project': sheet[18, 2]}
                    if process_yes_no_dontknow_answer(sheet[17, 2]) is False:
                        collab_data['collab_role'] = 'controller'
                    elif process_yes_no_dontknow_answer(sheet[17, 2]) is True:
                        collab_data['collab_role'] = 'joint-controller'
                    src_collab_list.append(collab_data)
                    i += 1
                dataset_data['source_collaborations'] = src_collab_list
            else:
                print('Mismatched Collab PI-Institution length {}\n'.format(full_file_path))
            if len(collab_insts) > 1:
                print('Multi source collab ----> {}'.format(full_file_path))
            if sheet[18, 2]:
                dataset_data['source_project'] = sheet[18, 2]
            use_restrictions = []
            if process_yes_no_answer(sheet[25, 2]):
                use_restrictions.append({'ga4gh_code': 'PS',
                                         'note': 'Use is restricted to projects: '
                                                 + ', '.join(get_value_list_from_row(sheet, 26))})
            if process_yes_no_answer(sheet[27, 2]):
                use_restrictions.append({'ga4gh_code': 'RS-[XX]',
                                         'note': 'Use is restricted to research areas: '
                                                 + ', '.join(get_value_list_from_row(sheet, 28))})
            if process_yes_no_answer(sheet[43, 2]):
                use_restrictions.append({'ga4gh_code': 'PUB', 'note': 'Acknowledgement required.'})
            has_time_limits = process_yes_no_dontknow_answer(sheet[41, 2])
            if has_time_limits and sheet[42, 2]:
                use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                         'note': 'Data is obtained for a limited duration. '
                                                 + process_possible_date(sheet[42, 2])})
            dataset_data['use_restrictions'] = use_restrictions
            dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 33)
            if process_yes_no_answer(sheet[29, 2]):
                dataset_data['shares'] = self.process_share_list(get_value_list_from_row(sheet, 30))
            storage_locations = []
            master_locations = get_value_list_from_row(sheet, 35)
            try:
                self.add_storage_locations(storage_locations, master_locations, 'master')
            except ValueError:
                print('Invalid Master Data Location Row {}\n'.format(full_file_path))
            if process_yes_no_answer(sheet[37, 2]):
                backup_locations = get_value_list_from_row(sheet, 38)
                try:
                    self.add_storage_locations(storage_locations, backup_locations, 'backup')
                except ValueError:
                    print('Uneven Backup Data Location Row {}\n'.format(full_file_path))
            if process_yes_no_answer(sheet[39, 2]):
                copy_locations = get_value_list_from_row(sheet, 40)
                try:
                    self.add_storage_locations(storage_locations, copy_locations, 'copy')
                except ValueError:
                    print('Uneven Copy Data Location Row {}\n'.format(full_file_path))
            acl_list = get_value_list_from_row(sheet, 36)
            if len(acl_list) > 0:
                dataset_data['storage_acl_info'] = ', '.join(acl_list)
            dataset_data['storage_locations'] = storage_locations
            idx += 1
            with open('{}_.json'.format(submission_id), 'w') as outfile:
                json.dump(dataset_data, outfile, indent=4)
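A hypothetical invocation (the workbook path is illustrative, not from the commit); the exporter visits every sheet after the first and writes one JSON file per workbook:

from metadata_tools.importxls.from_collab_exporter import FromCollabXlsExporter

# Produces 'IMP_FC_<hash>_.json' in the working directory.
FromCollabXlsExporter().export('submissions/from-collaborator-cohort.xlsx')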
metadata_tools/importxls/from_owncohort_exporter.py (new file, 100644)
import pyexcel
import json
from .dataset_exporter import DatasetExporter
from metadata_tools.importxls.export_utils import get_value_list_from_row, process_yes_no_answer, \
    process_yes_no_dontknow_answer, process_possible_date


class FromOwncohortXlsExporter(DatasetExporter):

    def export(self, full_file_path):
        submission_id = 'IMP_OC_{}'.format(self.get_hash_for_path(full_file_path))
        book = pyexcel.get_book(file_name=full_file_path)
        idx = 1
        print('----> {}'.format(full_file_path))
        while idx < book.number_of_sheets():
            sheet = book.sheet_by_index(idx)
            dataset_data = {}
            dataset_data['source_type'] = 'Own_Cohort'
            dataset_data['submission_id'] = submission_id
            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
            dataset_data['title'] = sheet[4, 2]
            if not dataset_data['title']:
                print('Missing dataset title ----> {}'.format(full_file_path))
            if sheet[5, 2]:
                dataset_data['source_project'] = sheet[5, 2]
            datatype_info = self.process_data_types(get_value_list_from_row(sheet, 6))
            dataset_data['data_types'] = datatype_info[0]
            if datatype_info[1]:
                dataset_data['data_type_notes'] = datatype_info[1]
            dataset_data['involves_samples'] = process_yes_no_answer(sheet[7, 2])
            if sheet[7, 2]:
                dataset_data['samples_location'] = sheet[8, 2]
            if sheet[9, 2]:
                dataset_data['de_identification'] = sheet[9, 2]
            if sheet[10, 2]:
                dataset_data['ombudsman'] = sheet[10, 2]
            if sheet[11, 2]:
                dataset_data['subject_categories'] = sheet[11, 2].replace(' & ', '_and_')
            if sheet[12, 2]:
                dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(sheet[12, 2])
            if dataset_data.get('has_special_subjects') and sheet[13, 2]:
                dataset_data['special_subject_notes'] = sheet[13, 2]
            if sheet[19, 2]:
                dataset_data['consent_status'] = sheet[19, 2]
            dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 22)
            use_restrictions = []
            if process_yes_no_answer(sheet[21, 2]):
                use_restrictions.append({'ga4gh_code': 'PS',
                                         'note': 'Consent form restricts data use to projects '
                                                 + ', '.join(get_value_list_from_row(sheet, 23))})
            if process_yes_no_answer(sheet[24, 2]):
                use_restrictions.append({'ga4gh_code': 'RS-[XX]',
                                         'note': 'Data is consented for research on '
                                                 + ', '.join(get_value_list_from_row(sheet, 25))})
            if process_yes_no_answer(sheet[26, 2]):
                use_restrictions.append({'ga4gh_code': 'GS-[XX]',
                                         'note': 'Data is consented for sharing outside LCSB (Within Luxembourg)'})
            if process_yes_no_answer(sheet[29, 2]):
                use_restrictions.append({'ga4gh_code': 'GS-[XX]',
                                         'note': 'Data is consented for sharing outside Luxembourg (within EU)'})
            if process_yes_no_answer(sheet[32, 2]):
                use_restrictions.append({'ga4gh_code': 'GS-[XX]',
                                         'note': 'Data is consented for sharing outside EU'})
            has_time_limits = process_yes_no_dontknow_answer(sheet[42, 2])
            if has_time_limits and sheet[43, 2]:
                use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                         'note': 'Data is obtained for a limited duration. '
                                                 + process_possible_date(sheet[43, 2])})
            dataset_data['use_restrictions'] = use_restrictions
            share_list = []
            if process_yes_no_answer(sheet[27, 2]):
                share_list += self.process_share_list(get_value_list_from_row(sheet, 28))
            if process_yes_no_answer(sheet[30, 2]):
                share_list += self.process_share_list(get_value_list_from_row(sheet, 31))
            if process_yes_no_answer(sheet[33, 2]):
                share_list += self.process_share_list(get_value_list_from_row(sheet, 34))
            dataset_data['shares'] = share_list
            storage_locations = []
            master_locations = get_value_list_from_row(sheet, 36)
            try:
                self.add_storage_locations(storage_locations, master_locations, 'master')
            except ValueError:
                print('Invalid Master Data Location Row {}\n'.format(full_file_path))
            if process_yes_no_answer(sheet[38, 2]):
                backup_locations = get_value_list_from_row(sheet, 39)
                try:
                    self.add_storage_locations(storage_locations, backup_locations, 'backup')
                except ValueError:
                    print('Uneven Backup Data Location Row {}\n'.format(full_file_path))
            if process_yes_no_answer(sheet[40, 2]):
                copy_locations = get_value_list_from_row(sheet, 41)
                try:
                    self.add_storage_locations(storage_locations, copy_locations, 'copy')
                except ValueError:
                    print('Uneven Copy Data Location Row {}\n'.format(full_file_path))
            acl_list = get_value_list_from_row(sheet, 37)
            if len(acl_list) > 0:
                dataset_data['storage_acl_info'] = ', '.join(acl_list)
            dataset_data['storage_locations'] = storage_locations
            idx += 1
            with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
                json.dump(dataset_data, outfile, indent=4)
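The filename prefixes tested by is_data_sheet suggest how the three exporters are meant to be dispatched. A sketch of a possible driver loop; the directory layout and the export_directory helper are assumptions, not part of this commit:

import os

from metadata_tools.importxls.export_utils import is_data_sheet
from metadata_tools.importxls.from_collab_exporter import FromCollabXlsExporter
from metadata_tools.importxls.from_owncohort_exporter import FromOwncohortXlsExporter
from metadata_tools.importxls.from_repo_exporter import FromRepoXlsExporter

# Map the is_data_sheet filename prefixes to the matching exporter class.
EXPORTERS = {
    'from-collaborator': FromCollabXlsExporter,
    'own-cohort': FromOwncohortXlsExporter,
    'from-repository': FromRepoXlsExporter,
}

def export_directory(dir_path):
    for fname in os.listdir(dir_path):
        if is_data_sheet(fname):
            for prefix, exporter_cls in EXPORTERS.items():
                if fname.startswith(prefix):
                    exporter_cls().export(os.path.join(dir_path, fname))
                    break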
metadata_tools/importxls/from_repo_exporter.py (new file, 100644)
import pyexcel
import json
from .dataset_exporter import DatasetExporter
from metadata_tools.importxls.export_utils import get_value_list_from_row, process_yes_no_answer, \
    process_yes_no_dontknow_answer, process_possible_date


class FromRepoXlsExporter(DatasetExporter):

    def export(self, full_file_path):
        submission_id = 'IMP_FR_{}'.format(self.get_hash_for_path(full_file_path))
        idx = 1
        print('Processing ----> {}'.format(full_file_path))
        book = pyexcel.get_book(file_name=full_file_path)
        while idx < book.number_of_sheets():
            sheet = book.sheet_by_index(idx)
            dataset_data = {}
            dataset_data['source_type'] = 'From_Repository'
            dataset_data['submission_id'] = submission_id
            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 2)
            dataset_data['source_repository'] = self.lookup_institution_accession(sheet[6, 2].strip())
            if sheet[4, 2]:
                dataset_data['other_external_id'] = sheet[4, 2]
            if sheet[5, 2]:
                dataset_data['title'] = sheet[5, 2].strip()
            if not dataset_data.get('title'):
                print('Missing dataset title ----> {}'.format(full_file_path))
            datatype_info = self.process_data_types(get_value_list_from_row(sheet, 7))
            dataset_data['data_types'] = datatype_info[0]
            if datatype_info[1]:
                dataset_data['data_type_notes'] = datatype_info[1]
                if '..' in datatype_info[1]:
                    print('INVALID DATA TYPE NOTES----> {}'.format(full_file_path))
            if sheet[8, 2]:
                dataset_data['de_identification'] = sheet[8, 2]
            if sheet[9, 2]:
                dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
            if sheet[10, 2]:
                dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(sheet[10, 2])
            if dataset_data.get('has_special_subjects') and sheet[11, 2]:
                dataset_data['special_subject_notes'] = sheet[11, 2]
            if sheet[14, 2]:
                dataset_data['access_category'] = sheet[14, 2]
            dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 19)
            use_restrictions = []
            if process_yes_no_answer(sheet[17, 2]):
                use_restrictions.append({'ga4gh_code': 'PS',
                                         'note': 'Contract restricts data use to projects '
                                                 + ', '.join(get_value_list_from_row(sheet, 18))})