Pinar Alper / metadata-tools

Commit fbd3c4e1 authored Oct 05, 2018 by Pinar Alper
Refactored code from tests to classes
parent d24c302a

Changes 19
metadata-tools/__init__.py → metadata_tools/__init__.py (file moved)
metadata_tools/importxls/__init__.py 0 → 100644 (new empty file)
metadata_tools/importxls/dataset_exporter.py 0 → 100644
import pkg_resources
import json
import hashlib
from os import fsencode

from metadata_tools.importxls.export_utils import get_lines_from_string


class DatasetExporter:

    def __init__(self):
        with open(pkg_resources.resource_filename('metadata_tools', 'resources/elu_institutions.json'),
                  encoding='utf-8') as institutions_file:
            institutions = json.loads(institutions_file.read())
            self.institution_dict = {}
            for inst in institutions:
                self.institution_dict[inst.get('institution_name')] = inst.get('elu_accession')
        self.h = hashlib.md5()
        self.predefined_data_types = set(["Omics data",
                                          "Genotype data",
                                          "Whole genome sequencing",
                                          "Exome sequencing",
                                          "Genomics variant array",
                                          "RNASeq",
                                          "Genetic and derived genetic data",
                                          "Transcriptome array",
                                          "Methylation array",
                                          "MicroRNA array",
                                          "Metabolomics",
                                          "Metagenomics",
                                          "Proteomics",
                                          "Other omics data",
                                          "Clinical Imaging",
                                          "Cell Imaging",
                                          "Human subject data",
                                          "Clinical data",
                                          "Lifestyle data",
                                          "Socio Economic Data",
                                          "Environmental Data",
                                          "Other Phenotype data",
                                          "Other"])
        self.predefined_storage_types = set(['hpc_chaos_home',
                                             'hpc_chaos_project',
                                             'hpc_gaia_home',
                                             'hpc_gaia_project',
                                             'hpc_gaia_work',
                                             'hpc_iris_home',
                                             'hpc_iris_project',
                                             'hpc_scratch_personal',
                                             'hpc_scratch_project',
                                             'hpc_isilon',
                                             'atlas_personal',
                                             'atlas_project',
                                             'hpc_backup_chaos',
                                             'hpc_backup_gaia',
                                             'bertha',
                                             'certon_block',
                                             'lcsb_group_server',
                                             'lcsb_desktop',
                                             'lcsb_laptop',
                                             'personal_laptop',
                                             'Owncloud',
                                             'External Storage (e.g. Hard disk, DVD)',
                                             'Other'])

    def get_hash_for_path(self, path):
        self.h.update(fsencode(path))
        return str(int(self.h.hexdigest(), 16))

    def lookup_institution_accession(self, institution_name):
        if institution_name not in self.institution_dict.keys():
            print('Undefined institution --> {}'.format(institution_name))
            return None
        else:
            return self.institution_dict[institution_name]

    def process_data_types(self, xls_data_type_list):
        result = []
        data_type_notes = ''
        for type_name in xls_data_type_list:
            type_name = type_name.strip()
            if type_name:
                if type_name in self.predefined_data_types:
                    result.append(type_name.replace(" ", "_"))
                else:
                    data_type_notes += type_name + '\n'
        return (result, data_type_notes)

    def is_storage_resource(self, resource):
        if resource in self.predefined_storage_types:
            return True
        else:
            print('Unknown storage resource --> {}'.format(resource))
            return False

    def get_storage_location(self, resource, path, category):
        result = {}
        if self.is_application(path):
            result['storage_resource'] = 'application'
        elif resource in self.predefined_storage_types:  # fixed: original checked predefined_data_types here
            result['storage_resource'] = resource
        else:
            result['storage_resource'] = 'Other'
        result['location'] = {'location': path}
        result['category'] = category
        return result

    def is_application(self, path):
        return ("transmart" in path.lower()) or ("redcap" in path.lower())

    def process_share_list(self, shares):
        share_list = []
        for shr in shares:
            if ";" not in shr:
                if self.lookup_institution_accession(shr.strip()):
                    share_list.append({'share_inst': self.lookup_institution_accession(shr.strip())})
                else:
                    share_list.append({'share_notes': shr})
            else:
                infos = shr.split(";")
                share_list.append({'share_inst': self.lookup_institution_accession(infos[0].strip()),
                                   'share_notes': infos[1].strip()})
        return share_list

    def add_storage_locations(self, storage_dict, locations_list, category):
        if len(locations_list) % 2 != 0 and len(locations_list) > 0:
            if len(locations_list) == 1:
                if self.is_storage_resource(locations_list[0]):
                    storage_dict.append(self.get_storage_location(locations_list[0], '<missing_info>', category))
                else:
                    for line in get_lines_from_string(locations_list[0]):
                        storage_dict.append(self.get_storage_location('Other', line, category))
            else:
                raise ValueError('Uneven Master Data Location Row')
        elif len(locations_list) % 2 == 0 and len(locations_list) > 0:
            s = 0
            e = len(locations_list) // 2
            while s < e:
                if self.is_storage_resource(locations_list[s * 2]):
                    for line in get_lines_from_string(locations_list[s * 2 + 1]):
                        storage_dict.append(self.get_storage_location(locations_list[s * 2], line, category))
                else:
                    for line in get_lines_from_string(locations_list[s * 2]):
                        storage_dict.append(self.get_storage_location('Other', line, category))
                s += 1
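
For orientation, a minimal sketch of how these helpers behave. It assumes the packaged resources/elu_institutions.json is available so DatasetExporter can be constructed; the inputs are invented for illustration:

from metadata_tools.importxls.dataset_exporter import DatasetExporter

exporter = DatasetExporter()

# Known types are normalised with underscores; unknown ones accumulate as notes.
types, notes = exporter.process_data_types(['Omics data', 'Survey answers'])
# types == ['Omics_data'], notes == 'Survey answers\n'

# Locations on a predefined resource keep the resource name; paths mentioning
# transmart or redcap are classed as applications.
location = exporter.get_storage_location('atlas_project', '/work/projects/x', 'master')
# {'storage_resource': 'atlas_project', 'location': {'location': '/work/projects/x'}, 'category': 'master'}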
metadata_tools/importxls/export_utils.py 0 → 100644
import datetime


def process_yes_no_answer(answer):
    """
    Convert a yes/no answer to a boolean; empty answers are taken as no.
    :param answer:
    """
    result = False
    if answer:
        if answer == 'Yes':
            result = True
    return result


def process_yes_no_dontknow_answer(answer):
    """
    Convert a yes/no/don't-know answer to a boolean;
    empty and don't-know answers are returned as None.
    :param answer:
    """
    if answer:
        if answer == 'Yes':
            return True
        elif answer == 'No':
            return False
        else:
            return None
    else:
        return None


def is_data_sheet(fname):
    return fname.startswith('from-repository') \
        or fname.startswith('from-collaborator') \
        or fname.startswith('own-cohort')


def get_value_list_from_row(sheet, row_idx):
    result = []
    vals = sheet.row[row_idx]
    data_vals = vals[2:]
    for val in data_vals:
        if val:
            result.append(val)
    return result


def process_possible_date(possible_date):
    if isinstance(possible_date, datetime.date):
        return possible_date.strftime("%Y/%m/%d")
    else:
        return str(possible_date).replace('.', '/')


def get_names_from_string(full_name):
    result = ['', '']
    name = full_name.strip()
    if name.endswith(','):  # the original repeated the same ',' check twice; one test suffices
        name = name[:-1]
    if name is not None:
        if " " in name:
            name_list = name.split(" ")
            len_name = len(name_list)
            result[0] = name_list[0]
            if len_name > 1:
                result[1] = name_list[1]
            if len_name == 3:
                result[1] = result[1] + ' ' + name_list[2]
        else:
            result[0] = name
    return result


def get_lines_from_string(a_string):
    result = []
    stripped = a_string.strip()
    line_list = stripped.splitlines()
    for line in line_list:
        if line:
            result.append(line)
    return result
\ No newline at end of file
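
The helpers above are small enough to check by hand; a few illustrative calls (the inputs are hypothetical):

import datetime
from metadata_tools.importxls.export_utils import (process_yes_no_answer,
    process_yes_no_dontknow_answer, process_possible_date,
    get_names_from_string, get_lines_from_string)

process_yes_no_answer('Yes')                        # True
process_yes_no_answer('')                           # False; empty counts as no
process_yes_no_dontknow_answer("Don't know")        # None
process_possible_date(datetime.date(2018, 10, 5))   # '2018/10/05'
process_possible_date('05.10.2018')                 # '05/10/2018'
get_names_from_string('Jane van Dyck')              # ['Jane', 'van Dyck']
get_lines_from_string('a\n\nb')                     # ['a', 'b']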
metadata_tools/importxls/from_collab_exporter.py 0 → 100644
from .dataset_exporter import DatasetExporter
import pyexcel
import json
from metadata_tools.importxls.export_utils import get_value_list_from_row, process_yes_no_answer, \
    process_yes_no_dontknow_answer, process_possible_date


class FromCollabXlsExporter(DatasetExporter):

    def export(self, full_file_path):
        submission_id = 'IMP_FC_{}'.format(self.get_hash_for_path(full_file_path))
        idx = 1
        print('Processing ----> {}'.format(full_file_path))
        book = pyexcel.get_book(file_name=full_file_path)
        while idx < book.number_of_sheets():
            sheet = book.sheet_by_index(idx)
            dataset_data = {}
            dataset_data['source_type'] = 'From_Collaborator'
            dataset_data['submission_id'] = submission_id
            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
            dataset_data['title'] = sheet[4, 2]
            if not dataset_data['title']:
                print('Missing dataset title ----> {}'.format(full_file_path))
            datatype_info = self.process_data_types(get_value_list_from_row(sheet, 5))
            dataset_data['data_types'] = datatype_info[0]
            if datatype_info[1]:
                dataset_data['data_type_notes'] = datatype_info[1]
            dataset_data['involves_samples'] = process_yes_no_answer(sheet[6, 2])
            if sheet[7, 2]:
                dataset_data['samples_location'] = sheet[7, 2]
            if sheet[8, 2]:
                dataset_data['de_identification'] = sheet[8, 2]
            if sheet[9, 2]:
                dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
            if sheet[10, 2]:
                dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(sheet[10, 2])
            if dataset_data.get('has_special_subjects'):
                if dataset_data.get('has_special_subjects') == True and sheet[11, 2]:
                    dataset_data['special_subject_notes'] = sheet[11, 2]
            collab_insts = get_value_list_from_row(sheet, 13)
            collab_pis = get_value_list_from_row(sheet, 14)
            if (len(collab_insts) == len(collab_pis)) and len(collab_insts) > 0:
                i = 0
                src_collab_list = []
                while i < len(collab_insts):
                    collab_data = {'collab_inst': self.lookup_institution_accession(collab_insts[i]),
                                   'collab_pi': collab_pis[i],
                                   'collab_project': sheet[18, 2]}
                    if process_yes_no_dontknow_answer(sheet[17, 2]) == False:
                        collab_data['collab_role'] = 'controller'
                    elif process_yes_no_dontknow_answer(sheet[17, 2]) == True:
                        collab_data['collab_role'] = 'joint-controller'
                    src_collab_list.append(collab_data)
                    i += 1
                dataset_data['source_collaborations'] = src_collab_list
            else:
                print('Mismatched Collab PI-Institution length {}\n'.format(full_file_path))
            if len(collab_insts) > 1:
                print('Multi source collab ----> {}'.format(full_file_path))
            if sheet[18, 2]:
                dataset_data['source_project'] = sheet[18, 2]
            use_restrictions = []
            if process_yes_no_answer(sheet[25, 2]):
                use_restrictions.append({'ga4gh_code': 'PS',
                                         'note': 'Use is restricted to projects: '
                                                 + ', '.join(get_value_list_from_row(sheet, 26))})
            if process_yes_no_answer(sheet[27, 2]):
                use_restrictions.append({'ga4gh_code': 'RS-[XX]',
                                         'note': 'Use is restricted to research areas: '
                                                 + ', '.join(get_value_list_from_row(sheet, 28))})
            if process_yes_no_answer(sheet[43, 2]):
                use_restrictions.append({'ga4gh_code': 'PUB',
                                         'note': 'Acknowledgement required.'})
            has_time_limits = process_yes_no_dontknow_answer(sheet[41, 2])
            if has_time_limits and sheet[42, 2]:
                use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                         'note': 'Data is obtained for a limited duration. '
                                                 + process_possible_date(sheet[42, 2])})
            dataset_data['use_restrictions'] = use_restrictions
            dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 33)
            if process_yes_no_answer(sheet[29, 2]):
                dataset_data['shares'] = self.process_share_list(get_value_list_from_row(sheet, 30))
            storage_locations = []
            master_locations = get_value_list_from_row(sheet, 35)
            try:
                self.add_storage_locations(storage_locations, master_locations, 'master')
            except ValueError:
                print('Invalid Master Data Location Row {}\n'.format(full_file_path))
            if process_yes_no_answer(sheet[37, 2]):
                backup_locations = get_value_list_from_row(sheet, 38)
                try:
                    self.add_storage_locations(storage_locations, backup_locations, 'backup')
                except ValueError:
                    print('Uneven Backup Data Location Row {}\n'.format(full_file_path))
            if process_yes_no_answer(sheet[39, 2]):
                copy_locations = get_value_list_from_row(sheet, 40)
                try:
                    self.add_storage_locations(storage_locations, copy_locations, 'copy')
                except ValueError:
                    print('Uneven Copy Data Location Row {}\n'.format(full_file_path))
            acl_list = get_value_list_from_row(sheet, 36)
            if len(acl_list) > 0:
                dataset_data['storage_acl_info'] = ', '.join(acl_list)
            dataset_data['storage_locations'] = storage_locations
            idx += 1
            with open('{}_.json'.format(submission_id), 'w') as outfile:
                json.dump(dataset_data, outfile, indent=4)
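
A minimal driver for this class might look as follows; the workbook path is hypothetical, and pyexcel needs the matching format plugin (e.g. pyexcel-xls or pyexcel-xlsx) installed:

from metadata_tools.importxls.from_collab_exporter import FromCollabXlsExporter

exporter = FromCollabXlsExporter()
# Walks every sheet after the first and writes IMP_FC_<hash>_.json
# into the current working directory.
exporter.export('/data/submissions/from-collaborator-example.xlsx')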
metadata_tools/importxls/from_owncohort_exporter.py 0 → 100644
import pyexcel
import json
from .dataset_exporter import DatasetExporter
from metadata_tools.importxls.export_utils import get_value_list_from_row, process_yes_no_answer, \
    process_yes_no_dontknow_answer, process_possible_date


class FromOwncohortXlsExporter(DatasetExporter):

    def export(self, full_file_path):
        submission_id = 'IMP_OC_{}'.format(self.get_hash_for_path(full_file_path))
        book = pyexcel.get_book(file_name=full_file_path)
        idx = 1
        print('----> {}'.format(full_file_path))
        while idx < book.number_of_sheets():
            sheet = book.sheet_by_index(idx)
            dataset_data = {}
            dataset_data['source_type'] = 'Own_Cohort'
            dataset_data['submission_id'] = submission_id
            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
            dataset_data['title'] = sheet[4, 2]
            if not dataset_data['title']:
                print('Missing dataset title ----> {}'.format(full_file_path))
            if sheet[5, 2]:
                dataset_data['source_project'] = sheet[5, 2]
            datatype_info = self.process_data_types(get_value_list_from_row(sheet, 6))
            dataset_data['data_types'] = datatype_info[0]
            if datatype_info[1]:
                dataset_data['data_type_notes'] = datatype_info[1]
            dataset_data['involves_samples'] = process_yes_no_answer(sheet[7, 2])
            if sheet[7, 2]:
                dataset_data['samples_location'] = sheet[8, 2]
            if sheet[9, 2]:
                dataset_data['de_identification'] = sheet[9, 2]
            if sheet[10, 2]:
                dataset_data['ombudsman'] = sheet[10, 2]
            if sheet[11, 2]:
                dataset_data['subject_categories'] = sheet[11, 2].replace(' & ', '_and_')
            if sheet[12, 2]:
                dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(sheet[12, 2])
            if dataset_data.get('has_special_subjects'):
                if dataset_data.get('has_special_subjects') == True and sheet[13, 2]:
                    dataset_data['special_subject_notes'] = sheet[13, 2]
            if sheet[19, 2]:
                dataset_data['consent_status'] = sheet[19, 2]
            dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 22)
            use_restrictions = []
            if process_yes_no_answer(sheet[21, 2]):
                use_restrictions.append({'ga4gh_code': 'PS',
                                         'note': 'Consent form restricts data use to projects '
                                                 + ', '.join(get_value_list_from_row(sheet, 23))})
            if process_yes_no_answer(sheet[24, 2]):
                use_restrictions.append({'ga4gh_code': 'RS-[XX]',
                                         'note': 'Data is consented for research on '
                                                 + ', '.join(get_value_list_from_row(sheet, 25))})
            if process_yes_no_answer(sheet[26, 2]):
                use_restrictions.append({'ga4gh_code': 'GS-[XX]',
                                         'note': 'Data is consented for sharing outside LCSB (Within Luxembourg)'})
            if process_yes_no_answer(sheet[29, 2]):
                use_restrictions.append({'ga4gh_code': 'GS-[XX]',
                                         'note': 'Data is consented for sharing outside Luxembourg (within EU)'})
            if process_yes_no_answer(sheet[32, 2]):
                use_restrictions.append({'ga4gh_code': 'GS-[XX]',
                                         'note': 'Data is consented for sharing outside EU'})
            has_time_limits = process_yes_no_dontknow_answer(sheet[42, 2])
            if has_time_limits and sheet[43, 2]:
                use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                         'note': 'Data is obtained for a limited duration. '
                                                 + process_possible_date(sheet[43, 2])})
            dataset_data['use_restrictions'] = use_restrictions
            share_list = []
            if process_yes_no_answer(sheet[27, 2]):
                share_list += self.process_share_list(get_value_list_from_row(sheet, 28))
            if process_yes_no_answer(sheet[30, 2]):
                share_list += self.process_share_list(get_value_list_from_row(sheet, 31))
            if process_yes_no_answer(sheet[33, 2]):
                share_list += self.process_share_list(get_value_list_from_row(sheet, 34))
            dataset_data['shares'] = share_list
            storage_locations = []
            master_locations = get_value_list_from_row(sheet, 36)
            try:
                self.add_storage_locations(storage_locations, master_locations, 'master')
            except ValueError:
                print('Invalid Master Data Location Row {}\n'.format(full_file_path))
            if process_yes_no_answer(sheet[38, 2]):
                backup_locations = get_value_list_from_row(sheet, 39)
                try:
                    self.add_storage_locations(storage_locations, backup_locations, 'backup')
                except ValueError:
                    print('Uneven Backup Data Location Row {}\n'.format(full_file_path))
            if process_yes_no_answer(sheet[40, 2]):
                copy_locations = get_value_list_from_row(sheet, 41)
                try:
                    self.add_storage_locations(storage_locations, copy_locations, 'copy')
                except ValueError:
                    print('Uneven Copy Data Location Row {}\n'.format(full_file_path))
            acl_list = get_value_list_from_row(sheet, 37)
            if len(acl_list) > 0:
                dataset_data['storage_acl_info'] = ', '.join(acl_list)
            dataset_data['storage_locations'] = storage_locations
            idx += 1
            with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
                json.dump(dataset_data, outfile, indent=4)
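
The three exporters share the file-name convention that export_utils.is_data_sheet recognises, so a driver can route workbooks by prefix. A sketch, assuming a hypothetical /data/submissions directory (this dispatcher is not part of the commit):

import os

from metadata_tools.importxls.export_utils import is_data_sheet
from metadata_tools.importxls.from_collab_exporter import FromCollabXlsExporter
from metadata_tools.importxls.from_owncohort_exporter import FromOwncohortXlsExporter
from metadata_tools.importxls.from_repo_exporter import FromRepoXlsExporter

# One exporter per file-name prefix, matching the is_data_sheet convention.
exporters = {
    'from-collaborator': FromCollabXlsExporter(),
    'own-cohort': FromOwncohortXlsExporter(),
    'from-repository': FromRepoXlsExporter(),
}

for fname in os.listdir('/data/submissions'):
    if is_data_sheet(fname):
        for prefix, exporter in exporters.items():
            if fname.startswith(prefix):
                exporter.export(os.path.join('/data/submissions', fname))
                break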
metadata_tools/importxls/from_repo_exporter.py 0 → 100644
import pyexcel
import json
from .dataset_exporter import DatasetExporter
from metadata_tools.importxls.export_utils import get_value_list_from_row, process_yes_no_answer, \
    process_yes_no_dontknow_answer, process_possible_date


class FromRepoXlsExporter(DatasetExporter):

    def export(self, full_file_path):
        submission_id = 'IMP_FR_{}'.format(self.get_hash_for_path(full_file_path))
        idx = 1
        print('Processing ----> {}'.format(full_file_path))
        book = pyexcel.get_book(file_name=full_file_path)
        while idx < book.number_of_sheets():
            sheet = book.sheet_by_index(idx)
            dataset_data = {}
            dataset_data['source_type'] = 'From_Repository'
            dataset_data['submission_id'] = submission_id
            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 2)
            dataset_data['source_repository'] = self.lookup_institution_accession(sheet[6, 2].strip())
            if sheet[4, 2]:
                dataset_data['other_external_id'] = sheet[4, 2]
            if sheet[5, 2]:
                dataset_data['title'] = sheet[5, 2].strip()
            if not dataset_data.get('title'):  # .get avoids a KeyError when row 5 is empty
                print('Missing dataset title ----> {}'.format(full_file_path))
            datatype_info = self.process_data_types(get_value_list_from_row(sheet, 7))
            dataset_data['data_types'] = datatype_info[0]
            if datatype_info[1]:
                dataset_data['data_type_notes'] = datatype_info[1]
                if '..' in datatype_info[1]:
                    print('INVALID DATA TYPE NOTES----> {}'.format(full_file_path))
            if sheet[8, 2]:
                dataset_data['de_identification'] = sheet[8, 2]
            if sheet[9, 2]:
                dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
            if sheet[10, 2]:
                dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(sheet[10, 2])
            if dataset_data.get('has_special_subjects'):
                if dataset_data.get('has_special_subjects') == True and sheet[11, 2]:
                    dataset_data['special_subject_notes'] = sheet[11, 2]
            if sheet[14, 2]:
                dataset_data['access_category'] = sheet[14, 2]
            dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 19)
            use_restrictions = []
            if process_yes_no_answer(sheet[17, 2]):
                use_restrictions.append({'ga4gh_code': 'PS',
                                         'note': 'Contract restricts data use to projects '
                                                 + ', '.join(get_value_list_from_row(sheet, 18))})
            has_time_limits = process_yes_no_dontknow_answer(sheet[27, 2])