elixir / metadata-tools · Commits

Commit 13742219
Authored Oct 23, 2018 by Pinar Alper

XLS to JSON exporters adapted to the new DAISY model

Parent: 95d87fe4
Changes: 14
metadata_tools/importxls/dataset_exporter.py
@@ -103,20 +103,25 @@ class DatasetExporter:
         if self.is_application(path):
             result['storage_resource'] = 'application'
-        elif resource in self.predefined_data_types:
+        elif resource in self.predefined_storage_types:
             result['storage_resource'] = resource
         else:
             result['storage_resource'] = 'Other'
-        result['location'] = {'location': path}
+        path_lines = []
+        path_lines.extend(get_lines_from_string(path))
+        result['locations'] = path_lines
         result['category'] = category
         return result

     def get_samples_storage(self, sample_location):
         return [{'storage_resource': 'sample-storage',
                  'locations': [sample_location],
                  'category': 'master'}]

     def is_application(self, path):
         if ("transmart" in path.lower()) or ("redcap" in path.lower()):
             return True
         else:
             return False
@@ -137,15 +142,14 @@ class DatasetExporter:
                                     'share_notes': infos[1].strip()})
         return share_list

-    def add_storage_locations(self, storage_dict, locations_list, category):
+    def build_storage_locations(self, locations_list, category):
+        result = []
         if len(locations_list) % 2 != 0 and len(locations_list) > 0:
             if len(locations_list) == 1:
                 if self.is_storage_resource(locations_list[0]):
-                    storage_dict.append(self.get_storage_location(locations_list[0], '<missing_info>', category))
+                    result.append(self.get_storage_location(locations_list[0], '<missing_info>', category))
                 else:
-                    for line in get_lines_from_string(locations_list[0]):
-                        storage_dict.append(self.get_storage_location('Other', line, category))
+                    result.append(self.get_storage_location('Other', locations_list[0], category))
             else:
                 raise ValueError('Uneven Master Data Location Row')
         elif len(locations_list) % 2 == 0 and len(locations_list) > 0:
@@ -153,11 +157,9 @@ class DatasetExporter:
             e = len(locations_list) // 2
             while s < e:
                 if self.is_storage_resource(locations_list[s * 2]):
-                    for line in get_lines_from_string(locations_list[s * 2 + 1]):
-                        storage_dict.append(self.get_storage_location(locations_list[s * 2], line, category))
+                    result.append(self.get_storage_location(locations_list[s * 2], locations_list[s * 2 + 1], category))
                 else:
-                    for line in get_lines_from_string(locations_list[s * 2]):
-                        storage_dict.append(self.get_storage_location('Other', line, category))
+                    result.append(self.get_storage_location('Other', [locations_list[s * 2]], category))
                 s += 1
+        return result
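
A minimal usage sketch of the reworked helper (the exporter construction and the row values below are assumptions for illustration, not taken from this commit): build_storage_locations now returns the location dicts for the caller to collect, instead of appending into a list passed in by the caller.

    exporter = DatasetExporter()  # constructor arguments, if any, are not shown in this diff
    storage_locations = []
    # an even-length row alternates resource name and path, e.g. ['atlas', '/work/projects/xyz/data']
    storage_locations.extend(exporter.build_storage_locations(['atlas', '/work/projects/xyz/data'], 'master'))
    # odd-length rows with more than one entry raise ValueError('Uneven Master Data Location Row')
    try:
        storage_locations.extend(exporter.build_storage_locations(['atlas', '/backup/xyz', 'stray'], 'backup'))
    except ValueError:
        pass  # the exporters below catch this and print an 'Invalid/Uneven ... Data Location Row' message
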
metadata_tools/importxls/export_utils.py
-import datetime
+from datetime import datetime
+import datetime as dt


 def process_yes_no_answer(answer):
     """
     convert yes/no answers to boolean we take empty answers as no
@@ -7,7 +7,7 @@ def process_yes_no_answer(answer):
     """
     result = False
     if answer:
-        if answer == 'Yes':
+        if answer.lower() == 'yes':
             result = True
     return result
@@ -19,9 +19,9 @@ def process_yes_no_dontknow_answer(answer):
     :param xls_data_type_list:
     """
     if answer:
-        if answer == 'Yes':
+        if answer.lower() == 'yes':
             return True
-        elif answer == 'No':
+        elif answer.lower() == 'no':
             return False
     else:
         return None
@@ -45,10 +45,16 @@ def get_value_list_from_row(sheet, row_idx):

 def process_possible_date(possible_date):
-    if isinstance(possible_date, datetime.date):
-        return possible_date.strftime("%Y/%m/%d")
+    if isinstance(possible_date, dt.date):
+        return possible_date.strftime("%Y-%m-%d")
     elif isinstance(possible_date, int):
         return ""
     else:
-        return str(possible_date).replace('.', '/')
+        try:
+            d = datetime.strptime(possible_date.replace('/', '.'), '%d.%m.%Y')
+            return d.strftime("%Y-%m-%d")
+        except ValueError as e:
+            return ""
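
Purely as an illustration of the new date handling (input values invented; assumes the function is importable from the module as shown):

    import datetime as dt
    from metadata_tools.importxls.export_utils import process_possible_date

    print(process_possible_date(dt.date(2018, 10, 23)))  # date cells                  -> '2018-10-23'
    print(process_possible_date('23/10/2018'))           # text parsed via '%d.%m.%Y'  -> '2018-10-23'
    print(process_possible_date(42))                     # ints, unparseable strings   -> ''
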
metadata_tools/importxls/from_collab_exporter.py
@@ -6,75 +6,62 @@ from metadata_tools.importxls.export_utils import get_value_list_from_row, proce
 class FromCollabXlsExporter(DatasetExporter):

-    def export(self, full_file_path):
-        submission_id = 'IMP_FC_{}'.format(self.get_hash_for_path(full_file_path))
+    def export_datadecs(self, full_file_path):
+        result = []
         idx = 1
-        print('Processing ----> {}'.format(full_file_path))
+        # print('Processing ----> {}'.format(full_file_path))
         book = pyexcel.get_book(file_name=full_file_path)
         while idx < book.number_of_sheets():
             sheet = book.sheet_by_index(idx)
-            dataset_data = {}
-            dataset_data['source_type'] = 'From_Collaborator'
-            dataset_data['submission_id'] = submission_id
-            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
-            dataset_data['title'] = sheet[4, 2]
-            if not dataset_data['title']:
-                print('Missing dataset title ----> {}'.format(full_file_path))
+            datadec_data = {}
+            datadec_data['dataset'] = sheet[1, 2]
+            datadec_data['title'] = sheet[4, 2]
+            if not datadec_data['title']:
+                print('Missing data title ----> {}'.format(full_file_path))
             datatype_info = self.process_data_types(get_value_list_from_row(sheet, 5))
-            dataset_data['data_types'] = datatype_info[0]
-            if datatype_info[1]:
-                dataset_data['data_type_notes'] = datatype_info[1]
-            dataset_data['involves_samples'] = process_yes_no_answer(sheet[6, 2])
+            if datatype_info[1]:
+                datadec_data['data_type_notes'] = datatype_info[1]
+            #if it involces samples add this as a datatype
+            if process_yes_no_answer(sheet[6, 2]):
+                datatype_info[0].append('Samples')
+            datadec_data['data_types'] = datatype_info[0]
-            if sheet[7, 2]:
-                dataset_data['samples_location'] = sheet[7, 2]
             if sheet[8, 2]:
-                dataset_data['de_identification'] = sheet[8, 2]
+                datadec_data['de_identification'] = sheet[8, 2]
             if sheet[9, 2]:
-                dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
+                datadec_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
             if sheet[10, 2]:
-                dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
+                datadec_data['has_special_subjects'] = process_yes_no_dontknow_answer(
                     sheet[10, 2])
-            if dataset_data.get('has_special_subjects'):
-                if dataset_data.get('has_special_subjects') == True and sheet[11, 2]:
-                    dataset_data['special_subject_notes'] = sheet[11, 2]
-            collab_insts = get_value_list_from_row(sheet, 13)
-            collab_pis = get_value_list_from_row(sheet, 14)
+            if datadec_data.get('has_special_subjects'):
+                if datadec_data.get('has_special_subjects') == True and sheet[11, 2]:
+                    datadec_data['special_subject_notes'] = sheet[11, 2]
-            if (len(collab_insts) == len(collab_pis)) and len(collab_insts) > 0:
-                i = 0
-                src_collab_list = []
-                while i < len(collab_insts):
-                    collab_data = {'collab_inst': self.lookup_institution_accession(collab_insts[i]),
-                                   'collab_pi': collab_pis[i],
-                                   'collab_project': sheet[18, 2]}
-                    if process_yes_no_dontknow_answer(sheet[17, 2]) == False:
-                        collab_data['collab_role'] = 'controller'
-                    elif process_yes_no_dontknow_answer(sheet[17, 2]) == True:
-                        collab_data['collab_role'] = 'joint-controller'
-                    src_collab_list.append(collab_data)
-                    i += 1
-                dataset_data['source_collaborations'] = src_collab_list
+            collab_inst = sheet[13, 2]
+            collab_pi = sheet[14, 2]
+            if collab_inst and collab_pi:
+                collab_dict = {}
+                collab_dict['collab_inst'] = self.lookup_institution_accession(collab_inst)
+                collab_dict['collab_pi'] = collab_pi
+                if sheet[18, 2]:
+                    collab_dict['collab_project'] = sheet[18, 2]
+                if process_yes_no_dontknow_answer(sheet[17, 2]) == False:
+                    collab_dict['collab_role'] = 'controller'
+                elif process_yes_no_dontknow_answer(sheet[17, 2]) == True:
+                    collab_dict['collab_role'] = 'joint_controller'
+                datadec_data['source_collaboration'] = collab_dict
+                datadec_data['source_notes'] = 'Data is from collaborator.'
             else:
-                print('Mismatched Collab PI-Institution length {}\n'.format(full_file_path))
-                if len(collab_insts) > 1:
-                    print('Multi source collab ----> {}'.format(full_file_path))
-            if sheet[18, 2]:
-                dataset_data['source_project'] = sheet[18, 2]
+                print('Missing collaborator information{}\n'.format(full_file_path))
             use_restrictions = []
             if process_yes_no_answer(sheet[25, 2]):
@@ -94,41 +81,62 @@ class FromCollabXlsExporter(DatasetExporter):
                 use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                          'note': 'Data is obtained for a limited duration.' + process_possible_date(sheet[42, 2])})
-            dataset_data['use_restrictions'] = use_restrictions
+            datadec_data['use_restrictions'] = use_restrictions
             idx += 1
+            result.append(datadec_data)
+        return result
+
+    def export_datasets(self, full_file_path):
+        result = []
+        idx = 1
+        print('Processing ----> {}'.format(full_file_path))
+        book = pyexcel.get_book(file_name=full_file_path)
+        while idx < book.number_of_sheets():
+            sheet = book.sheet_by_index(idx)
+            dataset_data = {}
+            dataset_data['title'] = sheet[1, 2]
+            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
+            if not dataset_data['title']:
+                print('Missing dataset title ----> {}'.format(full_file_path))
+            dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 33)
+            if sheet[33, 2]:
+                dataset_data['project'] = sheet[33, 2]
+            if process_yes_no_answer(sheet[29, 2]):
+                dataset_data['shares'] = self.process_share_list(get_value_list_from_row(sheet, 30))
             storage_locations = []
             master_locations = get_value_list_from_row(sheet, 35)
             try:
-                self.add_storage_locations(storage_locations, master_locations, 'master')
+                storage_locations.extend(self.build_storage_locations(master_locations, 'master'))
             except ValueError as e:
                 print('Invalid Master Data Location Row {}\n'.format(full_file_path))
+            master_acl_list = get_value_list_from_row(sheet, 36)
+            if len(master_acl_list) > 0:
+                for loc in storage_locations:
+                    loc['storage_acl_info'] = ', '.join(master_acl_list)
             if process_yes_no_answer(sheet[37, 2]):
                 backup_locations = get_value_list_from_row(sheet, 38)
                 try:
-                    self.add_storage_locations(storage_locations, backup_locations, 'backup')
+                    storage_locations.extend(self.build_storage_locations(backup_locations, 'backup'))
                 except ValueError as e:
                     print('Uneven Backup Data Location Row {}\n'.format(full_file_path))
             if process_yes_no_answer(sheet[39, 2]):
                 copy_locations = get_value_list_from_row(sheet, 40)
                 try:
-                    self.add_storage_locations(storage_locations, copy_locations, 'copy')
+                    storage_locations.extend(self.build_storage_locations(copy_locations, 'copy'))
                 except ValueError as e:
                     print('Uneven Copy Data Location Row {}\n'.format(full_file_path))
-            acl_list = get_value_list_from_row(sheet, 36)
-            if len(acl_list) > 0:
-                dataset_data['storage_acl_info'] = ', '.join(acl_list)
             if process_yes_no_answer(sheet[6, 2]):
                 storage_locations.extend(self.get_samples_storage(sheet[7, 2]))
             dataset_data['storage_locations'] = storage_locations
             idx += 1
-            with open('{}_.json'.format(submission_id), 'w') as outfile:
-                json.dump(dataset_data, outfile, indent=4)
+            result.append(dataset_data)
+        return result
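
Taken together, the exporter subclasses now expose this split pair of methods instead of a single export(); a rough sketch of a driver (the construction, path, and surrounding script are assumptions, not part of this commit):

    exporter = FromCollabXlsExporter()         # constructor arguments, if any, are assumed
    path = '/path/to/collab_template.xlsx'     # made-up input file
    datadecs = exporter.export_datadecs(path)  # one data-declaration dict per sheet
    datasets = exporter.export_datasets(path)  # one dataset dict per sheet, incl. storage_locations
    print(len(datadecs), len(datasets))
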
metadata_tools/importxls/from_owncohort_exporter.py
@@ -6,56 +6,51 @@ from metadata_tools.importxls.export_utils import get_value_list_from_row, proce
 class FromOwncohortXlsExporter(DatasetExporter):

-    def export(self, full_file_path):
-        #submission_id = 'IMP_OC_{}'.format(self.get_hash_for_path(full_file_path))
+    def export_datadecs(self, full_file_path):
+        result = []
         book = pyexcel.get_book(file_name=full_file_path)
         idx = 1
-        print('----> {}'.format(full_file_path))
+        # print('----> {}'.format(full_file_path))
         while idx < book.number_of_sheets():
             sheet = book.sheet_by_index(idx)
-            dataset_data = {}
-            #dataset_data['source_type'] = 'Own_Cohort'
-            dataset_data['dataset'] = submission_id
-            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
-            dataset_data['title'] = sheet[4, 2]
-            if not dataset_data['title']:
+            datadec_data = {}
+            datadec_data['dataset'] = sheet[1, 2]
+            datadec_data['title'] = sheet[4, 2]
+            if not datadec_data['title']:
                 print('Missing dataset title ----> {}'.format(full_file_path))
             if sheet[5, 2]:
-                dataset_data['source_project'] = sheet[5, 2]
+                datadec_data['source_study'] = sheet[5, 2]
             datatype_info = self.process_data_types(get_value_list_from_row(sheet, 6))
-            dataset_data['data_types'] = datatype_info[0]
             if datatype_info[1]:
-                dataset_data['data_type_notes'] = datatype_info[1]
+                datadec_data['data_type_notes'] = datatype_info[1]
-            dataset_data['involves_samples'] = process_yes_no_answer(sheet[7, 2])
+            datadec_data['source_notes'] = 'Data is from own cohort.'
+            #if it involces samples add this as a datatype
+            if process_yes_no_answer(sheet[7, 2]):
+                datatype_info[0].append('Samples')
+            datadec_data['data_types'] = datatype_info[0]
-            if sheet[7, 2]:
-                dataset_data['samples_location'] = sheet[8, 2]
             if sheet[9, 2]:
-                dataset_data['de_identification'] = sheet[9, 2]
+                datadec_data['de_identification'] = sheet[9, 2]
             if sheet[10, 2]:
-                dataset_data['ombudsman'] = sheet[10, 2]
+                datadec_data['ombudsman'] = sheet[10, 2]
             if sheet[11, 2]:
-                dataset_data['subject_categories'] = sheet[11, 2].replace(' & ', '_and_')
+                datadec_data['subject_categories'] = sheet[11, 2].replace(' & ', '_and_')
             if sheet[12, 2]:
-                dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
+                datadec_data['has_special_subjects'] = process_yes_no_dontknow_answer(
                     sheet[12, 2])
-            if dataset_data.get('has_special_subjects'):
-                if dataset_data.get('has_special_subjects') == True and sheet[13, 2]:
-                    dataset_data['special_subject_notes'] = sheet[13, 2]
+            if datadec_data.get('has_special_subjects'):
+                if datadec_data.get('has_special_subjects') == True and sheet[13, 2]:
+                    datadec_data['special_subject_notes'] = sheet[13, 2]
             if sheet[19, 2]:
-                dataset_data['consent_status'] = sheet[19, 2]
-            dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 22)
+                datadec_data['consent_status'] = sheet[19, 2].lower()
             use_restrictions = []
             if process_yes_no_answer(sheet[21, 2]):
@@ -84,11 +79,28 @@ class FromOwncohortXlsExporter(DatasetExporter):
             if has_time_limis and sheet[43, 2]:
                 use_restrictions.append({'ga4gh_code': 'TS-[XX]',
                                          'note': 'Data is obtained for a limited duration.' + process_possible_date(sheet[43, 2])})
+            datadec_data['use_restrictions'] = use_restrictions
             idx += 1
+            result.append(datadec_data)
-            dataset_data['use_restrictions'] = use_restrictions
+        return result
-            share_list = []
+
+    def export_datasets(self, full_file_path):
+        result = []
+        book = pyexcel.get_book(file_name=full_file_path)
+        idx = 1
+        print('----> {}'.format(full_file_path))
+        while idx < book.number_of_sheets():
+            sheet = book.sheet_by_index(idx)
+            dataset_data = {}
+            dataset_data['title'] = sheet[1, 2]
+            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 3)
+            if not dataset_data['title']:
+                print('Missing dataset title ----> {}'.format(full_file_path))
+            if sheet[22, 2]:
+                dataset_data['project'] = sheet[22, 2]
+            share_list = []
             if process_yes_no_answer(sheet[27, 2]):
                 share_list += self.process_share_list(get_value_list_from_row(sheet, 28))
@@ -101,33 +113,38 @@ class FromOwncohortXlsExporter(DatasetExporter):
             dataset_data['shares'] = share_list
             storage_locations = []
             master_locations = get_value_list_from_row(sheet, 36)
             try:
-                self.add_storage_locations(storage_locations, master_locations, 'master')
+                storage_locations.extend(self.build_storage_locations(master_locations, 'master'))
             except ValueError as e:
                 print('Invalid Master Data Location Row {}\n'.format(full_file_path))
+            master_acl_list = get_value_list_from_row(sheet, 37)
+            if len(master_acl_list) > 0:
+                for loc in storage_locations:
+                    loc['storage_acl_info'] = ', '.join(master_acl_list)
             if process_yes_no_answer(sheet[38, 2]):
                 backup_locations = get_value_list_from_row(sheet, 39)
                 try:
-                    self.add_storage_locations(storage_locations, backup_locations, 'backup')
+                    storage_locations.extend(self.build_storage_locations(backup_locations, 'backup'))
                 except ValueError as e:
                     print('Uneven Backup Data Location Row {}\n'.format(full_file_path))
             if process_yes_no_answer(sheet[40, 2]):
                 copy_locations = get_value_list_from_row(sheet, 41)
                 try:
-                    self.add_storage_locations(storage_locations, copy_locations, 'copy')
+                    storage_locations.extend(self.build_storage_locations(copy_locations, 'copy'))
                 except ValueError as e:
                     print('Uneven Copy Data Location Row {}\n'.format(full_file_path))
-            acl_list = get_value_list_from_row(sheet, 37)
-            if len(acl_list) > 0:
-                dataset_data['storage_acl_info'] = ', '.join(acl_list)
             if process_yes_no_answer(sheet[7, 2]):
                 storage_locations.extend(self.get_samples_storage(sheet[8, 2]))
             dataset_data['storage_locations'] = storage_locations
             idx += 1
-            with open('datasets-{}.json'.format(submission_id), 'w') as outfile:
-                json.dump(dataset_data, outfile, indent=4)
+            result.append(dataset_data)
+        return result
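
For orientation only (values invented), the ACL handling in these hunks simply stamps the joined ACL row onto every location dict collected so far:

    master_acl_list = ['alice', 'bob']  # hypothetical contents of the ACL row
    storage_locations = [{'storage_resource': 'atlas', 'locations': ['/work/x'], 'category': 'master'}]
    if len(master_acl_list) > 0:
        for loc in storage_locations:
            loc['storage_acl_info'] = ', '.join(master_acl_list)
    # every dict now carries 'storage_acl_info': 'alice, bob'
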
metadata_tools/importxls/from_repo_exporter.py
@@ -7,53 +7,55 @@ from metadata_tools.importxls.export_utils import get_value_list_from_row, proce
 class FromRepoXlsExporter(DatasetExporter):

-    def export(self, full_file_path):
-        submission_id = 'IMP_FR_{}'.format(self.get_hash_for_path(full_file_path))
+    def export_datadecs(self, full_file_path):
+        result = []
         idx = 1
-        print('Processing ----> {}'.format(full_file_path))
+        # print('Processing ----> {}'.format(full_file_path))
         book = pyexcel.get_book(file_name=full_file_path)
         while idx < book.number_of_sheets():
             sheet = book.sheet_by_index(idx)
-            dataset_data = {}
-            dataset_data['source_type'] = 'From_Repository'
-            dataset_data['submission_id'] = submission_id
-            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 2)
-            dataset_data['source_repository'] = self.lookup_institution_accession(sheet[6, 2].strip())
+            datadec_data = {}
+            datadec_data['dataset'] = sheet[3, 2]
+            if sheet[5, 2]:
+                datadec_data['title'] = sheet[5, 2].strip()
+            if not datadec_data['title']:
+                print('Missing dataset title ----> {}'.format(full_file_path))
-            if sheet[4, 2]:
-                dataset_data['other_external_id'] = sheet[4, 2]
-            if sheet[5, 2]:
-                dataset_data['title'] = sheet[5, 2].strip()
+            collab_dict = {}
+            collab_dict['collab_inst'] = self.lookup_institution_accession(sheet[6, 2].strip())
+            if sheet[19, 2]:
+                collab_dict['collab_project'] = sheet[19, 2].strip()
+            datadec_data['source_collaboration'] = collab_dict
+            datadec_data['source_notes'] = 'Data is obtained from repository.'
+            if sheet[4, 2]:
+                datadec_data['other_external_id'] = sheet[4, 2]
-            if not dataset_data['title']:
-                print('Missing dataset title ----> {}'.format(full_file_path))
             datatype_info = self.process_data_types(get_value_list_from_row(sheet, 7))
-            dataset_data['data_types'] = datatype_info[0]
+            datadec_data['data_types'] = datatype_info[0]
             if datatype_info[1]:
-                dataset_data['data_type_notes'] = datatype_info[1]
+                if datatype_info[1].__contains__('..'):
+                    print('INVALID DATA TYPE NOTES----> {}'.format(full_file_path))
+                datadec_data['data_type_notes'] = datatype_info[1]
             if sheet[8, 2]:
-                dataset_data['de_identification'] = sheet[8, 2]
+                datadec_data['de_identification'] = sheet[8, 2]
             if sheet[9, 2]:
-                dataset_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
+                datadec_data['subject_categories'] = sheet[9, 2].replace(' & ', '_and_')
             if sheet[10, 2]:
-                dataset_data['has_special_subjects'] = process_yes_no_dontknow_answer(
+                datadec_data['has_special_subjects'] = process_yes_no_dontknow_answer(
                     sheet[10, 2])
-            if dataset_data.get('has_special_subjects'):
-                if dataset_data.get('has_special_subjects') == True and sheet[11, 2]:
-                    dataset_data['special_subject_notes'] = sheet[11, 2]
+            if datadec_data.get('has_special_subjects'):
+                if datadec_data.get('has_special_subjects') == True and sheet[11, 2]:
+                    datadec_data['special_subject_notes'] = sheet[11, 2]
             if sheet[14, 2]:
-                dataset_data['access_category'] = sheet[14, 2]
-            dataset_data['used_by_projects'] = get_value_list_from_row(sheet, 19)
+                datadec_data['access_category'] = sheet[14, 2].replace('-', '_')
             use_restrictions = []
             if process_yes_no_answer(sheet[17, 2]):
@@ -69,35 +71,53 @@ class FromRepoXlsExporter(DatasetExporter):
             if process_yes_no_answer(sheet[29, 2]):
                 use_restrictions.append({'ga4gh_code': 'PUB', 'note': 'Acknowledgement required.'})
-            dataset_data['use_restrictions'] = use_restrictions
+            datadec_data['use_restrictions'] = use_restrictions
             idx += 1
+            result.append(datadec_data)
+        return result
+
+    def export_datasets(self, full_file_path):
+        result = []
+        idx = 1
+        print('Processing ----> {}'.format(full_file_path))
+        book = pyexcel.get_book(file_name=full_file_path)
+        while idx < book.number_of_sheets():
+            sheet = book.sheet_by_index(idx)
+            dataset_data = {}
+            dataset_data['title'] = sheet[3, 2]
+            dataset_data['local_custodian'] = get_value_list_from_row(sheet, 2)
+            if sheet[19, 2]:
+                dataset_data['project'] = sheet[19, 2]
             storage_locations = []
             master_locations = get_value_list_from_row(sheet, 21)
             try:
-                self.add_storage_locations(storage_locations, master_locations, 'master')
+                storage_locations.extend(self.build_storage_locations(master_locations, 'master'))
             except ValueError as e:
                 print('Invalid Master Data Location Row {}\n'.format(full_file_path))
+            master_acl_list = get_value_list_from_row(sheet, 22)
+            if len(master_acl_list) > 0:
+                for loc in storage_locations:
+                    loc['storage_acl_info'] = ', '.join(master_acl_list)