Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
elixir
metadata-tools
Commits
d24c302a
Commit
d24c302a
authored
Jul 31, 2018
by
Pinar Alper
Browse files
Extended storage resource export
parent
0ba1f1d4
Changes
8
Hide whitespace changes
Inline
Side-by-side
metadata-tools/resources/el
x
-dataset.json
→
metadata-tools/resources/el
u
-dataset.json
View file @
d24c302a
File moved
metadata-tools/resources/el
x
-study.json
→
metadata-tools/resources/el
u
-study.json
View file @
d24c302a
File moved
metadata-tools/resources/elu_institutions.json
View file @
d24c302a
...
...
@@ -239,9 +239,9 @@
},
{
"elu_accession"
:
"ELU_I_44"
,
"institution_name"
:
"London School of Hygiene & Tropical Medicine"
,
"institution_name"
:
"London School of Hygiene & Tropical Medicine
, Medical Research Council Unit The Gambia
"
,
"geo_category"
:
"EU"
,
"acronym"
:
"LSHTM"
"acronym"
:
"LSHTM
MRU The Gambia
"
},
{
"elu_accession"
:
"ELU_I_45"
,
...
...
@@ -495,10 +495,10 @@
"geo_category"
:
"EU"
},
{
"elu_accession"
:
"ELU_I_91"
,
"institution_name"
:
"Helmholtz Zentrum München"
,
"geo_category"
:
"EU"
,
"acronym"
:
"HMGU"
"elu_accession"
:
"ELU_I_91"
,
"institution_name"
:
"Helmholtz Zentrum München"
,
"geo_category"
:
"EU"
,
"acronym"
:
"HMGU"
},
{
"elu_accession"
:
"ELU_I_92"
,
...
...
tests/importxls/test_from_collab.py
View file @
d24c302a
...
...
@@ -6,7 +6,7 @@ from unittest import TestCase
import
pyexcel
from
tests.importxls.test_utils
import
get_value_list_from_row
,
process_data_types
,
process_yes_no_answer
,
\
process_yes_no_dontknow_answer
,
add_storage_locations
,
SHEETS_FOLDER
,
process_possible_date
process_yes_no_dontknow_answer
,
add_storage_locations
,
SHEETS_FOLDER
,
process_possible_date
,
process_share_list
class
TestProjectsParser
(
TestCase
):
...
...
@@ -24,7 +24,7 @@ class TestProjectsParser(TestCase):
submission_id
=
'IMP_FC_{}'
.
format
(
str
(
int
(
h
.
hexdigest
(),
16
)))
book
=
pyexcel
.
get_book
(
file_name
=
full_file_path
)
idx
=
1
#
print('----> {}'.format(full_file_path))
print
(
'
Processing
----> {}'
.
format
(
full_file_path
))
while
idx
<
book
.
number_of_sheets
():
# dataset_count+=1
sheet
=
book
.
sheet_by_index
(
idx
)
...
...
@@ -58,8 +58,6 @@ class TestProjectsParser(TestCase):
if
sheet
[
7
,
2
]:
dataset_data
[
'samples_location'
]
=
sheet
[
7
,
2
]
if
dataset_data
[
'involves_samples'
]
==
False
:
print
(
'----> {}'
.
format
(
'Inconsistent samples information'
+
full_file_path
))
if
sheet
[
8
,
2
]:
dataset_data
[
'de_identification'
]
=
sheet
[
8
,
2
]
...
...
@@ -78,7 +76,7 @@ class TestProjectsParser(TestCase):
collab_insts
=
get_value_list_from_row
(
sheet
,
13
)
collab_pis
=
get_value_list_from_row
(
sheet
,
14
)
if
len
(
collab_insts
)
==
len
(
collab_pis
)
and
len
(
collab_insts
)
>
0
:
if
(
len
(
collab_insts
)
==
len
(
collab_pis
)
)
and
len
(
collab_insts
)
>
0
:
i
=
0
src_collab_list
=
[]
while
i
<
len
(
collab_insts
):
...
...
@@ -99,6 +97,9 @@ class TestProjectsParser(TestCase):
else
:
print
(
'Mismatched Collab PI-Institution length {}
\n
'
.
format
(
full_file_path
))
if
len
(
collab_insts
)
>
1
:
print
(
'Multi source collab ----> {}'
.
format
(
full_file_path
))
if
sheet
[
18
,
2
]:
dataset_data
[
'source_project'
]
=
sheet
[
18
,
2
]
...
...
@@ -125,12 +126,8 @@ class TestProjectsParser(TestCase):
dataset_data
[
'used_by_projects'
]
=
get_value_list_from_row
(
sheet
,
33
)
if
process_yes_no_answer
(
sheet
[
29
,
2
]):
shares
=
get_value_list_from_row
(
sheet
,
30
)
if
len
(
shares
)
>
0
:
share_list
=
[]
for
shr
in
shares
:
share_list
.
append
({
'share_notes'
:
shr
})
dataset_data
[
'shares'
]
=
share_list
dataset_data
[
'shares'
]
=
process_share_list
(
get_value_list_from_row
(
sheet
,
30
))
storage_locations
=
[]
...
...
tests/importxls/test_from_repo.py
View file @
d24c302a
...
...
@@ -26,7 +26,7 @@ class TestProjectsParser(TestCase):
submission_id
=
'IMP_FR_{}'
.
format
(
str
(
int
(
h
.
hexdigest
(),
16
)))
book
=
pyexcel
.
get_book
(
file_name
=
full_file_path
)
idx
=
1
#
print('----> {}'.format(full_file_path))
print
(
'
Processing
----> {}'
.
format
(
full_file_path
))
while
idx
<
book
.
number_of_sheets
():
# dataset_count+=1
sheet
=
book
.
sheet_by_index
(
idx
)
...
...
@@ -45,7 +45,7 @@ class TestProjectsParser(TestCase):
dataset_data
[
'other_external_id'
]
=
sheet
[
4
,
2
]
if
sheet
[
5
,
2
]:
dataset_data
[
'title'
]
=
sheet
[
5
,
2
]
dataset_data
[
'title'
]
=
sheet
[
5
,
2
]
.
strip
()
if
not
dataset_data
[
'title'
]:
print
(
'Missing dataset title ----> {}'
.
format
(
full_file_path
))
...
...
@@ -55,7 +55,8 @@ class TestProjectsParser(TestCase):
dataset_data
[
'data_types'
]
=
datatype_info
[
0
]
if
datatype_info
[
1
]:
dataset_data
[
'data_type_notes'
]
=
datatype_info
[
1
]
if
datatype_info
[
1
].
__contains__
(
'..'
):
print
(
'INVLAID DATA TYPE NOTES----> {}'
.
format
(
full_file_path
))
# for dd in datatype_info[0]:
# if dd in datatype_count.keys():
# datatype_count[dd] +=1
...
...
tests/importxls/test_own_cohort.py
View file @
d24c302a
...
...
@@ -6,7 +6,7 @@ from unittest import TestCase
import
pyexcel
from
tests.importxls.test_utils
import
get_value_list_from_row
,
process_data_types
,
process_yes_no_answer
,
\
process_yes_no_dontknow_answer
,
add_storage_locations
,
SHEETS_FOLDER
,
process_possible_date
process_yes_no_dontknow_answer
,
add_storage_locations
,
SHEETS_FOLDER
,
process_possible_date
,
process_share_list
class
TestProjectsParser
(
TestCase
):
...
...
@@ -14,8 +14,7 @@ class TestProjectsParser(TestCase):
h
=
hashlib
.
md5
()
# count = 0
# custodian_count = {}
# datatype_count={}
for
dirName
,
subdirList
,
fileList
in
os
.
walk
(
SHEETS_FOLDER
):
for
fname
in
fileList
:
...
...
@@ -34,11 +33,7 @@ class TestProjectsParser(TestCase):
dataset_data
[
'source_type'
]
=
'Own_Cohort'
dataset_data
[
'submission_id'
]
=
submission_id
dataset_data
[
'local_custodian'
]
=
get_value_list_from_row
(
sheet
,
3
)
# for cc in dataset_data['local_custodian']:
# if cc in custodian_count.keys():
# custodian_count[cc] +=1
# else:
# custodian_count[cc] =1
dataset_data
[
'title'
]
=
sheet
[
4
,
2
]
if
not
dataset_data
[
'title'
]:
print
(
'Missing dataset title ----> {}'
.
format
(
full_file_path
))
...
...
@@ -50,11 +45,6 @@ class TestProjectsParser(TestCase):
if
datatype_info
[
1
]:
dataset_data
[
'data_type_notes'
]
=
datatype_info
[
1
]
# for dd in datatype_info[0]:
# if dd in datatype_count.keys():
# datatype_count[dd] +=1
# else:
# datatype_count[dd] =1
dataset_data
[
'involves_samples'
]
=
process_yes_no_answer
(
sheet
[
7
,
2
])
...
...
@@ -116,22 +106,13 @@ class TestProjectsParser(TestCase):
share_list
=
[]
if
process_yes_no_answer
(
sheet
[
27
,
2
]):
luxembourg_shares
=
get_value_list_from_row
(
sheet
,
28
)
if
len
(
luxembourg_shares
)
>
0
:
for
shr
in
luxembourg_shares
:
share_list
.
append
({
'share_notes'
:
shr
,
'share_location_type'
:
'National'
})
share_list
+=
process_share_list
(
get_value_list_from_row
(
sheet
,
28
))
if
process_yes_no_answer
(
sheet
[
30
,
2
]):
eu_shares
=
get_value_list_from_row
(
sheet
,
31
)
if
len
(
eu_shares
)
>
0
:
for
shr
in
eu_shares
:
share_list
.
append
({
'share_notes'
:
shr
,
'share_location_type'
:
'EU'
})
share_list
+=
process_share_list
(
get_value_list_from_row
(
sheet
,
31
))
if
process_yes_no_answer
(
sheet
[
33
,
2
]):
noneu_shares
=
get_value_list_from_row
(
sheet
,
34
)
if
len
(
noneu_shares
)
>
0
:
for
shr
in
noneu_shares
:
share_list
.
append
({
'share_notes'
:
shr
,
'share_location_type'
:
'Non-EU'
})
share_list
+=
process_share_list
(
get_value_list_from_row
(
sheet
,
34
))
dataset_data
[
'shares'
]
=
share_list
...
...
tests/importxls/test_projects.py
View file @
d24c302a
...
...
@@ -57,7 +57,6 @@ class TestProjectsParser(TestCase):
def
test_duplicate_dataset_title
(
self
):
titles
=
set
()
for
dirName
,
subdirList
,
fileList
in
os
.
walk
(
SHEETS_FOLDER
):
...
...
tests/importxls/test_utils.py
View file @
d24c302a
...
...
@@ -103,41 +103,63 @@ def process_data_types(xls_data_type_list):
data_type_notes
+=
type_name
+
'
\n
'
return
(
result
,
data_type_notes
)
# Names of the storage resources that is_storage_resource() recognises.
predefined_types = {
    'hpc_chaos_home',
    'hpc_chaos_project',
    'hpc_gaia_home',
    'hpc_gaia_project',
    'hpc_gaia_work',
    'hpc_iris_home',
    'hpc_iris_project',
    'hpc_scratch_personal',
    'hpc_scratch_project',
    'hpc_isilon',
    'atlas_personal',
    'atlas_project',
    'hpc_backup_chaos',
    'hpc_backup_gaia',
    'bertha',
    'certon_block',
    'lcsb_group_server',
    'lcsb_desktop',
    'lcsb_laptop',
    'personal_laptop',
    'Owncloud',
    'External Storage (e.g. Hard disk, DVD)',
    'Other',
}


def is_storage_resource(resource):
    """Return True when *resource* is listed in ``predefined_types``.

    The comparison is an exact, case-sensitive membership test. A value that
    is not listed is reported on stdout and False is returned.
    """
    if resource in predefined_types:
        return True
    # Report the unrecognised value so it can be spotted in the import output.
    print('Unknown Storage resource --> {}'.format(resource))
    return False
def
is_storage_resource
(
location
):
result
=
[]
predefined_types
=
set
([
'hpc_chaos_home'
,
'hpc_chaos_project'
,
'hpc_gaia_home'
,
'hpc_gaia_project'
,
'hpc_gaia_work'
,
'hpc_iris_home'
,
'hpc_iris_project'
,
'hpc_scratch_personal'
,
'hpc_scratch_project'
,
'hpc_isilon'
,
'atlas_personal'
,
'atlas_project'
,
'hpc_backup_chaos'
,
'hpc_backup_gaia'
,
'bertha'
,
'certon_block'
,
'lcsb_group_server'
,
'lcsb_desktop'
,
'lcsb_laptop'
,
'personal_laptop'
,
'Owncloud'
,
'External Storage (e.g. Hard disk, DVD)'
,
'OTHER'
])
def get_storage_location(resource, path, category):
    """Build the storage-location record for one resource/path pair.

    ``storage_resource`` is set to ``'application'`` when
    ``is_application(path)`` is true, to *resource* itself when it is listed
    in ``predefined_types``, and to ``'Other'`` in every other case.

    Returns a dict with the keys ``storage_resource``, ``location`` (a nested
    ``{'location': path}`` dict) and ``category``.
    """
    # The application check runs first, so it overrides a predefined resource.
    if is_application(path):
        storage_resource = 'application'
    elif resource in predefined_types:
        storage_resource = resource
    else:
        storage_resource = 'Other'

    return {
        'storage_resource': storage_resource,
        'location': {'location': path},
        'category': category,
    }
def is_application(path):
    """Tell whether *path* mentions "transmart" or "redcap", ignoring case."""
    lowered = path.lower()
    return any(marker in lowered for marker in ("transmart", "redcap"))
def
process_yes_no_answer
(
answer
):
"""
convert yes/no answers to boolean we take empty answers as no
...
...
@@ -167,18 +189,26 @@ def process_yes_no_dontknow_answer(answer):
return
None
def process_share_list(shares):
    """Convert raw share strings into share dictionaries.

    An entry without a semicolon becomes ``{'share_notes': entry}``. An entry
    containing a semicolon is split at the FIRST semicolon into
    ``{'share_inst': <text before>, 'share_notes': <text after>}``, with both
    parts stripped of surrounding whitespace. Splitting only once keeps notes
    that themselves contain semicolons intact instead of truncating them.

    Returns a list with one dict per entry in *shares*, in the same order.
    """
    share_list = []
    for shr in shares:
        inst, sep, notes = shr.partition(";")
        if not sep:
            # No institution prefix: the whole entry is the note.
            share_list.append({'share_notes': shr})
        else:
            share_list.append({'share_inst': inst.strip(),
                               'share_notes': notes.strip()})
    return share_list
def
add_storage_locations
(
storage_dict
,
locations_list
,
category
):
if
len
(
locations_list
)
%
2
!=
0
and
len
(
locations_list
)
>
0
:
if
len
(
locations_list
)
==
1
:
if
is_storage_resource
(
locations_list
[
0
]):
storage_dict
.
append
(
{
'storage_resource'
:
locations_list
[
0
],
'location'
:
'<missing_info>'
,
'category'
:
category
})
storage_dict
.
append
(
get_storage_location
(
locations_list
[
0
],
'<missing_info>'
,
category
))
else
:
for
line
in
get_lines_from_string
(
locations_list
[
0
]):
storage_dict
.
append
(
{
'storage_resource'
:
'Other'
,
'location'
:
line
,
'category'
:
category
})
storage_dict
.
append
(
get_storage_location
(
'Other'
,
line
,
category
))
else
:
raise
ValueError
(
'Uneven Master Data Location Row'
)
elif
len
(
locations_list
)
%
2
==
0
and
len
(
locations_list
)
>
0
:
...
...
@@ -187,21 +217,10 @@ def add_storage_locations(storage_dict, locations_list, category):
while
s
<
e
:
if
is_storage_resource
(
locations_list
[
s
*
2
]):
for
line
in
get_lines_from_string
(
locations_list
[
s
*
2
+
1
]):
storage_dict
.
append
(
{
'storage_resource'
:
locations_list
[
s
*
2
],
'location'
:
line
,
'category'
:
category
})
storage_dict
.
append
(
get_storage_location
(
locations_list
[
s
*
2
],
line
,
category
))
else
:
for
line
in
get_lines_from_string
(
locations_list
[
s
*
2
]):
storage_dict
.
append
(
{
'storage_resource'
:
'Other'
,
'location'
:
line
,
'category'
:
category
})
# res = locations_list[s * 2] if locations_list[s * 2] else 'Other'
#
# storage_dict.append({'storage_resource': res,
# 'location': locations_list[s * 2 + 1],
# 'category': category})
storage_dict
.
append
(
get_storage_location
(
'Other'
,
line
,
category
))
s
+=
1
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment