Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Fractalis
fractalis
Commits
7bf7cc34
Commit
7bf7cc34
authored
Aug 18, 2017
by
Sascha Herzinger
Browse files
Added sorting and cutting to heatmap algo
parent
0a6d4fe7
Changes
3
Hide whitespace changes
Inline
Side-by-side
fractalis/analytics/tasks/heatmap/main.py
View file @
7bf7cc34
...
...
@@ -5,7 +5,6 @@ from functools import reduce
import
logging
import
pandas
as
pd
from
scipy.stats
import
zscore
from
fractalis.analytics.task
import
AnalyticTask
from
fractalis.analytics.tasks.heatmap.stats
import
StatisticTask
...
...
@@ -28,6 +27,7 @@ class HeatmapTask(AnalyticTask):
categoricals
:
List
[
pd
.
DataFrame
],
ranking_method
:
str
,
id_filter
:
List
[
T
],
max_rows
:
int
,
subsets
:
List
[
List
[
T
]])
->
dict
:
# merge input data into single df
df
=
reduce
(
lambda
a
,
b
:
a
.
append
(
b
),
numerical_arrays
)
...
...
@@ -49,18 +49,13 @@ class HeatmapTask(AnalyticTask):
"the subset sample ids do not match the data."
logger
.
error
(
error
)
raise
ValueError
(
error
)
for
subset
in
subsets
:
if
not
subset
:
error
=
"One or more of the specified subsets does not "
\
"match any sample id for the given array data."
logger
.
error
(
error
)
raise
ValueError
(
error
)
# make matrix of input data
_df
=
df
.
pivot
(
index
=
'feature'
,
columns
=
'id'
,
values
=
'value'
)
# create z-score matrix used for visualising the heatmap
z_df
=
_df
.
apply
(
zscore
,
axis
=
1
)
z_df
=
_df
.
apply
(
lambda
row
:
(
row
-
row
.
mean
())
/
row
.
std
(
ddof
=
0
),
axis
=
1
)
# compute statistic for ranking
stats
=
self
.
stat_task
.
main
(
df
=
_df
,
subsets
=
subsets
,
...
...
@@ -73,6 +68,14 @@ class HeatmapTask(AnalyticTask):
df
=
df
.
merge
(
z_df
,
on
=
[
'id'
,
'feature'
])
df
.
columns
=
[
'id'
,
'feature'
,
'value'
,
'zscore'
]
# sort by ranking_value
df
[
'sort_value'
]
=
df
[
'feature'
].
apply
(
lambda
x
:
stats
[
stats
[
'feature'
]
==
x
][
ranking_method
][
0
])
df
=
df
.
sort_values
(
'sort_value'
,
ascending
=
False
).
drop
(
'sort_value'
,
1
)
# discard rows according to max_rows
df
=
df
[
df
[
'feature'
].
isin
(
df
[
'feature'
].
unique
()[:
max_rows
])]
return
{
'data'
:
df
.
to_json
(
orient
=
'records'
),
'stats'
:
stats
.
to_json
(
orient
=
'records'
)
...
...
fractalis/analytics/tasks/heatmap/stats.py
View file @
7bf7cc34
...
...
@@ -70,7 +70,7 @@ class StatisticTask(AnalyticTask):
# prepare the df in case an id exists in more than one subset
if
len
(
subsets
)
<
2
:
error
=
"Limma analysis requires at least "
\
"two groups for comparison."
"two
non-empty
groups for comparison."
logger
.
error
(
error
)
raise
ValueError
(
error
)
if
df
.
shape
[
0
]
<
1
or
df
.
shape
[
1
]
<
2
:
...
...
tests/heatmap/test_main.py
View file @
7bf7cc34
"""This module provides tests for the heatmap analysis main module."""
import
json
import
pytest
import
pandas
as
pd
import
numpy
as
np
from
fractalis.analytics.tasks.heatmap.main
import
HeatmapTask
...
...
@@ -11,7 +14,7 @@ class TestHeatmap:
task
=
HeatmapTask
()
def
test_functional
_1
(
self
):
def
test_functional
(
self
):
numerical_arrays
=
[
pd
.
DataFrame
([[
101
,
'foo'
,
5
],
[
101
,
'bar'
,
6
],
[
102
,
'foo'
,
10
],
[
102
,
'bar'
,
11
],
[
103
,
'foo'
,
15
],
[
103
,
'bar'
,
16
],
...
...
@@ -24,43 +27,47 @@ class TestHeatmap:
categoricals
=
[],
ranking_method
=
'B'
,
id_filter
=
[],
max_rows
=
100
,
subsets
=
subsets
)
assert
'data'
in
result
assert
'stats'
in
result
def
test_
main_raises_if_invalid_data
(
self
):
def
test_
functional_with_nans_and_missing
(
self
):
numerical_arrays
=
[
pd
.
DataFrame
([[
101
,
'foo'
,
5
],
[
101
,
'bar'
,
6
],
[
102
,
'foo'
,
10
],
[
102
,
'bar'
,
11
],
[
103
,
'foo'
,
15
],
[
103
,
'bar'
,
16
],
pd
.
DataFrame
([[
101
,
'foo'
,
5
],
[
101
,
'bar'
,
6
],
[
102
,
'foo'
,
10
],
[
103
,
'foo'
,
float
(
'nan'
)],
[
103
,
'bar'
,
16
],
[
104
,
'foo'
,
20
],
[
104
,
'bar'
,
21
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
]
subsets
=
[[
1
,
2
,
3
,
4
]]
# does not match sample colnames
with
pytest
.
raises
(
ValueError
)
as
e
:
self
.
task
.
main
(
numerical_arrays
=
numerical_arrays
,
numericals
=
[],
categoricals
=
[],
ranking_method
=
'mean'
,
id_filter
=
[],
subsets
=
subsets
)
assert
'subset sample ids do not match the data'
in
e
subsets
=
[[
101
,
102
],
[
103
,
104
]]
result
=
self
.
task
.
main
(
numerical_arrays
=
numerical_arrays
,
numericals
=
[],
categoricals
=
[],
ranking_method
=
'B'
,
id_filter
=
[],
max_rows
=
100
,
subsets
=
subsets
)
stats
=
json
.
loads
(
result
[
'stats'
])
assert
stats
[
0
]
!=
stats
[
1
]
def
test_main_raises_if_invalid_
subsets
(
self
):
def
test_main_raises_if_invalid_
data
(
self
):
numerical_arrays
=
[
pd
.
DataFrame
([[
101
,
'foo'
,
5
],
[
101
,
'bar'
,
6
],
[
102
,
'foo'
,
10
],
[
102
,
'bar'
,
11
],
[
103
,
'foo'
,
15
],
[
103
,
'bar'
,
16
],
[
104
,
'foo'
,
20
],
[
104
,
'bar'
,
21
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
]
subsets
=
[[
1
01
,
102
,
103
],
[
123
]]
subsets
=
[[
1
,
2
,
3
,
4
]]
# does not match sample colnames
with
pytest
.
raises
(
ValueError
)
as
e
:
self
.
task
.
main
(
numerical_arrays
=
numerical_arrays
,
numericals
=
[],
categoricals
=
[],
ranking_method
=
'mean'
,
id_filter
=
[],
max_rows
=
100
,
subsets
=
subsets
)
assert
'
specified subsets does not match
'
in
e
assert
'
data set is too small
'
in
e
def
test_empty_subset_equals_full_subset
(
self
):
numerical_arrays
=
[
...
...
@@ -74,6 +81,7 @@ class TestHeatmap:
categoricals
=
[],
ranking_method
=
'mean'
,
id_filter
=
[],
max_rows
=
100
,
subsets
=
[])
result_2
=
self
.
task
.
main
(
numerical_arrays
=
numerical_arrays
,
...
...
@@ -81,5 +89,95 @@ class TestHeatmap:
categoricals
=
[],
ranking_method
=
'mean'
,
id_filter
=
[],
max_rows
=
100
,
subsets
=
[[
101
,
102
,
103
,
104
]])
assert
result_1
==
result_2
def
test_multiple_numerical_array_data
(
self
):
numerical_arrays
=
[
pd
.
DataFrame
([[
101
,
'foo'
,
5
],
[
101
,
'bar'
,
6
],
[
102
,
'foo'
,
10
],
[
102
,
'bar'
,
11
],
[
103
,
'foo'
,
15
],
[
103
,
'bar'
,
16
],
[
104
,
'foo'
,
20
],
[
104
,
'bar'
,
21
]],
columns
=
[
'id'
,
'feature'
,
'value'
]),
pd
.
DataFrame
([[
101
,
'baz'
,
10
],
[
102
,
'baz'
,
11
],
[
105
,
'foo'
,
20
],
[
105
,
'baz'
,
21
],
[
106
,
'bar'
,
15
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
]
subsets
=
[[
101
,
102
,
106
],
[
103
,
104
,
105
]]
result
=
self
.
task
.
main
(
numerical_arrays
=
numerical_arrays
,
numericals
=
[],
categoricals
=
[],
ranking_method
=
'B'
,
id_filter
=
[],
max_rows
=
100
,
subsets
=
subsets
)
assert
'data'
in
result
assert
'stats'
in
result
def
test_zscore_is_not_nan_if_data_misses_values
(
self
):
numerical_arrays
=
[
pd
.
DataFrame
([[
101
,
'foo'
,
5
],
[
101
,
'bar'
,
6
],
[
102
,
'foo'
,
10
],
[
102
,
'bar'
,
11
],
[
103
,
'foo'
,
15
],
[
103
,
'bar'
,
16
],
[
104
,
'foo'
,
20
],
[
104
,
'bar'
,
21
]],
columns
=
[
'id'
,
'feature'
,
'value'
]),
pd
.
DataFrame
([[
101
,
'baz'
,
10
],
[
102
,
'baz'
,
11
],
[
105
,
'foo'
,
20
],
[
105
,
'baz'
,
21
],
[
106
,
'bar'
,
15
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
]
subsets
=
[[
101
,
102
,
106
],
[
103
,
104
,
105
]]
result
=
self
.
task
.
main
(
numerical_arrays
=
numerical_arrays
,
numericals
=
[],
categoricals
=
[],
ranking_method
=
'B'
,
id_filter
=
[],
max_rows
=
100
,
subsets
=
subsets
)
data
=
json
.
loads
(
result
[
'data'
])
data
=
pd
.
DataFrame
(
data
)
assert
not
np
.
isnan
(
np
.
min
(
data
[
'zscore'
]))
def
test_results_are_sorted
(
self
):
numerical_arrays
=
[
pd
.
DataFrame
([[
101
,
'A'
,
5
],
[
102
,
'A'
,
5
],
[
101
,
'B'
,
2
],
[
102
,
'B'
,
2
],
[
101
,
'C'
,
8
],
[
102
,
'C'
,
8
],
[
101
,
'D'
,
10
],
[
102
,
'D'
,
10
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
]
subsets
=
[]
result
=
self
.
task
.
main
(
numerical_arrays
=
numerical_arrays
,
numericals
=
[],
categoricals
=
[],
ranking_method
=
'mean'
,
id_filter
=
[],
max_rows
=
100
,
subsets
=
subsets
)
data
=
json
.
loads
(
result
[
'data'
])
data
=
pd
.
DataFrame
(
data
)
feature_col
=
data
[
'feature'
].
tolist
()
assert
[
'D'
,
'D'
,
'C'
,
'C'
,
'A'
,
'A'
,
'B'
,
'B'
]
==
feature_col
def
test_max_rows_works
(
self
):
numerical_arrays
=
[
pd
.
DataFrame
([[
101
,
'A'
,
5
],
[
102
,
'A'
,
5
],
[
101
,
'B'
,
2
],
[
102
,
'B'
,
2
],
[
101
,
'C'
,
8
],
[
102
,
'C'
,
8
],
[
101
,
'D'
,
10
],
[
102
,
'D'
,
10
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
]
subsets
=
[]
result
=
self
.
task
.
main
(
numerical_arrays
=
numerical_arrays
,
numericals
=
[],
categoricals
=
[],
ranking_method
=
'mean'
,
id_filter
=
[],
max_rows
=
2
,
subsets
=
subsets
)
data
=
json
.
loads
(
result
[
'data'
])
data
=
pd
.
DataFrame
(
data
)
feature_col
=
data
[
'feature'
].
tolist
()
assert
[
'D'
,
'D'
,
'C'
,
'C'
]
==
feature_col
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment