Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Fractalis
fractalis
Commits
ce61158d
Commit
ce61158d
authored
Jul 31, 2018
by
Sascha Herzinger
Browse files
Added new parameters to histogram analysis
parent
d6219fd3
Pipeline
#5977
failed with stages
in 37 minutes and 38 seconds
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
fractalis/analytics/tasks/histogram/main.py
View file @
ce61158d
...
...
@@ -2,10 +2,12 @@
histogram."""
import
logging
from
functools
import
partial
from
typing
import
List
import
pandas
as
pd
import
numpy
as
np
import
scipy.stats
from
fractalis.analytics.task
import
AnalyticTask
from
fractalis.analytics.tasks.shared
import
utils
...
...
@@ -21,11 +23,15 @@ class HistogramTask(AnalyticTask):
name
=
'compute-histogram'
def
main
(
self
,
bw_factor
:
float
,
num_bins
:
int
,
id_filter
:
List
[
str
],
subsets
:
List
[
List
[
str
]],
data
:
pd
.
DataFrame
,
categories
:
List
[
pd
.
DataFrame
])
->
dict
:
"""Compute several basic statistics such as bin size and kde.
:param bw_factor: KDE resolution.
:param num_bins: Number of bins to use for histogram.
:param id_filter: If specified use only given ids during the analysis.
:param subsets: List of lists of subset ids.
:param data: Numerical values to create histogram of.
...
...
@@ -49,12 +55,23 @@ class HistogramTask(AnalyticTask):
sub_df
=
df
[(
df
[
'category'
]
==
category
)
&
(
df
[
'subset'
]
==
subset
)]
values
=
sub_df
[
'value'
]
hist
,
bin_edges
=
np
.
histogram
(
values
)
if
values
.
shape
[
0
]
<
2
:
continue
hist
,
bin_edges
=
np
.
histogram
(
values
,
bins
=
num_bins
)
hist
=
hist
.
tolist
()
bin_edges
=
bin_edges
.
tolist
()
mean
=
np
.
mean
(
values
)
median
=
np
.
median
(
values
)
std
=
np
.
std
(
values
)
def
bw
(
obj
,
fac
):
return
np
.
power
(
obj
.
n
,
-
1.0
/
(
obj
.
d
+
4
))
*
fac
kde
=
scipy
.
stats
.
gaussian_kde
(
values
,
bw_method
=
partial
(
bw
,
fac
=
bw_factor
))
xs
=
np
.
linspace
(
start
=
np
.
min
(
values
),
stop
=
np
.
max
(
values
),
num
=
200
)
dist
=
kde
(
xs
).
tolist
()
if
not
stats
.
get
(
category
):
stats
[
category
]
=
{}
stats
[
category
][
subset
]
=
{
...
...
@@ -62,7 +79,8 @@ class HistogramTask(AnalyticTask):
'bin_edges'
:
bin_edges
,
'mean'
:
mean
,
'median'
:
median
,
'std'
:
std
'std'
:
std
,
'dist'
:
dist
}
return
{
'stats'
:
stats
,
...
...
tests/unit/analytics/histogram/test_histogram.py
View file @
ce61158d
...
...
@@ -34,6 +34,7 @@ class TestHistogramTask:
[
109
,
'cat'
,
'B'
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
result
=
self
.
task
.
main
(
id_filter
=
[],
bw_factor
=
0.5
,
subsets
=
[],
data
=
df
,
categories
=
[
cat_df
])
...
...
@@ -43,7 +44,7 @@ class TestHistogramTask:
assert
'B'
in
result
[
'stats'
]
assert
0
in
result
[
'stats'
][
'A'
]
assert
all
([
stat
in
result
[
'stats'
][
'A'
][
0
]
for
stat
in
[
'hist'
,
'bin_edges'
,
'mean'
,
'median'
,
'std'
]])
[
'hist'
,
'bin_edges'
,
'mean'
,
'median'
,
'std'
,
'dist'
]])
def
test_can_handle_nas
(
self
):
df
=
pd
.
DataFrame
([[
100
,
'foo'
,
float
(
'nan'
)],
...
...
@@ -58,6 +59,7 @@ class TestHistogramTask:
[
109
,
'foo'
,
10
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
result
=
self
.
task
.
main
(
id_filter
=
[],
bw_factor
=
0.5
,
subsets
=
[],
data
=
df
,
categories
=
[])
...
...
@@ -77,13 +79,14 @@ class TestHistogramTask:
[
109
,
'foo'
,
10
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
result
=
self
.
task
.
main
(
id_filter
=
[],
bw_factor
=
0.5
,
subsets
=
[],
data
=
df
,
categories
=
[])
assert
result
[
'stats'
][
''
][
0
][
'median'
]
==
0
assert
result
[
'stats'
][
''
][
0
][
'mean'
]
==
0
def
test_
can_handle
_small_groups
(
self
):
def
test_
skips
_small_groups
(
self
):
df
=
pd
.
DataFrame
([[
100
,
'foo'
,
1
],
[
101
,
'foo'
,
2
],
[
102
,
'foo'
,
float
(
'nan'
)],
...
...
@@ -107,12 +110,11 @@ class TestHistogramTask:
[
109
,
'cat'
,
'B'
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
result
=
self
.
task
.
main
(
id_filter
=
[],
bw_factor
=
0.5
,
subsets
=
[],
data
=
df
,
categories
=
[
cat_df
])
assert
result
[
'stats'
][
'A'
][
0
][
'median'
]
==
1
assert
result
[
'stats'
][
'A'
][
0
][
'mean'
]
==
1
assert
result
[
'stats'
][
'A'
][
0
][
'std'
]
==
0
assert
'A'
not
in
result
[
'stats'
]
def
test_skips_empty_groups
(
self
):
df
=
pd
.
DataFrame
([[
100
,
'foo'
,
float
(
'nan'
)],
...
...
@@ -138,6 +140,7 @@ class TestHistogramTask:
[
109
,
'cat'
,
'B'
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
result
=
self
.
task
.
main
(
id_filter
=
[],
bw_factor
=
0.5
,
subsets
=
[],
data
=
df
,
categories
=
[
cat_df
])
...
...
@@ -169,6 +172,7 @@ class TestHistogramTask:
columns
=
[
'id'
,
'feature'
,
'value'
])
with
pytest
.
raises
(
ValueError
)
as
e
:
self
.
task
.
main
(
id_filter
=
[],
bw_factor
=
0.5
,
subsets
=
[],
data
=
df
,
categories
=
[
cat_df
])
...
...
@@ -198,6 +202,7 @@ class TestHistogramTask:
[
109
,
'cat'
,
'B'
]],
columns
=
[
'id'
,
'feature'
,
'value'
])
result
=
self
.
task
.
main
(
id_filter
=
[],
bw_factor
=
0.5
,
subsets
=
[],
data
=
df
,
categories
=
[
cat_df
])
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment