Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Fractalis
fractalis
Commits
fa7b4cbc
Commit
fa7b4cbc
authored
Jul 26, 2018
by
Sascha Herzinger
Browse files
adding histogram statistics
parent
17b809d5
Pipeline
#5926
passed with stages
in 37 minutes and 14 seconds
Changes
4
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
fractalis/analytics/tasks/histogram/__init__.py
0 → 100644
View file @
fa7b4cbc
fractalis/analytics/tasks/histogram/main.py
0 → 100644
View file @
fa7b4cbc
"""This module contains several statistics necessary for creating a
histogram."""
import
logging
from
typing
import
List
import
pandas
as
pd
import
numpy
as
np
from
fractalis.analytics.task
import
AnalyticTask
from
fractalis.analytics.tasks.shared
import
utils
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class HistogramTask(AnalyticTask):
    """Histogram Analysis Task implementing AnalyticTask.

    This class is a submittable celery task.
    """

    name = 'compute-histogram'

    def main(self,
             id_filter: List[str],
             subsets: List[List[str]],
             data: pd.DataFrame,
             categories: List[pd.DataFrame]) -> dict:
        """Compute histogram counts and basic statistics per group.

        Rows with missing values are dropped first. The remaining rows are
        passed through the shared id-filter, subset and category helpers and
        then summarised once per (category, subset) combination.

        :param id_filter: If specified use only given ids during the analysis.
        :param subsets: List of lists of subset ids.
        :param data: Numerical values to create histogram of.
        :param categories: The groups to split the values into.
        :return: ``{'stats': {category: {subset: summary}}}`` where every
            ``summary`` holds the keys ``hist``, ``bin_edges``, ``mean``,
            ``median`` and ``variance``. Combinations without any value are
            left out.
        :raises ValueError: If no row is left after dropping missing values.
        """
        # dropna() returns a new frame, so the caller's DataFrame is left
        # untouched (the in-place variant would modify the argument).
        df = data.dropna()
        if df.shape[0] == 0:
            error = 'The selected numerical variable must be non-empty.'
            # No exception is being handled here, so logger.exception()
            # would attach a meaningless "NoneType: None" traceback.
            logger.error(error)
            raise ValueError(error)
        df = utils.apply_id_filter(df=df, id_filter=id_filter)
        df = utils.apply_subsets(df=df, subsets=subsets)
        df = utils.apply_categories(df=df, categories=categories)

        stats = {}
        category_labels = df['category'].unique().tolist()
        subset_labels = df['subset'].unique().tolist()
        for category in category_labels:
            in_category = df['category'] == category
            for subset in subset_labels:
                values = df.loc[in_category & (df['subset'] == subset),
                                'value']
                # The cross product can contain combinations that never
                # occur together; summarising them would only yield NaN.
                if values.empty:
                    continue
                # Default NumPy binning is used for the histogram.
                hist, bin_edges = np.histogram(values)
                # Store plain Python numbers/lists rather than NumPy
                # scalars so the result holds only builtin types.
                stats.setdefault(category, {})[subset] = {
                    'hist': hist.tolist(),
                    'bin_edges': bin_edges.tolist(),
                    'mean': float(np.mean(values)),
                    'median': float(np.median(values)),
                    # np.var defaults to ddof=0 (population variance).
                    'variance': float(np.var(values)),
                }
        return {'stats': stats}
tests/unit/analytics/histogram/__init__.py
0 → 100644
View file @
fa7b4cbc
tests/unit/analytics/histogram/test_histogram.py
0 → 100644
View file @
fa7b4cbc
import
pytest
import
pandas
as
pd
from
fractalis.analytics.tasks.histogram.main
import
HistogramTask
class TestHistogramTask:
    """Unit tests for ``HistogramTask.main``."""

    task = HistogramTask()

    @staticmethod
    def _value_frame(values):
        """Return a numerical frame with ids 100, 101, ... for ``values``."""
        rows = [[100 + offset, 'foo', value]
                for offset, value in enumerate(values)]
        return pd.DataFrame(rows, columns=['id', 'feature', 'value'])

    @staticmethod
    def _category_frame(size=10):
        """Return a categorical frame alternating 'A'/'B' for ids 100..."""
        labels = ('A', 'B')
        rows = [[100 + offset, 'cat', labels[offset % 2]]
                for offset in range(size)]
        return pd.DataFrame(rows, columns=['id', 'feature', 'value'])

    def test_correct_output(self):
        df = self._value_frame([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        cat_df = self._category_frame()
        result = self.task.main(id_filter=[],
                                subsets=[],
                                data=df,
                                categories=[cat_df])
        assert 'stats' in result
        assert 'A' in result['stats']
        assert 'B' in result['stats']
        assert 0 in result['stats']['A']
        assert all([stat in result['stats']['A'][0] for stat in
                    ['hist', 'bin_edges', 'mean', 'median', 'variance']])

    def test_can_handle_nas(self):
        nan = float('nan')
        df = self._value_frame([nan, 2, nan, 4, nan, 6, nan, 8, nan, 10])
        result = self.task.main(id_filter=[],
                                subsets=[],
                                data=df,
                                categories=[])
        assert result['stats'][''][0]['median'] == 6
        assert result['stats'][''][0]['mean'] == 6
        assert result['stats'][''][0]['variance'] == 8

    def test_can_handle_negatives(self):
        df = self._value_frame([-2, 2, -4, 4, -6, 6, -8, 8, -10, 10])
        result = self.task.main(id_filter=[],
                                subsets=[],
                                data=df,
                                categories=[])
        assert result['stats'][''][0]['median'] == 0
        assert result['stats'][''][0]['mean'] == 0

    def test_can_handle_small_groups(self):
        nan = float('nan')
        df = self._value_frame([1, 2, nan, 4, nan, 6, nan, 8, nan, 10])
        cat_df = self._category_frame()
        result = self.task.main(id_filter=[],
                                subsets=[],
                                data=df,
                                categories=[cat_df])
        assert result['stats']['A'][0]['median'] == 1
        assert result['stats']['A'][0]['mean'] == 1
        assert result['stats']['A'][0]['variance'] == 0

    def test_skips_empty_groups(self):
        nan = float('nan')
        df = self._value_frame([nan, 2, nan, 4, nan, 6, nan, 8, nan, 10])
        cat_df = self._category_frame()
        result = self.task.main(id_filter=[],
                                subsets=[],
                                data=df,
                                categories=[cat_df])
        assert 'A' not in result['stats']
        assert 'B' in result['stats']

    def test_throws_error_if_all_groups_empty(self):
        df = self._value_frame([float('nan')] * 10)
        cat_df = self._category_frame()
        with pytest.raises(ValueError) as e:
            self.task.main(id_filter=[],
                           subsets=[],
                           data=df,
                           categories=[cat_df])
        # The check must run after the ``with`` block (code following the
        # raising call inside it is never reached) and must look at the
        # exception message, not at the ExceptionInfo wrapper itself.
        assert 'selected numerical variable must be non-empty' \
            in str(e.value)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment