Commit 805292b4 authored by Sascha Herzinger's avatar Sascha Herzinger

Implemented survival analysis

parent be9f918c
Pipeline #5326 failed in 2 minutes and 53 seconds
@@ -61,6 +61,7 @@ class BoxplotTask(AnalyticTask):
(df['feature'] == feature)]['value'].tolist()
if len(values) < 2:
continue
# FIXME: The label format on the next line is ugly. See kaplan_meier_survival.py
label = '{}//{}//s{}'.format(feature, category, subset + 1)
group_values.append(values)
stats = self.boxplot_statistics(values)
......
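For context on the FIXME above: the boxplot task flattens every feature/category/subset combination into one '//'-delimited label string, whereas the new kaplan_meier_survival.py below nests its results by category and subset instead. A minimal sketch of the label convention, using made-up values (not part of the commit):

# Sketch only: illustrate the '//'-delimited group label used by the boxplot task.
feature, category, subset = 'age', 'female', 0  # hypothetical group
label = '{}//{}//s{}'.format(feature, category, subset + 1)
assert label == 'age//female//s1'
# Consumers have to split the string again to recover the three parts:
feature, category, subset_label = label.split('//')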
"""Module containing the Celery Task for the Correlation Analysis."""
import logging
from typing import List, TypeVar
from typing import List
import pandas as pd
import numpy as np
from scipy import stats
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.shared.utils import \
apply_subsets, apply_categories
from fractalis.analytics.tasks.shared import utils
logger = logging.getLogger(__name__)
T = TypeVar('T')
class CorrelationTask(AnalyticTask):
@@ -25,9 +23,9 @@ class CorrelationTask(AnalyticTask):
def main(self,
x: pd.DataFrame,
y: pd.DataFrame,
id_filter: List[T],
id_filter: List[str],
method: str,
subsets: List[List[T]],
subsets: List[List[str]],
categories: List[pd.DataFrame]) -> dict:
"""Compute correlation statistics for the given parameters.
:param x: DataFrame containing x axis values.
@@ -51,8 +49,8 @@ class CorrelationTask(AnalyticTask):
(x_label, y_label) = (df['feature_x'][0], df['feature_y'][0])
if id_filter:
df = df[df['id'].isin(id_filter)]
df = apply_subsets(df=df, subsets=subsets)
df = apply_categories(df=df, categories=categories)
df = utils.apply_subsets(df=df, subsets=subsets)
df = utils.apply_categories(df=df, categories=categories)
global_stats = self.compute_stats(df, method)
output = global_stats
output['method'] = method
......
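The correlation hunk above forwards a `method` string into the task's statistics code, and the module imports `scipy.stats`. The commit does not show `compute_stats`, but a plausible dispatch onto scipy's correlation tests could look like the following sketch (purely an assumption; the helper name and return keys are hypothetical):

import pandas as pd
from scipy import stats

def correlation_stats(x: pd.Series, y: pd.Series, method: str) -> dict:
    """Hypothetical helper: dispatch to scipy's correlation tests."""
    tests = {
        'pearson': stats.pearsonr,
        'spearman': stats.spearmanr,
        'kendall': stats.kendalltau,
    }
    coef, p_value = tests[method](x, y)
    return {'coef': coef, 'p_value': p_value}

# Example usage with toy data:
correlation_stats(pd.Series([1, 2, 3, 4]), pd.Series([2, 4, 5, 9]), 'pearson')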
"""This module provides statistics for a Kaplan Meier Survival Analysis."""
import logging
from typing import List
import pandas as pd
import numpy as np
from lifelines import KaplanMeierFitter
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.shared import utils
logger = logging.getLogger(__name__)
class KaplanMeierSurvivalTask(AnalyticTask):
"""Kaplan Meier Survival Analysis Task implementing AnalyticTask. This
class is a submittable celery task."""
name = 'kaplan-meier-estimate'
def main(self, durations: List[pd.DataFrame],
categories: List[pd.DataFrame],
event_observed: List[pd.DataFrame],
id_filter: List[str],
subsets: List[List[str]]) -> dict:
# TODO: Docstring
if len(durations) != 1:
error = 'Analysis requires exactly one array that specifies the ' \
'duration length.'
logger.exception(error)
raise ValueError(error)
if len(event_observed) > 1:
error = 'At most one variable for "event_observed" is allowed.'
logger.exception(error)
raise ValueError(error)
df = durations[0]
if id_filter:
df = df[df['id'].isin(id_filter)]
df = utils.apply_subsets(df=df, subsets=subsets)
df = utils.apply_categories(df=df, categories=categories)
stats = {}
# for every category and subset combination, estimate the survival function
for category in df['category'].unique().tolist():
for subset in df['subset'].unique().tolist():
kmf = KaplanMeierFitter()
sub_df = df[(df['category'] == category) &
(df['subset'] == subset)]
T = sub_df['value']
E = None  # by default, assume nothing is censored
if event_observed:
# look up the observed-event boolean for every duration
E = event_observed[0].merge(sub_df, how='left', on='id')
E = [bool(x) and not np.isnan(x) for x in E['value']]
assert len(E) == len(T)
kmf.fit(durations=T, event_observed=E)
if not stats.get(category):
stats[category] = {}
# noinspection PyUnresolvedReferences
stats[category][subset] = {
'timeline': kmf.timeline,
'median': kmf.median_,
'survival_function':
kmf.survival_function_.to_dict(orient='list'),
'confidence_interval':
kmf.confidence_interval_.to_dict(orient='list')
}
return {
'stats': stats
}
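To make the returned 'stats' structure easier to follow, here is a standalone sketch (synthetic data, not from the commit) that calls lifelines' KaplanMeierFitter directly and prints the same four attributes the task serializes per category/subset combination:

from lifelines import KaplanMeierFitter

# Synthetic durations in arbitrary time units; 0 marks a censored observation.
durations = [5, 6, 6, 2, 4, 4, 9, 12, 3, 7]
observed = [1, 0, 1, 1, 1, 0, 1, 1, 0, 1]

kmf = KaplanMeierFitter()
kmf.fit(durations=durations, event_observed=observed)

print(kmf.timeline)                                   # time points of the estimate
print(kmf.median_)                                    # median survival time
print(kmf.survival_function_.to_dict(orient='list'))  # KM estimate per time point
print(kmf.confidence_interval_.to_dict(orient='list'))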
"""This module provides statistics for volcano plots."""
import logging
from typing import List, TypeVar
from typing import List
from functools import reduce
import pandas as pd
@@ -9,7 +9,7 @@ import pandas as pd
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.shared import utils, array_stats
T = TypeVar('T')
# TODO: Log more
logger = logging.getLogger(__name__)
@@ -20,10 +20,11 @@ class VolcanoTask(AnalyticTask):
name = 'compute-volcanoplot'
def main(self, numerical_arrays: List[pd.DataFrame],
id_filter: List[T],
id_filter: List[str],
ranking_method: str,
params: dict,
subsets: List[List[T]]):
subsets: List[List[str]]) -> dict:
# TODO: docstring
# merge input data into single df
df = reduce(lambda a, b: a.append(b), numerical_arrays)
if not subsets:
......
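The volcano hunk above merges its input frames with reduce(lambda a, b: a.append(b), numerical_arrays). With the pandas version pinned in this commit (0.20.3) DataFrame.append is available, but the same result can be obtained with a single pd.concat, which avoids one copy per step. The toy frames below assume the id/feature/value long format used elsewhere in this diff:

from functools import reduce
import pandas as pd

# Toy frames in the id/feature/value long format (assumed, for illustration only).
numerical_arrays = [
    pd.DataFrame({'id': ['s1', 's2'], 'feature': 'geneA', 'value': [1.0, 2.0]}),
    pd.DataFrame({'id': ['s1', 's2'], 'feature': 'geneB', 'value': [3.0, 4.0]}),
]

# The reduce/append idiom from the hunk (note: DataFrame.append was removed in pandas 2.0):
df = reduce(lambda a, b: a.append(b), numerical_arrays)

# Equivalent single concatenation:
df = pd.concat(numerical_arrays, ignore_index=True)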
@@ -22,6 +22,7 @@ Jinja2==2.10
jsonschema==2.6.0
kiwisolver==1.0.1
kombu==4.1.0
lifelines==0.14.3
MarkupSafe==1.0
matplotlib==2.2.2
mccabe==0.6.1
......
@@ -28,6 +28,7 @@ setup(
'scipy==0.19.1',
'pandas==0.20.3',
'sklearn==0.0',
'lifelines==0.14.3',
'requests==2.18.4',
'PyYAML==3.12',
'pycryptodomex==3.4.7',
......