main.py 2.98 KB
Newer Older
Sascha Herzinger's avatar
Sascha Herzinger committed
1
2
"""Module containing analysis code for heatmap analytics."""

Sascha Herzinger's avatar
Sascha Herzinger committed
3
from typing import List, TypeVar
Sascha Herzinger's avatar
Sascha Herzinger committed
4
from functools import reduce
5
import logging
Sascha Herzinger's avatar
Sascha Herzinger committed
6
7
8
9

import pandas as pd

from fractalis.analytics.task import AnalyticTask
10
from fractalis.analytics.tasks.heatmap.stats import StatisticTask
11
from fractalis.analytics.tasks.shared import utils
Sascha Herzinger's avatar
Sascha Herzinger committed
12
13


Sascha Herzinger's avatar
Sascha Herzinger committed
14
# Generic placeholder for the type of sample ids used in the task signatures.
T = TypeVar('T')
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
Sascha Herzinger's avatar
Sascha Herzinger committed
16
17


Sascha Herzinger's avatar
Sascha Herzinger committed
18
19
20
21
22
class HeatmapTask(AnalyticTask):
    """Heatmap Analysis Task implementing AnalyticsTask. This class is a
    submittable celery task."""

    name = 'compute-heatmap'
    # Statistic task used to compute the per-feature ranking values.
    stat_task = StatisticTask()

    def main(self, numerical_arrays: List[pd.DataFrame],
             numericals: List[pd.DataFrame],
             categoricals: List[pd.DataFrame],
             ranking_method: str,
             id_filter: List[T],
             max_rows: int,
             subsets: List[List[T]]) -> dict:
        """Compute the heatmap data and the statistics used to rank it.

        :param numerical_arrays: Data frames that are concatenated into the
            working data set. The result is pivoted on the columns 'id',
            'feature' and 'value', so those columns must be present.
        :param numericals: Not used by this method.
        :param categoricals: Not used by this method.
        :param ranking_method: Forwarded to the statistic task; also the name
            of the column of the returned statistics used for sorting.
        :param id_filter: If non-empty, only rows whose 'id' is in this list
            are kept.
        :param max_rows: Maximum number of distinct features in the output.
        :param subsets: Lists of sample ids. If empty, all ids found in the
            data form a single subset; otherwise rows whose id is in none
            of the subsets are dropped.
        :return: Dict with the keys 'data' and 'stats', each holding a JSON
            string in 'records' orientation.
        :raises ValueError: If there is no input data, or no rows are left
            after applying the subsets and the id filter.
        """
        too_small_error = "Either the input data set is too small or " \
                          "the subset sample ids do not match the data."

        # Without any input frame there is nothing to concatenate; report it
        # the same way as a data set that ends up empty after filtering.
        if not numerical_arrays:
            logger.error(too_small_error)
            raise ValueError(too_small_error)

        # merge input data into single df
        # (pd.concat replaces the pairwise DataFrame.append chain, which was
        # removed in pandas 2.0 and re-copied the data on every step)
        df = pd.concat(numerical_arrays)
        if not subsets:
            # empty subsets equals all samples in one subset
            subsets = [df['id'].unique().tolist()]
        else:
            # if subsets are defined we drop the rows that are not part of one
            flattened_subsets = [x for subset in subsets for x in subset]
            df = df[df['id'].isin(flattened_subsets)]
        # apply id filter
        if id_filter:
            df = df[df['id'].isin(id_filter)]
        # drop subset ids that are not in the df
        subsets = utils.drop_unused_subset_ids(df=df, subsets=subsets)
        # make sure the input data are still valid after the pre-processing
        if df.shape[0] < 1:
            logger.error(too_small_error)
            raise ValueError(too_small_error)

        # make matrix of input data (one row per feature, one column per id)
        _df = df.pivot(index='feature', columns='id', values='value')

        # create z-score matrix used for visualising the heatmap:
        # per feature row, subtract the row mean and divide by the
        # population standard deviation (ddof=0)
        z_df = _df.sub(_df.mean(axis=1), axis=0) \
                  .div(_df.std(axis=1, ddof=0), axis=0)

        # compute statistic for ranking
        stats = self.stat_task.main(df=_df, subsets=subsets,
                                    ranking_method=ranking_method)
        del _df

        # prepare output for front-end: bring the z-scores back into long
        # format and attach them to the matching (id, feature) rows
        z_df['feature'] = z_df.index
        z_df = pd.melt(z_df, id_vars='feature', var_name='id',
                       value_name='zscore')
        df = df.merge(z_df, on=['id', 'feature'])
        # select by name rather than renaming by position, so the output
        # does not depend on the column order of the input frames
        df = df[['id', 'feature', 'value', 'zscore']]

        # sort by ranking_value; the value is looked up once per distinct
        # feature (first matching stats row) instead of once per data row
        ranking_values = {
            feature:
                stats[stats['feature'] == feature][ranking_method].tolist()[0]
            for feature in df['feature'].unique()
        }
        df['sort_value'] = df['feature'].map(ranking_values)
        df = df.sort_values('sort_value', ascending=False) \
               .drop(columns='sort_value')

        # discard rows according to max_rows
        df = df[df['feature'].isin(df['feature'].unique()[:max_rows])]

        return {
            'data': df.to_json(orient='records'),
            'stats': stats.to_json(orient='records')
        }