main.py 3.6 KB
Newer Older
Sascha Herzinger's avatar
Sascha Herzinger committed
1
2
"""Module containing analysis code for heatmap analytics."""

Sascha Herzinger's avatar
Sascha Herzinger committed
3
from typing import List, TypeVar
Sascha Herzinger's avatar
Sascha Herzinger committed
4
from functools import reduce
5
import logging
Sascha Herzinger's avatar
Sascha Herzinger committed
6
7
8
9

import pandas as pd

from fractalis.analytics.task import AnalyticTask
10
from fractalis.analytics.tasks.heatmap.stats import StatisticTask
11
from fractalis.analytics.tasks.shared import utils
Sascha Herzinger's avatar
Sascha Herzinger committed
12
13


Sascha Herzinger's avatar
Sascha Herzinger committed
14
# Generic placeholder type for the sample ids used in the type hints below.
T = TypeVar('T')
15
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
Sascha Herzinger's avatar
Sascha Herzinger committed
16
17


Sascha Herzinger's avatar
Sascha Herzinger committed
18
19
20
21
22
class HeatmapTask(AnalyticTask):
    """Heatmap Analysis Task implementing AnalyticsTask. This class is a
    submittable celery task."""

    name = 'compute-heatmap'
    stat_task = StatisticTask()

    def main(self, numerical_arrays: List[pd.DataFrame],
             numericals: List[pd.DataFrame],
             categoricals: List[pd.DataFrame],
             ranking_method: str,
             id_filter: List[T],
             max_rows: int,
             subsets: List[List[T]]) -> dict:
        """Compute the heatmap matrix, its z-scores and the ranking statistics.

        :param numerical_arrays: Data frames in long format; the columns
            'id', 'feature' and 'value' are read by this method.
        :param numericals: Not used by this method.
        :param categoricals: Not used by this method.
        :param ranking_method: Name of the statistic used for ranking. It is
            passed on to the statistic task and must be a column of the
            statistics it returns.
        :param id_filter: If non-empty, only samples with these ids are kept.
        :param max_rows: Number of top ranked features kept in the output.
        :param subsets: Lists of sample ids. If empty, all samples are
            treated as one single subset.
        :return: Dict with the keys 'data' (feature, id, value and zscore
            columns) and 'stats' (ranking statistics), both in column
            oriented list format.
        :raises ValueError: If there are no input data, or no rows are left
            after applying subsets and id filter.
        """
        error = "Either the input data set is too small or " \
                "the subset sample ids do not match the data."
        if not numerical_arrays:
            logger.error(error)
            raise ValueError(error)

        # merge input data into single df (DataFrame.append no longer exists
        # in pandas >= 2.0, pd.concat keeps the same row indices)
        df = pd.concat(numerical_arrays)
        if not subsets:
            # empty subsets equals all samples in one subset
            subsets = [df['id'].unique().tolist()]
        else:
            # if subsets are defined we drop the rows that are not part of one
            flattened_subsets = [x for subset in subsets for x in subset]
            df = df[df['id'].isin(flattened_subsets)]
        # apply id filter
        if id_filter:
            df = df[df['id'].isin(id_filter)]
        # drop subset ids that are not in the df
        subsets = utils.drop_unused_subset_ids(df=df, subsets=subsets)
        # make sure the input data are still valid after the pre-processing
        if df.shape[0] < 1:
            logger.error(error)
            raise ValueError(error)

        # make matrix of input data (rows: features, columns: sample ids)
        df = df.pivot(index='feature', columns='id', values='value')

        # create z-score matrix used for visualising the heatmap; every row
        # is centred and scaled by its own mean and population std (ddof=0)
        row_means = df.mean(axis=1)
        row_stds = df.std(axis=1, ddof=0)
        z_df = df.sub(row_means, axis=0).div(row_stds, axis=0)

        # compute statistic for ranking
        stats = self.stat_task.main(df=df, subsets=subsets,
                                    ranking_method=ranking_method)

        # sort by ranking_value
        # NOTE(review): the ranking values are matched to the matrix rows by
        # position, so this presumably relies on the statistic task returning
        # one row per feature in matrix order -- confirm against StatisticTask.
        self.sort(df, stats[ranking_method], ranking_method)
        self.sort(z_df, stats[ranking_method], ranking_method)
        self.sort(stats, stats[ranking_method], ranking_method)

        # discard rows according to max_rows
        df = df.iloc[:max_rows]
        z_df = z_df.iloc[:max_rows]
        stats = stats.iloc[:max_rows]

        # prepare output for front-end; reset_index turns the 'feature' index
        # into a regular column without writing into a slice of the matrix
        df = pd.melt(df.reset_index(), id_vars='feature', var_name='id')
        z_df = pd.melt(z_df.reset_index(), id_vars='feature', var_name='id')
        df = df.merge(z_df, on=['id', 'feature'])
        df = df.rename(columns={'value_x': 'value', 'value_y': 'zscore'})

        return {
            'data': df.to_dict(orient='list'),
            'stats': stats.to_dict(orient='list')
        }

    @staticmethod
    def sort(df, order, method):
        """Sort the rows of df in place by the given ranking values.

        For 'P.Value' and 'adj.P.Val' the smallest values come first, for
        'logFC' and 't' the largest absolute values come first. For any other
        method the rows are sorted by the raw values in descending order.

        :param df: Data frame to sort. It is modified in place.
        :param order: Ranking values, one per row of df in row order. Must
            provide a tolist() method.
        :param method: Name of the ranking method the values belong to.
        """
        order = order.tolist()
        if method in ('P.Value', 'adj.P.Val'):
            # small p-values must rank first in the descending sort below
            order = [1 - x for x in order]
        elif method in ('logFC', 't'):
            order = [abs(x) for x in order]
        # temporary helper column, removed again after sorting
        df['sort_value'] = order
        df.sort_values('sort_value', ascending=False, inplace=True)
        df.drop('sort_value', axis=1, inplace=True)