# main.py (committed by Sascha Herzinger)
"""Module containing analysis code for pca."""

from typing import List, TypeVar
from functools import reduce
import logging

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer

from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.shared import utils


# Generic type variable used for the id values in the annotations of
# PCATask.main (id_filter and subsets).
T = TypeVar('T')
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class PCATask(AnalyticTask):
    """PCATask implementing AnalyticsTask. This class is a
    submittable celery task."""

    name = 'compute-pca'

    def main(self,
             features: List[pd.DataFrame],
             categories: List[pd.DataFrame],
             whiten: bool,
             id_filter: List[T],
             subsets: List[List[T]]) -> dict:
        """Run a PCA on the given features and return the projected data.

        :param features: Data frames that are concatenated and pivoted; each
            must provide the columns 'id', 'feature' and 'value'.
        :param categories: Data frames handed to utils.apply_categories to
            annotate the projected samples.
        :param whiten: Forwarded to sklearn's PCA as its `whiten` argument.
        :param id_filter: If non-empty, only samples whose id is contained
            in this list take part in the PCA.
        :param subsets: Lists of ids handed to utils.apply_subsets. If empty,
            all ids found in the features form one single subset.
        :return: Dict with the keys 'data' (projected samples),
            'loadings' (one row per feature) and 'variance_ratios'.
        """
        # merge input data into single df (one concat instead of repeated
        # pairwise appends, which are quadratic and gone in pandas >= 2.0)
        df = pd.concat(features)
        if not subsets:
            # empty subsets equals all samples in one subset
            subsets = [df['id'].unique().tolist()]

        # make matrix of data: one row per id, one column per feature
        df = df.pivot(index='feature', columns='id', values='value')
        df = df.T

        # apply id filter
        if id_filter:
            df = df[df.index.isin(id_filter)]

        # The imputer below discards features without any observed value.
        # Drop them explicitly first so the feature labels stay aligned with
        # the columns that actually enter the PCA.
        df = df.dropna(axis=1, how='all')
        feature_labels = list(df)

        # save ids so we can re-assign them after pca
        ids = df.index.tolist()

        # replace missing values with the median of the respective feature
        # (axis=0 imputes along columns)
        imp = Imputer(missing_values='NaN', strategy='median', axis=0)
        imp.fit(df)
        matrix = imp.transform(df)

        # PCA
        pca = PCA(whiten=whiten)
        pca.fit(matrix)
        reduced_df = pca.transform(matrix)

        # get explained variance ratios of components
        variance_ratios = pca.explained_variance_ratio_

        # get loadings: components scaled by the standard deviation of each
        # component. NOTE(review): the sign flip is kept from the original
        # code; its motivation is not visible here.
        loadings = -1 * pca.components_.T * np.sqrt(pca.explained_variance_)
        loadings = pd.DataFrame(loadings)
        loadings['feature'] = feature_labels

        # re-assign ids
        reduced_df = pd.DataFrame(reduced_df)
        reduced_df['id'] = ids

        # add category and subset column
        reduced_df = utils.apply_subsets(df=reduced_df, subsets=subsets)
        reduced_df = utils.apply_categories(df=reduced_df,
                                            categories=categories)

        return {
            'data': reduced_df.to_dict(orient='list'),
            'loadings': loadings.to_dict(orient='list'),
            'variance_ratios': variance_ratios.tolist()
        }