Commit 72793e60 authored by Sascha Herzinger
Browse files

added variance ratios to PCA output

parent 6cbd51bd
Pipeline #2269 failed with stage
in 1 minute and 16 seconds
......@@ -5,6 +5,7 @@ from functools import reduce
import logging
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
......@@ -38,6 +39,7 @@ class PCATask(AnalyticTask):
# make matrix of data
df = df.pivot(index='feature', columns='id', values='value')
df = df.T
feature_labels = list(df)
# apply id filter
if id_filter:
......@@ -56,6 +58,14 @@ class PCATask(AnalyticTask):
pca.fit(df)
reduced_df = pca.transform(df)
# get explained variance ratios of components
variance_ratios = pca.explained_variance_ratio_
# get loadings
loadings = -1 * pca.components_.T * np.sqrt(pca.explained_variance_)
loadings = pd.DataFrame(loadings)
loadings['feature'] = feature_labels
# re-assign ids
reduced_df = pd.DataFrame(reduced_df)
reduced_df['id'] = ids
......@@ -66,5 +76,7 @@ class PCATask(AnalyticTask):
categories=categories)
return {
'data': reduced_df.to_json(orient='records')
'data': reduced_df.to_json(orient='records'),
'loadings': loadings.to_json(orient='records'),
'variance_ratios': variance_ratios.tolist()
}
\ No newline at end of file
......@@ -62,3 +62,38 @@ class TestPCATask:
subsets=[])
data = pd.read_json(result['data'])
assert data['id'].unique().tolist() == [101, 104]
def test_correct_loadings(self):
    """Check the PCA loadings for two perfectly anti-correlated features.

    'foo' increases by 5 per sample while 'bar' decreases by 5, so the
    two features are expected to load on the first component with equal
    magnitude and opposite sign, and to share the same loading on the
    second component.
    """
    # Local import: the module-level imports of this test file are not
    # visible here, so nothing beyond `pd` is assumed to be in scope.
    import math

    features = [
        pd.DataFrame([[101, 'foo', 5], [101, 'bar', 20],
                      [102, 'foo', 10], [102, 'bar', 15],
                      [103, 'foo', 15], [103, 'bar', 10],
                      [104, 'foo', 20], [104, 'bar', 5]],
                     columns=['id', 'feature', 'value'])
    ]
    result = self.task.main(features=features,
                            categories=[],
                            n_components=2,
                            whiten=False,
                            id_filter=[],
                            subsets=[])
    loadings = pd.read_json(result['loadings'])
    first_component = loadings['0'].tolist()
    second_component = loadings['1'].tolist()
    # The loadings are floats that went through PCA arithmetic and a JSON
    # round trip, so compare with a tolerance instead of exact equality.
    # abs_tol is required because the second component is expected to be
    # (close to) zero, where a purely relative tolerance never matches.
    assert math.isclose(first_component[0], -first_component[1],
                        abs_tol=1e-9)
    assert math.isclose(second_component[0], second_component[1],
                        abs_tol=1e-9)
def test_correct_variance_ratios(self):
    """Check the explained variance ratios when only one feature varies.

    'bar' is constant (5 for every sample) while 'foo' changes, so all
    variance is expected on the first component and none on the second.
    """
    # Local import: the module-level imports of this test file are not
    # visible here, so nothing beyond `pd` is assumed to be in scope.
    import math

    features = [
        pd.DataFrame([[101, 'foo', 5], [101, 'bar', 5],
                      [102, 'foo', 10], [102, 'bar', 5],
                      [103, 'foo', 15], [103, 'bar', 5],
                      [104, 'foo', 20], [104, 'bar', 5]],
                     columns=['id', 'feature', 'value'])
    ]
    result = self.task.main(features=features,
                            categories=[],
                            n_components=2,
                            whiten=False,
                            id_filter=[],
                            subsets=[])
    variance_ratios = result['variance_ratios']
    # One ratio per requested component.
    assert len(variance_ratios) == 2
    # The ratios are floating-point results; exact comparison against
    # [1, 0] can fail on rounding noise, so allow a small absolute
    # tolerance (needed in particular for the expected 0).
    assert math.isclose(variance_ratios[0], 1, abs_tol=1e-9)
    assert math.isclose(variance_ratios[1], 0, abs_tol=1e-9)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment