Commit 72793e60 authored by Sascha Herzinger's avatar Sascha Herzinger
Browse files

added variance ratios to PCA output

parent 6cbd51bd
Pipeline #2269 failed with stage
in 1 minute and 16 seconds
...@@ -5,6 +5,7 @@ from functools import reduce ...@@ -5,6 +5,7 @@ from functools import reduce
import logging import logging
import pandas as pd import pandas as pd
import numpy as np
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer from sklearn.preprocessing import Imputer
...@@ -38,6 +39,7 @@ class PCATask(AnalyticTask): ...@@ -38,6 +39,7 @@ class PCATask(AnalyticTask):
# make matrix of data # make matrix of data
df = df.pivot(index='feature', columns='id', values='value') df = df.pivot(index='feature', columns='id', values='value')
df = df.T df = df.T
feature_labels = list(df)
# apply id filter # apply id filter
if id_filter: if id_filter:
...@@ -56,6 +58,14 @@ class PCATask(AnalyticTask): ...@@ -56,6 +58,14 @@ class PCATask(AnalyticTask):
pca.fit(df) pca.fit(df)
reduced_df = pca.transform(df) reduced_df = pca.transform(df)
# get explained variance ratios of components
variance_ratios = pca.explained_variance_ratio_
# get loadings
loadings = -1 * pca.components_.T * np.sqrt(pca.explained_variance_)
loadings = pd.DataFrame(loadings)
loadings['feature'] = feature_labels
# re-assign ids # re-assign ids
reduced_df = pd.DataFrame(reduced_df) reduced_df = pd.DataFrame(reduced_df)
reduced_df['id'] = ids reduced_df['id'] = ids
...@@ -66,5 +76,7 @@ class PCATask(AnalyticTask): ...@@ -66,5 +76,7 @@ class PCATask(AnalyticTask):
categories=categories) categories=categories)
return { return {
'data': reduced_df.to_json(orient='records') 'data': reduced_df.to_json(orient='records'),
'loadings': loadings.to_json(orient='records'),
'variance_ratios': variance_ratios.tolist()
} }
\ No newline at end of file
...@@ -62,3 +62,38 @@ class TestPCATask: ...@@ -62,3 +62,38 @@ class TestPCATask:
subsets=[]) subsets=[])
data = pd.read_json(result['data']) data = pd.read_json(result['data'])
assert data['id'].unique().tolist() == [101, 104] assert data['id'].unique().tolist() == [101, 104]
def test_correct_loadings(self):
    """Loadings of two perfectly anti-correlated features.

    'foo' rises as 'bar' falls across the four ids, so the two
    features must load with opposite signs on the first component
    and with equal values on the second.
    """
    rows = [[101, 'foo', 5], [101, 'bar', 20],
            [102, 'foo', 10], [102, 'bar', 15],
            [103, 'foo', 15], [103, 'bar', 10],
            [104, 'foo', 20], [104, 'bar', 5]]
    feature_df = pd.DataFrame(rows, columns=['id', 'feature', 'value'])
    result = self.task.main(features=[feature_df],
                            categories=[],
                            n_components=2,
                            whiten=False,
                            id_filter=[],
                            subsets=[])
    loadings = pd.read_json(result['loadings'])
    pc1 = loadings['0'].tolist()
    pc2 = loadings['1'].tolist()
    # opposite sign on PC1, identical on PC2
    assert pc1[0] == -pc1[1]
    assert pc2[0] == pc2[1]
def test_correct_variance_ratios(self):
    """Variance ratios when only one feature varies.

    'bar' is constant (all 5s) while 'foo' varies, so the first
    principal component must explain all of the variance and the
    second none of it.
    """
    rows = [[101, 'foo', 5], [101, 'bar', 5],
            [102, 'foo', 10], [102, 'bar', 5],
            [103, 'foo', 15], [103, 'bar', 5],
            [104, 'foo', 20], [104, 'bar', 5]]
    feature_df = pd.DataFrame(rows, columns=['id', 'feature', 'value'])
    result = self.task.main(features=[feature_df],
                            categories=[],
                            n_components=2,
                            whiten=False,
                            id_filter=[],
                            subsets=[])
    # all variance on the first component, none on the second
    assert result['variance_ratios'] == [1, 0]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment