Commit d5621eb8 authored by Sascha Herzinger's avatar Sascha Herzinger
Browse files

Added pca analysis

parent 7bf7cc34
Pipeline #2266 failed with stage
in 1 minute and 56 seconds
"""Module containing analysis code for pca."""
from typing import List, TypeVar
from functools import reduce
import logging
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from fractalis.analytics.task import AnalyticTask
from fractalis.analytics.tasks.shared import utils
T = TypeVar('T')
logger = logging.getLogger(__name__)
class PCATask(AnalyticTask):
    """PCATask implementing AnalyticTask. This class is a
    submittable celery task."""

    name = 'compute-pca'

    def main(self,
             features: List[pd.DataFrame],
             categories: List[pd.DataFrame],
             n_components: int,
             whiten: bool,
             id_filter: List[T],
             subsets: List[List[T]]) -> dict:
        """Compute a PCA on the merged feature data.

        :param features: List of long-format DataFrames, each with the
            columns ['id', 'feature', 'value'].
        :param categories: List of DataFrames used by
            utils.apply_categories to assign a category to each id.
        :param n_components: Number of principal components to keep.
        :param whiten: Passed to sklearn's PCA; whitens the components.
        :param id_filter: If non-empty, restrict the analysis to rows
            whose 'id' is in this list.
        :param subsets: List of id lists defining sample subsets. An
            empty list means one subset containing every sample.
        :return: dict with key 'data' containing the reduced data frame
            serialized as JSON records.
        """
        # merge input data into single df
        df = reduce(lambda a, b: a.append(b), features)
        # FIX: id_filter was previously accepted but silently ignored.
        # An empty filter keeps the old behavior (use all samples).
        if id_filter:
            df = df[df['id'].isin(id_filter)]
        if not subsets:
            # empty subsets equals all samples in one subset
            subsets = [df['id'].unique().tolist()]
        # make matrix of data: rows = samples (ids), columns = features
        df = df.pivot(index='feature', columns='id', values='value')
        df = df.T
        # save ids so we can re-assign them after pca
        ids = df.index.tolist()
        # replace missing values with the median of the respective
        # feature column (Imputer axis=0 imputes along columns)
        imp = Imputer(missing_values='NaN', strategy='median', axis=0)
        imp.fit(df)
        df = imp.transform(df)
        # PCA
        pca = PCA(n_components=n_components, whiten=whiten)
        pca.fit(df)
        reduced_df = pca.transform(df)
        # re-assign ids
        reduced_df = pd.DataFrame(reduced_df)
        reduced_df['id'] = ids
        # add category and subset column
        reduced_df = utils.apply_subsets(df=reduced_df, subsets=subsets)
        reduced_df = utils.apply_categories(df=reduced_df,
                                            categories=categories)
        return {
            'data': reduced_df.to_json(orient='records')
        }
\ No newline at end of file
"""This module provides test for the pca task."""
import json
import pandas as pd
from fractalis.analytics.tasks.pca.main import PCATask
# noinspection PyMissingTypeHints
class TestPCATask:
    """Tests for the PCA analytics task."""

    task = PCATask()

    def test_correct_output(self):
        # Two long-format feature sources: sample 103 only appears in
        # the first one, sample 105 only in the second one, so the
        # merged matrix contains missing values that must be imputed.
        foo_bar_rows = [(101, 'foo', 5), (101, 'bar', 6),
                        (102, 'foo', 10), (102, 'bar', 11),
                        (103, 'foo', 15), (103, 'bar', 16),
                        (104, 'foo', 20), (104, 'bar', 21)]
        baz_rows = [(101, 'baz', 5),
                    (102, 'baz', 10),
                    (104, 'baz', 20),
                    (105, 'baz', 100)]
        features = [
            pd.DataFrame(foo_bar_rows, columns=['id', 'feature', 'value']),
            pd.DataFrame(baz_rows, columns=['id', 'feature', 'value'])
        ]
        # Only samples 101, 102 and 104 get category 'a'; the rest
        # should come back with no category assigned.
        categories = [
            pd.DataFrame([(sample_id, '_', 'a')
                          for sample_id in (101, 102, 104)],
                         columns=['id', 'feature', 'value'])
        ]
        result = self.task.main(features=features,
                                categories=categories,
                                n_components=2,
                                whiten=False,
                                id_filter=[],
                                subsets=[])
        data = pd.read_json(result['data'])
        # 5 samples x (2 components + id + subset + category)
        assert data.shape == (5, 5)
        for expected_column in ('0', '1', 'category', 'subset', 'id'):
            assert expected_column in list(data)
        assert data['id'].tolist() == [101, 102, 103, 104, 105]
        assert data['subset'].unique().tolist() == [0]
        assert data['category'].unique().tolist() == ['a', None]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment