Commit 0a1484cb authored by Sascha Herzinger's avatar Sascha Herzinger
Browse files

added more random data generators

parent 6a5d9f8b
Pipeline #2347 failed with stage
in 44 seconds
"""This module provides sample data."""
import pandas as pd
import random
from fractalis.data.etl import ETL
class RandomCategoricalETL(ETL):
    """ETL for the 'test' handler that generates random categorical data.

    The descriptor is expected to carry the keys 'data_type' (must be
    'categorical'), 'values' (the sequence to sample from) and
    'num_samples' (how many values to draw).
    """

    name = 'test_categorical_etl'
    produces = 'categorical'

    @staticmethod
    def can_handle(handler: str, descriptor: dict) -> bool:
        """Return True for the 'test' handler and categorical descriptors.

        A descriptor without a 'data_type' key is reported as not
        handled instead of raising KeyError.
        """
        return (handler == 'test'
                and descriptor.get('data_type') == 'categorical')

    def extract(self, server: str,
                token: str, descriptor: dict) -> pd.DataFrame:
        """Draw descriptor['num_samples'] random picks from descriptor['values'].

        `server` and `token` are not used; no remote source is contacted.

        :param server: Unused.
        :param token: Unused.
        :param descriptor: Must contain 'values' and 'num_samples'.
        :return: A single-column DataFrame with one row per sample.
        """
        samples = [random.choice(descriptor['values'])
                   for _ in range(descriptor['num_samples'])]
        return pd.DataFrame(samples)

    def transform(self, raw_data: pd.DataFrame,
                  descriptor: dict) -> pd.DataFrame:
        """Reshape the extracted data into long format.

        :param raw_data: Frame returned by `extract`; it is left untouched.
        :param descriptor: Unused.
        :return: A DataFrame with the columns 'id', 'feature' and 'value',
            where 'id' is the stringified row index of `raw_data`.
        """
        # Work on a copy: DataFrame.insert operates in place, so inserting
        # into `raw_data` directly would modify the caller's frame and make
        # a second transform of the same frame fail on the existing 'id'.
        frame = raw_data.copy()
        frame.insert(0, 'id', frame.index.astype('str'))
        return pd.melt(frame, id_vars='id', var_name='feature')
"""This module provides sample data."""
import pandas as pd
import numpy as np
import string
import random
from fractalis.data.etl import ETL
class RandomNumericalETL(ETL):
    """ETL for the 'test' handler that generates random numerical data.

    The descriptor is expected to carry the keys 'data_type' (must be
    'numerical') and 'num_samples' (how many values to draw).
    """

    name = 'test_numerical_etl'
    produces = 'numerical'

    @staticmethod
    def can_handle(handler: str, descriptor: dict) -> bool:
        """Return True for the 'test' handler and numerical descriptors.

        A descriptor without a 'data_type' key is reported as not
        handled instead of raising KeyError.
        """
        return (handler == 'test'
                and descriptor.get('data_type') == 'numerical')

    def extract(self, server: str,
                token: str, descriptor: dict) -> pd.DataFrame:
        """Create one randomly named column of standard-normal samples.

        `server` and `token` are not used; no remote source is contacted.

        :param server: Unused.
        :param token: Unused.
        :param descriptor: Must contain 'num_samples'.
        :return: A single-column DataFrame with descriptor['num_samples']
            rows; the column label is a random 30-character string of
            ASCII letters and digits.
        """
        alphabet = string.ascii_letters + string.digits
        feature = ''.join(random.choice(alphabet) for _ in range(30))
        values = np.random.randn(descriptor['num_samples']).tolist()
        return pd.DataFrame(values, columns=[feature])

    def transform(self, raw_data: pd.DataFrame,
                  descriptor: dict) -> pd.DataFrame:
        """Reshape the extracted data into long format.

        :param raw_data: Frame returned by `extract`; it is left untouched.
        :param descriptor: Unused.
        :return: A DataFrame with the columns 'id', 'feature' and 'value',
            where 'id' is the stringified row index of `raw_data`.
        """
        # Work on a copy: DataFrame.insert operates in place, so inserting
        # into `raw_data` directly would modify the caller's frame and make
        # a second transform of the same frame fail on the existing 'id'.
        frame = raw_data.copy()
        frame.insert(0, 'id', frame.index.astype('str'))
        return pd.melt(frame, id_vars='id', var_name='feature')
...@@ -2,11 +2,13 @@ ...@@ -2,11 +2,13 @@
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import string
import random
from fractalis.data.etl import ETL from fractalis.data.etl import ETL
class RandomNumericalETL(ETL): class RandomNumericalArrayETL(ETL):
name = 'test_numerical_array_etl' name = 'test_numerical_array_etl'
produces = 'numerical_array' produces = 'numerical_array'
...@@ -18,8 +20,12 @@ class RandomNumericalETL(ETL): ...@@ -18,8 +20,12 @@ class RandomNumericalETL(ETL):
def extract(self, server: str, def extract(self, server: str,
token: str, descriptor: dict) -> pd.DataFrame: token: str, descriptor: dict) -> pd.DataFrame:
features = [''.join(random.choice(string.ascii_letters + string.digits)
for _ in range(10))
for _ in range(descriptor['num_features'])]
data = pd.DataFrame(np.random.randn( data = pd.DataFrame(np.random.randn(
descriptor['num_samples'], descriptor['num_features']).tolist()) descriptor['num_samples'], descriptor['num_features']).tolist(),
columns=features)
return data return data
def transform(self, raw_data: pd.DataFrame, def transform(self, raw_data: pd.DataFrame,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment