#!/usr/bin/env python

import pandas as pd
import numpy as np
import pickle

# Load the tab-separated feature matrix
df = pd.read_csv("matrix_model_complete.tsv", sep='\t')
#print(df.head())

# Features: everything except the 'class' label and the '#' index column
X = df.drop(columns=['class', '#'])
y = df['class']

#print(X)
#print(y)

from sklearn.model_selection import train_test_split

# Train/test split: hold out 33% of samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=66)
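
# If the 'class' labels are imbalanced, a stratified split keeps the class
# ratios comparable between train and test sets; a sketch, commented out so
# the split above stays in effect:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=66, stratify=y)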

from sklearn.ensemble import RandomForestClassifier

# Random forest classifier; 'sqrt' replaces the deprecated 'auto' setting for
# max_features (equivalent for classifiers, removed in scikit-learn 1.3)
rfc = RandomForestClassifier(n_estimators=1600, max_depth=340, max_features='sqrt')
rfc.fit(X_train, y_train)

# predictions
rfc_predict = rfc.predict(X_test)

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

# 10-fold cross-validated ROC AUC on the full dataset ('roc_auc' assumes a binary 'class' label)
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='roc_auc')

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

#filename = 'finalized_model_3f.sav'
#pickle.dump(rfc, open(filename, 'wb'))
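
# To reuse the persisted model later, load it back with pickle; a sketch that
# mirrors the commented-out dump above (same filename):
#loaded_rfc = pickle.load(open('finalized_model_3f.sav', 'rb'))
#print(loaded_rfc.predict(X_test))
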
# Tuning Hyperparameters
#from sklearn.model_selection import RandomizedSearchCV

# number of trees in random forest
#n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# number of features considered at every split ('auto' was removed in scikit-learn 1.3)
#max_features = ['sqrt', 'log2']

# max depth
#max_depth = [int(x) for x in np.linspace(100, 500, num=11)]
#max_depth.append(None)

# create random grid
#random_grid = {
#	'n_estimators': n_estimators,
#	'max_features': max_features,
#	'max_depth': max_depth
#	}

# Random search of parameters
#rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the model
#rfc_random.fit(X_train, y_train)

#print(rfc_random.cv_results_)
#print(rfc_random.best_params_)
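
# After the random search finishes, its best hyperparameters can be fed back
# into a fresh classifier and refit on the training data; a sketch, commented
# out like the search itself:
#rfc_best = RandomForestClassifier(**rfc_random.best_params_)
#rfc_best.fit(X_train, y_train)
#print(classification_report(y_test, rfc_best.predict(X_test)))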