from helpers.clean import *
from helpers.eda import *
from helpers.model import *
from helpers.evaluate import *
import time
start_notebook = time.time()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import ADASYN, \
SMOTE
from sklearn.ensemble import AdaBoostClassifier, \
RandomForestClassifier, \
VotingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, \
plot_tree
from sklearn.metrics import accuracy_score, \
classification_report, \
confusion_matrix, \
ConfusionMatrixDisplay, \
f1_score, \
make_scorer, \
recall_score, \
roc_curve, \
precision_score
from sklearn.model_selection import cross_val_score, \
train_test_split, \
GridSearchCV
seed = 42
teal = '#01ACA5'
purple = '#BF40BF'
sns.set_palette(sns.color_palette([teal, purple]))
white_median = dict(color='yellow')
pd.set_option('display.max_columns', 9)
import warnings
warnings.filterwarnings('ignore')
header = ['age', 'class_of_worker', 'industry_code', 'occupation_code', 'education', 'wage_per_hour',
'enrolled_in_edu_inst_last_wk', 'marital_status', 'major_industry_code',
'major_occupation_code', 'race', 'hispanic_origin', 'sex', 'member_of_a_labor_union',
'reason_for_unemployment', 'full_or_part_time_employment_stat', 'capital_gains', 'capital_losses',
'divdends_from_stocks', 'tax_filer_status', 'region_of_previous_residence',
'state_of_previous_residence', 'detailed_household_and_family_stat',
'detailed_household_summary_in_household', 'unknown_column', 'migration_code_change_in_msa',
'migration_code_change_in_reg', 'migration_code_move_within_reg', 'live_in_this_house_1_year_ago',
'migration_prev_res_in_sunbelt', 'num_persons_worked_for_employer', 'family_members_under_18',
'country_of_birth_father', 'country_of_birth_mother', 'country_of_birth_self',
'citizenship', 'own_business_or_self_employed', 'fill_inc_questionnaire_for_veterans_admin',
'veterans_benefits', 'weeks_worked_in_year', 'year', 'taxable_income_amount']
columns_to_drop_eda = ['weeks_worked_in_year',
'industry_code',
'hispanic_origin',
'detailed_household_and_family_stat',
'country_of_birth_father',
'country_of_birth_mother',
'country_of_birth_self',
'own_business_or_self_employed',
'veterans_benefits',
'year']
start_time = time.time()
raw_train = pd.read_csv('./data/census_income_learn.csv', names=header, sep=', ')
raw_test = pd.read_csv('./data/census_income_test.csv', names=header, sep=', ')
train = pd.read_csv('./data/census_income_learn.csv', names=header, sep=', ')
test = pd.read_csv('./data/census_income_test.csv', names=header, sep=', ')
end_time = time.time()
execution_time = end_time - start_time
print(f'Loading time: {round(execution_time)} seconds')
Loading time: 9 seconds
start_time = time.time()
train = clean_df(train, *columns_to_drop_eda)
test = clean_df(test, *columns_to_drop_eda)
end_time = time.time()
execution_time = end_time - start_time
print(f'Cleaning time: {round(execution_time)} seconds')
Cleaning time: 8 seconds
get_cleaning_metrics(raw_train, train, raw_test, test, type_of_test_set='Test')
79785 observations removed from the train set. 39607 observations removed from the Test set. 30 columns removed from the train set. 30 columns removed from the Test set. Train dataset observations reduced by 40.0%. Test dataset observations reduced by 39.7%. Train dataset columns reduced by 71.4%. Test dataset columns reduced by 71.4%. The training set has 119738 observations and 12 columns. The training set has 60155 observations and 12 columns.
fig = plot_imbalance(train)
plt.show()
start_time = time.time()
train_x, train_y = train.drop('taxable_income_amount', axis=1), train.taxable_income_amount
test_x, test_y = test.drop('taxable_income_amount', axis=1), test.taxable_income_amount
dr = DummyClassifier(strategy='most_frequent')
dr_start_time = time.time()
prediction, model_metrics = get_model_results(train_x, train_y, test_x, test_y, dr)
dr_end_time = time.time()
dr_execution_time = dr_end_time - dr_start_time
print(f'\nExecution time: {round(dr_execution_time)} seconds')
Time to train: 0 seconds Time to predict: 0 seconds Total time: 0 seconds Accuracy:) 0.9013 F-score: 0.0 Recall: 0.0 precision recall f1-score support 0 0.90 1.00 0.95 54217 1 0.00 0.00 0.00 5938 accuracy 0.90 60155 macro avg 0.45 0.50 0.47 60155 weighted avg 0.81 0.90 0.85 60155 [[54217 0] [ 5938 0]] Execution time: 0 seconds
title = 'Confusion Matrix - Dummy'
predicted_y = prediction
display_confusion_matrix(title=title, test_y=test_y, predicted_y=predicted_y)
train_x_dummies = pd.get_dummies(train_x)
test_x_dummies = pd.get_dummies(test_x)
train_scaler = StandardScaler().fit(train_x_dummies)
train_x_scaled = train_scaler.transform(train_x_dummies)
test_scaler = StandardScaler().fit(test_x_dummies)
test_x_scaled = test_scaler.transform(test_x_dummies)
train_x_split, valid_x, train_y_split, valid_y = train_test_split(train_x_scaled,
train_y,
test_size=0.33,
random_state=42)
train_x = train_x_split
train_y = train_y_split
test_x = test_x_scaled
fig = plot_imbalance(test)
plt.show()
test.taxable_income_amount.value_counts(normalize=True)
0 0.901288 1 0.098712 Name: taxable_income_amount, dtype: float64
model = SMOTE()
smote_x, smote_y = model.fit_resample(train_x, train_y)
smote_y = smote_y.astype('int')
adasyn = ADASYN(random_state=42)
adasyn_x, adasyn_y = adasyn.fit_resample(train_x, train_y)
dr = DummyClassifier(strategy='most_frequent')
dr_start_time = time.time()
prediction, model_metrics = get_model_results(train_x, train_y, train_x, train_y, dr)
dr_end_time = time.time()
dr_execution_time = dr_end_time - dr_start_time
print(f'\nExecution time: {round(dr_execution_time)} seconds')
Time to train: 0 seconds Time to predict: 0 seconds Total time: 0 seconds Accuracy:) 0.9004 F-score: 0.0 Recall: 0.0 precision recall f1-score support 0 0.90 1.00 0.95 72230 1 0.00 0.00 0.00 7994 accuracy 0.90 80224 macro avg 0.45 0.50 0.47 80224 weighted avg 0.81 0.90 0.85 80224 [[72230 0] [ 7994 0]] Execution time: 0 seconds
title = 'Confusion Matrix - Dummy'
predicted_y = prediction
display_confusion_matrix(title=title, test_y=train_y, predicted_y=predicted_y)
end_time = time.time()
execution_time = end_time - start_time
print(f'Preprocessing time: {round(execution_time)} seconds')
Preprocessing time: 10 seconds
train_start = time.time()
models = {}
dr = DummyClassifier(strategy='most_frequent')
models['Dummy classifier'] = dr
lr = LogisticRegression(random_state=seed) #class_weight={0:1, 1:15},
models['Logistic regression'] = lr
nn = MLPClassifier(max_iter=300)
models['Neural net'] = nn
dt = DecisionTreeClassifier(random_state=seed) #max_depth=5, class_weight={0: 1, 1: 12}
models['Decision tree'] = dt
rf = RandomForestClassifier(random_state=seed) #class_weight={0: 1, 1: 12},
#criterion='gini',
#max_depth=13,
#max_features='log2',
#min_samples_leaf=10,
#n_estimators=26,
#n_jobs=-1,
models['Random forest'] = rf
gnb = GaussianNB()
models['Naive Bayes'] = gnb
knn = KNeighborsClassifier(n_neighbors=1, weights='distance')
models['K-nearest neighbors'] = knn
svm = SVC(C=0.1, kernel='poly', random_state=seed)
models['Support vector machine'] = svm
qda = QuadraticDiscriminantAnalysis()
models['Quadratic Discriminant Analysis'] = qda
abc = AdaBoostClassifier(random_state=seed) #n_estimators=100,
models['Adaboost'] = qda
print('Models originally considered:')
for key in models.keys():
print('-', key)
Models originally considered: - Dummy classifier - Logistic regression - Neural net - Decision tree - Random forest - Naive Bayes - K-nearest neighbors - Support vector machine - Quadratic Discriminant Analysis - Adaboost
# Combine the classifiers in the ensemble model
ensemble_model = VotingClassifier(estimators=[('lr', lr),
('nn', nn),
('dt', dt),
('rf', rf),
('knn', knn),
('svm', svm),
('qda', qda),
('abc', abc)],
voting='hard')
modeling_start = time.time()
metric_columns = columns=['model', 'time_to_train', 'time_to_predict', 'total_time', 'accuracy', 'f1', 'recall']
metrics_train = metrics_valid = metrics_test = pd.DataFrame(columns=metric_columns)
train_start_time = time.time()
for model_name, model in models.items():
metrics_train = train_and_test_model(metrics_df=metrics_train,
train_x=train_x,
train_y=train_y,
test_x=train_x,
test_y=train_y,
model=model,
model_name=model_name)
DUMMY CLASSIFIER Time to train: 0 seconds Time to predict: 0 seconds Total time: 0 seconds Accuracy:) 0.9004 F-score: 0.0 Recall: 0.0 precision recall f1-score support 0 0.90 1.00 0.95 72230 1 0.00 0.00 0.00 7994 accuracy 0.90 80224 macro avg 0.45 0.50 0.47 80224 weighted avg 0.81 0.90 0.85 80224 [[72230 0] [ 7994 0]]
LOGISTIC REGRESSION Time to train: 1 seconds Time to predict: 0 seconds Total time: 1 seconds Accuracy:) 0.9114 F-score: 0.3774 Recall: 0.2695 precision recall f1-score support 0 0.92 0.98 0.95 72230 1 0.63 0.27 0.38 7994 accuracy 0.91 80224 macro avg 0.78 0.63 0.66 80224 weighted avg 0.89 0.91 0.90 80224 [[70962 1268] [ 5840 2154]]
NEURAL NET Time to train: 38 seconds Time to predict: 0 seconds Total time: 38 seconds Accuracy:) 0.9213 F-score: 0.4633 Recall: 0.3409 precision recall f1-score support 0 0.93 0.99 0.96 72230 1 0.72 0.34 0.46 7994 accuracy 0.92 80224 macro avg 0.83 0.66 0.71 80224 weighted avg 0.91 0.92 0.91 80224 [[71186 1044] [ 5269 2725]]
DECISION TREE Time to train: 0 seconds Time to predict: 0 seconds Total time: 0 seconds Accuracy:) 0.9866 F-score: 0.9289 Recall: 0.88 precision recall f1-score support 0 0.99 1.00 0.99 72230 1 0.98 0.88 0.93 7994 accuracy 0.99 80224 macro avg 0.99 0.94 0.96 80224 weighted avg 0.99 0.99 0.99 80224 [[72112 118] [ 959 7035]]
RANDOM FOREST Time to train: 4 seconds Time to predict: 1 seconds Total time: 5 seconds Accuracy:) 0.9865 F-score: 0.9303 Recall: 0.9049 precision recall f1-score support 0 0.99 1.00 0.99 72230 1 0.96 0.90 0.93 7994 accuracy 0.99 80224 macro avg 0.97 0.95 0.96 80224 weighted avg 0.99 0.99 0.99 80224 [[71906 324] [ 760 7234]]
NAIVE BAYES Time to train: 0 seconds Time to predict: 0 seconds Total time: 0 seconds Accuracy:) 0.4603 F-score: 0.2554 Recall: 0.9288 precision recall f1-score support 0 0.98 0.41 0.58 72230 1 0.15 0.93 0.26 7994 accuracy 0.46 80224 macro avg 0.56 0.67 0.42 80224 weighted avg 0.90 0.46 0.54 80224 [[29501 42729] [ 569 7425]]
K-NEAREST NEIGHBORS Time to train: 0 seconds Time to predict: 49 seconds Total time: 49 seconds Accuracy:) 0.9926 F-score: 0.9628 Recall: 0.9655 precision recall f1-score support 0 1.00 1.00 1.00 72230 1 0.96 0.97 0.96 7994 accuracy 0.99 80224 macro avg 0.98 0.98 0.98 80224 weighted avg 0.99 0.99 0.99 80224 [[71910 320] [ 276 7718]]
SUPPORT VECTOR MACHINE Time to train: 213 seconds Time to predict: 29 seconds Total time: 242 seconds Accuracy:) 0.9092 F-score: 0.2378 Recall: 0.1422 precision recall f1-score support 0 0.91 0.99 0.95 72230 1 0.73 0.14 0.24 7994 accuracy 0.91 80224 macro avg 0.82 0.57 0.59 80224 weighted avg 0.89 0.91 0.88 80224 [[71799 431] [ 6857 1137]]
QUADRATIC DISCRIMINANT ANALYSIS Time to train: 0 seconds Time to predict: 0 seconds Total time: 0 seconds Accuracy:) 0.3796 F-score: 0.2324 Recall: 0.9426 precision recall f1-score support 0 0.98 0.32 0.48 72230 1 0.13 0.94 0.23 7994 accuracy 0.38 80224 macro avg 0.56 0.63 0.36 80224 weighted avg 0.90 0.38 0.45 80224 [[22922 49308] [ 459 7535]]
ADABOOST Time to train: 0 seconds Time to predict: 0 seconds Total time: 0 seconds Accuracy:) 0.3796 F-score: 0.2324 Recall: 0.9426 precision recall f1-score support 0 0.98 0.32 0.48 72230 1 0.13 0.94 0.23 7994 accuracy 0.38 80224 macro avg 0.56 0.63 0.36 80224 weighted avg 0.90 0.38 0.45 80224 [[22922 49308] [ 459 7535]]
train_end_time = time.time()
train_execution_time = train_end_time - train_start_time
print(f'Train set execution time: {round(train_execution_time / 60)} minutes')
Train set execution time: 6 minutes
metrics_train.sort_values(by=['f1', 'time_to_predict'], ascending=False)
model | time_to_train | time_to_predict | total_time | accuracy | f1 | recall | |
---|---|---|---|---|---|---|---|
0 | K-nearest neighbors | 0 | 49 | 49 | 0.992571 | 0.962824 | 0.965474 |
0 | Random forest | 4 | 1 | 5 | 0.986488 | 0.930298 | 0.904929 |
0 | Decision tree | 0 | 0 | 0 | 0.986575 | 0.928897 | 0.880035 |
0 | Neural net | 38 | 0 | 38 | 0.921308 | 0.463317 | 0.340881 |
0 | Logistic regression | 1 | 0 | 1 | 0.911398 | 0.377365 | 0.269452 |
0 | Naive Bayes | 0 | 0 | 0 | 0.460286 | 0.255383 | 0.928822 |
0 | Support vector machine | 213 | 29 | 242 | 0.909154 | 0.237816 | 0.142232 |
0 | Quadratic Discriminant Analysis | 0 | 0 | 0 | 0.379649 | 0.232429 | 0.942582 |
0 | Adaboost | 0 | 0 | 0 | 0.379649 | 0.232429 | 0.942582 |
0 | Dummy classifier | 0 | 0 | 0 | 0.900354 | 0.0 | 0.0 |
We can already identify models that perform poorly on the very dataset they were trained upon. Let's get rid of those.
poor_models = metrics_train[metrics_train.f1 < 0.50].model
So our efforts will most probably focus on obtaining a good decision tree, and then building a random forest on top of it to mitigate overfitting risks.
for model in poor_models:
if model in models.keys():
del models[model]
print('Models finally considered for training:')
for key in models.keys():
print('-', key)
Models finally considered for training: - Decision tree - Random forest - K-nearest neighbors
We can now consider:
Based on this assessment above, let's give a chance to these algorithms on the validation set
valid_start_time = time.time()
for model_name, model in models.items():
metrics_valid = train_and_test_model(metrics_df=metrics_valid,
train_x=train_x,
train_y=train_y,
test_x=valid_x,
test_y=valid_y,
model=model,
model_name=model_name)
DECISION TREE Time to train: 0 seconds Time to predict: 0 seconds Total time: 0 seconds Accuracy:) 0.8769 F-score: 0.3845 Recall: 0.3871 precision recall f1-score support 0 0.93 0.93 0.93 35590 1 0.38 0.39 0.38 3924 accuracy 0.88 39514 macro avg 0.66 0.66 0.66 39514 weighted avg 0.88 0.88 0.88 39514 [[33131 2459] [ 2405 1519]]
RANDOM FOREST Time to train: 4 seconds Time to predict: 1 seconds Total time: 5 seconds Accuracy:) 0.9058 F-score: 0.4261 Recall: 0.3522 precision recall f1-score support 0 0.93 0.97 0.95 35590 1 0.54 0.35 0.43 3924 accuracy 0.91 39514 macro avg 0.74 0.66 0.69 39514 weighted avg 0.89 0.91 0.90 39514 [[34409 1181] [ 2542 1382]]
K-NEAREST NEIGHBORS Time to train: 0 seconds Time to predict: 24 seconds Total time: 24 seconds Accuracy:) 0.8737 F-score: 0.3794 Recall: 0.3886 precision recall f1-score support 0 0.93 0.93 0.93 35590 1 0.37 0.39 0.38 3924 accuracy 0.87 39514 macro avg 0.65 0.66 0.65 39514 weighted avg 0.88 0.87 0.88 39514 [[32999 2591] [ 2399 1525]]
valid_end_time = time.time()
valid_execution_time = valid_end_time - valid_start_time
print(f'Validation set execution time: {round(valid_execution_time / 60)} minutes')
Validation set execution time: 1 minutes
metrics_valid.sort_values(by=['f1', 'time_to_predict'], ascending=False)
model | time_to_train | time_to_predict | total_time | accuracy | f1 | recall | |
---|---|---|---|---|---|---|---|
0 | Random forest | 4 | 1 | 5 | 0.90578 | 0.426083 | 0.352192 |
0 | Decision tree | 0 | 0 | 0 | 0.876904 | 0.38446 | 0.387105 |
0 | K-nearest neighbors | 0 | 24 | 24 | 0.873716 | 0.379353 | 0.388634 |
The models we considered all lost about 0.5 points on their F1 score. That's a severe hit. Before we try to improve them individually, let's see if an additional manipulation on the dataset, to deal with its imbalance, would improve our results.
Also, KNN is now our worst performing model, and neural net our best performing one. Random Forest is close on its tail though, so it definitely looks like a combination of work on a decision tree and a random forest would be the way to go.
Out of curiosity, what are the most important features recommended by a Random Forest model?
# fitting the model
model = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
model.fit(train_x, train_y)
# plotting feature importances
features = train_x_dummies.columns
importances = model.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(10,15))
plt.title('Feature importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative importance')
plt.show()
smote_metrics_valid = pd.DataFrame(columns=metric_columns)
for model_name, model in models.items():
smote_metrics_valid = train_and_test_model(metrics_df=smote_metrics_valid,
train_x=smote_x,
train_y=smote_y,
test_x=valid_x,
test_y=valid_y,
model=model,
model_name=model_name)
DECISION TREE Time to train: 1 seconds Time to predict: 0 seconds Total time: 1 seconds Accuracy:) 0.8679 F-score: 0.3763 Recall: 0.4011 precision recall f1-score support 0 0.93 0.92 0.93 35590 1 0.35 0.40 0.38 3924 accuracy 0.87 39514 macro avg 0.64 0.66 0.65 39514 weighted avg 0.88 0.87 0.87 39514 [[32722 2868] [ 2350 1574]]
RANDOM FOREST Time to train: 9 seconds Time to predict: 1 seconds Total time: 10 seconds Accuracy:) 0.8837 F-score: 0.4455 Recall: 0.4704 precision recall f1-score support 0 0.94 0.93 0.94 35590 1 0.42 0.47 0.45 3924 accuracy 0.88 39514 macro avg 0.68 0.70 0.69 39514 weighted avg 0.89 0.88 0.89 39514 [[33073 2517] [ 2078 1846]]
K-NEAREST NEIGHBORS Time to train: 0 seconds Time to predict: 43 seconds Total time: 43 seconds Accuracy:) 0.8489 F-score: 0.3869 Recall: 0.4801 precision recall f1-score support 0 0.94 0.89 0.91 35590 1 0.32 0.48 0.39 3924 accuracy 0.85 39514 macro avg 0.63 0.68 0.65 39514 weighted avg 0.88 0.85 0.86 39514 [[31658 3932] [ 2040 1884]]
smote_metrics_valid.sort_values(by=['f1', 'time_to_predict'], ascending=False)
model | time_to_train | time_to_predict | total_time | accuracy | f1 | recall | |
---|---|---|---|---|---|---|---|
0 | Random forest | 9 | 1 | 10 | 0.883712 | 0.445517 | 0.470438 |
0 | K-nearest neighbors | 0 | 43 | 43 | 0.848864 | 0.386858 | 0.480122 |
0 | Decision tree | 1 | 0 | 1 | 0.867946 | 0.376285 | 0.401121 |
We don't see any improvement here. Let's try with ADASYN.
adasyn_metrics_valid = pd.DataFrame(columns=metric_columns)
for model_name, model in models.items():
adasyn_metrics_valid = train_and_test_model(metrics_df=adasyn_metrics_valid,
train_x=adasyn_x,
train_y=adasyn_y,
test_x=valid_x,
test_y=valid_y,
model=model,
model_name=model_name)
DECISION TREE Time to train: 1 seconds Time to predict: 0 seconds Total time: 1 seconds Accuracy:) 0.8698 F-score: 0.3778 Recall: 0.3981 precision recall f1-score support 0 0.93 0.92 0.93 35590 1 0.36 0.40 0.38 3924 accuracy 0.87 39514 macro avg 0.65 0.66 0.65 39514 weighted avg 0.88 0.87 0.87 39514 [[32807 2783] [ 2362 1562]]
RANDOM FOREST Time to train: 10 seconds Time to predict: 1 seconds Total time: 11 seconds Accuracy:) 0.8828 F-score: 0.4471 Recall: 0.4771 precision recall f1-score support 0 0.94 0.93 0.93 35590 1 0.42 0.48 0.45 3924 accuracy 0.88 39514 macro avg 0.68 0.70 0.69 39514 weighted avg 0.89 0.88 0.89 39514 [[33012 2578] [ 2052 1872]]
K-NEAREST NEIGHBORS Time to train: 0 seconds Time to predict: 44 seconds Total time: 44 seconds Accuracy:) 0.8455 F-score: 0.3852 Recall: 0.4875 precision recall f1-score support 0 0.94 0.88 0.91 35590 1 0.32 0.49 0.39 3924 accuracy 0.85 39514 macro avg 0.63 0.69 0.65 39514 weighted avg 0.88 0.85 0.86 39514 [[31495 4095] [ 2011 1913]]
Let's also have a look at the decision tree we built earlier:
fig = plt.figure(figsize=(15, 10))
plot_tree(dt,
feature_names=train_x_dummies.columns,
class_names=['0', '1'],
filled=True, impurity=True,
rounded=True,
max_depth=5
)
plt.show()
Now let's look at our models' performance:
adasyn_metrics_valid.sort_values(by=['f1', 'time_to_predict'], ascending=False)
model | time_to_train | time_to_predict | total_time | accuracy | f1 | recall | |
---|---|---|---|---|---|---|---|
0 | Random forest | 10 | 1 | 11 | 0.882826 | 0.447098 | 0.477064 |
0 | K-nearest neighbors | 0 | 44 | 44 | 0.845472 | 0.385219 | 0.487513 |
0 | Decision tree | 1 | 0 | 1 | 0.869793 | 0.377797 | 0.398063 |
No improvement either. We'd like to avoid PCA, since we'd lose in explainability, so let's try some simple hyparameter tuning first.
tuning_start = time.time()
rf = RandomForestClassifier()
param_grid = {'class_weight': ['balanced'],
'n_estimators': [10,100,500,700],
'max_features': ['auto','log2'],
'max_depth' : [10, 100],
'min_samples_leaf': [0.001, 0.01, 0.05],
'criterion': ['gini', 'entropy']
}
#scorer = {'F1': make_scorer(f1_score)}
grid_rf = GridSearchCV(rf, param_grid=param_grid, scoring='f1', cv=5, verbose=1)
grid_rf.fit(train_x, train_y)
print(grid_rf.best_params_)
prediction = grid_rf.predict(valid_x)
Fitting 5 folds for each of 96 candidates, totalling 480 fits {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 0.001, 'n_estimators': 500}
grid_rf.best_score_
0.44402874570737405
tuning_end = time.time()
tuning_execution_time = tuning_end - tuning_start
print()
ConfusionMatrixDisplay.from_predictions(valid_y, prediction, cmap='winter')
plt.title('Confusion Matrix - Random Forest with SMOTE (testing data)')
#plt.savefig(fname='011.png', format='png', dpi=400)
plt.show()
Let's keep our best performing model so far:
rf = RandomForestClassifier(class_weight='balanced',
criterion='gini',
max_depth=100, max_features='auto',
min_samples_leaf=0.001,
n_estimators=300)
final_metrics = pd.DataFrame(columns=metric_columns)
final_metrics = train_and_test_model(metrics_df=final_metrics,
train_x=train_x,
train_y=train_y,
test_x=test_x,
test_y=test_y,
model=rf,
model_name='Random Forest')
RANDOM FOREST Time to train: 6 seconds Time to predict: 1 seconds Total time: 7 seconds Accuracy:) 0.7968 F-score: 0.4424 Recall: 0.8166 precision recall f1-score support 0 0.98 0.79 0.88 54217 1 0.30 0.82 0.44 5938 accuracy 0.80 60155 macro avg 0.64 0.81 0.66 60155 weighted avg 0.91 0.80 0.83 60155 [[43083 11134] [ 1089 4849]]
The model we selected in the end provides a satisfying score, with a AUC of 0.81. However, the metric remains optimistic as the are still some important misclassification errors. The good news is that the performance of the model is on par with its performance on the test set, making it reliable for scalability purposes.
end_notebook = time.time()
print(f'Analysis took {round((end_notebook - start_notebook) / 60)} minutes to run.')
Analysis took 38 minutes to run.