import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
df = pd.read_table("german.data", sep=r"\s+",
                   names=['existingchecking', 'duration', 'history', 'purpose', 'amount', 'savings', 'employment', 'installmentrate', 'statussex', 'debtors', 'residencenum', 'property', 'age', 'installmentplan', 'housing', 'creditsatbanks', 'job', 'liablepeople', 'telephone', 'foreign', 'classification'])
print(df.head(3))
df.info()
No missing values are present in the dataset:
df.isna().sum().head(5)
df.shape
df.describe()
df.classification.value_counts()
Imbalanced data: the classification column consists of 700 Good and 300 Bad observations.
We can apply SMOTE to synthesize additional Bad observations, since a model biased towards the Good class might blur the prediction of the Bad class.
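As a side note (not part of the original pipeline), class weighting is a lighter-weight alternative to oversampling. A minimal sketch, assuming the classification column still holds the raw 1/2 codes at this point, that quantifies the imbalance via scikit-learn's 'balanced' weights:
from sklearn.utils.class_weight import compute_class_weight

# Sketch only: derive 'balanced' class weights as an alternative to SMOTE.
# With 700 Good (1) and 300 Bad (2) this yields roughly {1: 0.71, 2: 1.67},
# i.e. Bad observations would count about 2.3x as much during training.
classes = np.unique(df['classification'])
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df['classification'])
print(dict(zip(classes, weights)))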
Numerical columns: duration, amount, installmentrate, residencenum, age, creditsatbanks, liablepeople, classification
Qualitative columns: existingchecking, history, purpose, savings, employment, statussex, debtors, property, installmentplan, housing, job, telephone, foreign, classification
num_val = ["duration","amount","installmentrate","residencenum","age","creditsatbanks","liablepeople"]
df[num_val] = StandardScaler().fit_transform(df[num_val])
df
print(df.shape)
The classification column is not scaled; instead it is mapped to a binary label:
1 (Good) is mapped to 1
2 (Bad) is mapped to 0
df['classification'] = df['classification'].replace({1: 1, 2: 0})
df.classification.head(3)
qual_val = ['existingchecking', 'history', 'purpose', 'savings', 'employment', 'statussex', 'debtors', 'property', 'installmentplan', 'housing', 'job', 'telephone', 'foreign']
le = LabelEncoder()
# apply() calls fit_transform once per column, so each qualitative column gets its own integer encoding
df[qual_val] = df[qual_val].apply(lambda col: le.fit_transform(col))
print(df.head(3))
print(df.shape)
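LabelEncoder assigns arbitrary integer codes, which implicitly imposes an ordering on the categories. A minimal sketch of the one-hot alternative (not used in the remainder of this notebook; shown here on the already-encoded frame only to illustrate the call, it would normally be applied to the raw string categories):
# Sketch only: one-hot encode the qualitative columns instead of label-encoding them (widens the feature set).
df_onehot = pd.get_dummies(df, columns=qual_val, drop_first=True)
print(df_onehot.shape)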
Splitting Data 80/20:
X = df.drop('classification', axis=1)
y = df['classification']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=1)
Training set class counts: 241 Bad, 559 Good
np.unique(y_train, return_counts=True)
print("X: {}, y:{}, X_train:{}, y_train:{}, X_test: {}, y_test: {} ".format(X.shape,y.shape,X_train.shape,y_train.shape,X_test.shape,y_test.shape))
Data augmentation for the minority class, also referred to as the Synthetic Minority Oversampling Technique (SMOTE):
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy='auto', random_state=1)  # ratio=/fit_sample() were renamed in newer imbalanced-learn
X_train, y_train = sm.fit_resample(X_train, y_train)
print("X: {}, y:{}, X_train:{}, y_train:{}, X_test: {}, y_test: {} ".format(X.shape,y.shape,X_train.shape,y_train.shape,X_test.shape,y_test.shape))
After SMOTE: 559 Bad, 559 Good
np.unique(y_train, return_counts=True)
import os
from xgboost import XGBClassifier
from xgboost import plot_importance
import xgboost as xgb
from matplotlib import pyplot
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.tree import export_graphviz
selector = SelectFromModel(estimator=xgb.XGBClassifier()).fit(X_train, y_train)
X_train_XG = selector.transform(X_train)
X_test_XG = selector.transform(X_test)
print("Selected Columns:",X.loc[:,selector.get_support()].columns.tolist(),"\n")
#Train the XGboost Model for Classification
model1 = xgb.XGBClassifier()
model2 = xgb.XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1, subsample=0.5)
train_model1 = model1.fit(X_train_XG, y_train)
train_model2 = model2.fit(X_train_XG, y_train)
pred1 = train_model1.predict(X_test_XG)
pred2 = train_model2.predict(X_test_XG)
labels = ['Bad', 'Good']
print("Model 1: XGBClassifier (with default parameters)")
print(classification_report(y_test, pred1, target_names=labels))
print("Accuracy for Model 1: %.2f" % (accuracy_score(y_test, pred1)* 100))
logit_roc_auc = roc_auc_score(y_test, train_model1.predict(X_test_XG))
fpr, tpr, thresholds = roc_curve(y_test, train_model1.predict_proba(X_test_XG)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Model 1: XGBClassifier (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC_model1')
plt.show()
print("Model 2: XGBClassifier (with parameters)")
print(classification_report(y_test, pred2, target_names=labels))
print("Accuracy for Model 2: %.2f" % (accuracy_score(y_test, pred2) * 100))
logit_roc_auc = roc_auc_score(y_test, train_model2.predict(X_test_XG))
fpr, tpr, thresholds = roc_curve(y_test, train_model2.predict_proba(X_test_XG)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Model 2: XGBClassifier (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC_model2')
plt.show()
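The same ROC-plotting boilerplate recurs for every model below; a minimal sketch of a reusable helper (plot_roc is a hypothetical name, and it computes the AUC from predicted probabilities rather than the hard predictions used in the cells above):
# Sketch only: reusable ROC plot, equivalent to the blocks above.
def plot_roc(model, X_eval, y_eval, label, filename=None):
    proba = model.predict_proba(X_eval)[:, 1]
    auc = roc_auc_score(y_eval, proba)
    fpr, tpr, _ = roc_curve(y_eval, proba)
    plt.figure()
    plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (label, auc))
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    if filename:
        plt.savefig(filename)
    plt.show()

# Example: plot_roc(train_model2, X_test_XG, y_test, 'Model 2: XGBClassifier', 'ROC_model2')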
selector = SelectFromModel(estimator=LogisticRegression()).fit(X_train, y_train)
X_train_log = selector.transform(X_train)
X_test_log = selector.transform(X_test)
print("Selected Columns:",X.loc[:,selector.get_support()].columns.tolist(),"\n")
print(selector.estimator_.coef_)
logreg = LogisticRegression()
logreg.fit(X_train_log, y_train)
pred3 = logreg.predict(X_test_log)
print("Model 3: Logistic Regression")
print(classification_report(y_test, pred3, target_names=labels))
print("Accuracy for Model 3: %.2f" % (accuracy_score(y_test, pred3) * 100))
logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test_log))
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test_log)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC_model3')
plt.show()
rf = RandomForestClassifier(criterion='gini',
                            n_estimators=700,
                            min_samples_split=10,
                            min_samples_leaf=1,
                            max_features='sqrt',  # 'auto' was removed in newer scikit-learn; 'sqrt' is the equivalent
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1)
rf.fit(X_train, y_train)
pred4 = rf.predict(X_test)
estimator = rf.estimators_[5]
print("Model 4: Random Forest Classifier")
print(classification_report(y_test, pred4, target_names=labels))
print("Accuracy for Model 4: %.2f" % (accuracy_score(y_test, pred4) * 100))
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X_train_XG, y_train)
pred5 = xgb_model.predict(X_test_XG)
print("Model 5: XGBoost(binary:logistic)")
print(classification_report(y_test, pred5, target_names=labels))
print("Accuracy for Model 5: %.2f" % (accuracy_score(y_test, pred5) * 100))
logit_roc_auc = roc_auc_score(y_test, xgb_model.predict(X_test_XG))
fpr, tpr, thresholds = roc_curve(y_test, xgb_model.predict_proba(X_test_XG)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='XGBoost(binary:logistic) (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
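cross_val_score and GridSearchCV are imported above but never used; a minimal sketch of how they could validate and tune the XGBoost model (the grid values are illustrative assumptions, and note that cross-validating on SMOTE-resampled data can be optimistic because synthetic samples may leak across folds):
# Sketch only: 5-fold cross-validated accuracy on the resampled training set.
cv_scores = cross_val_score(xgb.XGBClassifier(objective="binary:logistic", random_state=42),
                            X_train_XG, y_train, cv=5, scoring='accuracy')
print("CV accuracy: %.2f (+/- %.2f)" % (cv_scores.mean() * 100, cv_scores.std() * 100))

# Sketch only: small grid search over a few XGBoost hyperparameters (illustrative values).
param_grid = {
    'max_depth': [3, 5, 8],
    'learning_rate': [0.05, 0.1, 0.3],
    'subsample': [0.5, 0.8, 1.0],
}
grid = GridSearchCV(xgb.XGBClassifier(objective="binary:logistic", random_state=42),
                    param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train_XG, y_train)
print("Best parameters:", grid.best_params_)
print("Best CV ROC AUC: %.3f" % grid.best_score_)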