German Credit Dataset

Anılcan Atik

Library

In [183]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize, StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
In [184]:
# Column names for the 21 whitespace-separated fields of `german.data`;
# the file has no header row, so they are supplied explicitly.
column_names = [
    'existingchecking', 'duration', 'history', 'purpose', 'amount',
    'savings', 'employment', 'installmentrate', 'statussex', 'debtors',
    'residencenum', 'property', 'age', 'installmentplan', 'housing',
    'creditsatbanks', 'job', 'liablepeople', 'telephone', 'foreign',
    'classification',
]
# Raw string: "\s" is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning; the regex itself is unchanged.
df = pd.read_table("german.data", sep=r"\s+", names=column_names)
print(df.head(3))
  existingchecking  duration history purpose  amount savings employment  \
0              A11         6     A34     A43    1169     A65        A75   
1              A12        48     A32     A43    5951     A61        A73   
2              A14        12     A34     A46    2096     A61        A74   

   installmentrate statussex debtors  ...  property age  installmentplan  \
0                4       A93    A101  ...      A121  67             A143   
1                2       A92    A101  ...      A121  22             A143   
2                2       A93    A101  ...      A121  49             A143   

  housing creditsatbanks   job liablepeople  telephone foreign classification  
0    A152              2  A173            1       A192    A201              1  
1    A152              1  A173            1       A191    A201              2  
2    A152              1  A172            2       A191    A201              1  

[3 rows x 21 columns]
In [185]:
# Column dtypes and non-null counts for the loaded frame.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   existingchecking  1000 non-null   object
 1   duration          1000 non-null   int64 
 2   history           1000 non-null   object
 3   purpose           1000 non-null   object
 4   amount            1000 non-null   int64 
 5   savings           1000 non-null   object
 6   employment        1000 non-null   object
 7   installmentrate   1000 non-null   int64 
 8   statussex         1000 non-null   object
 9   debtors           1000 non-null   object
 10  residencenum      1000 non-null   int64 
 11  property          1000 non-null   object
 12  age               1000 non-null   int64 
 13  installmentplan   1000 non-null   object
 14  housing           1000 non-null   object
 15  creditsatbanks    1000 non-null   int64 
 16  job               1000 non-null   object
 17  liablepeople      1000 non-null   int64 
 18  telephone         1000 non-null   object
 19  foreign           1000 non-null   object
 20  classification    1000 non-null   int64 
dtypes: int64(8), object(13)
memory usage: 164.2+ KB

Checking for Missing Values:

No Missing Value Present in the Dataset

In [186]:
# Per-column count of missing values; .head(5) limits the display to the first five columns.
df.isna().sum().head(5)
Out[186]:
existingchecking    0
duration            0
history             0
purpose             0
amount              0
dtype: int64
In [187]:
# (rows, columns) of the loaded frame.
df.shape
Out[187]:
(1000, 21)
In [188]:
# Summary statistics for the numeric columns (the target `classification` is included here).
df.describe()
Out[188]:
duration amount installmentrate residencenum age creditsatbanks liablepeople classification
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 20.903000 3271.258000 2.973000 2.845000 35.546000 1.407000 1.155000 1.300000
std 12.058814 2822.736876 1.118715 1.103718 11.375469 0.577654 0.362086 0.458487
min 4.000000 250.000000 1.000000 1.000000 19.000000 1.000000 1.000000 1.000000
25% 12.000000 1365.500000 2.000000 2.000000 27.000000 1.000000 1.000000 1.000000
50% 18.000000 2319.500000 3.000000 3.000000 33.000000 1.000000 1.000000 1.000000
75% 24.000000 3972.250000 4.000000 4.000000 42.000000 2.000000 1.000000 2.000000
max 72.000000 18424.000000 4.000000 4.000000 75.000000 4.000000 2.000000 2.000000
In [189]:
# Class distribution of the target column.
df["classification"].value_counts()
Out[189]:
1    700
2    300
Name: classification, dtype: int64

Preprocessing

Imbalanced Data: Classification consists of 700 Good, 300 Bad

We can apply SMOTE to synthesize additional Bad examples, because a model biased towards the majority Good class may predict the Bad class poorly.

Numerical Columns = "duration","amount","installmentrate","residencenum","age","creditsatbanks","liablepeople", "classification"

Qualitative Columns = 'existingchecking', 'history', 'purpose', 'savings', 'employment', 'statussex', 'debtors', 'property', 'installmentplan', 'housing', 'job', 'telephone', 'foreign','classification'

Standardization for Numerical Features:

In [190]:
# Numeric feature columns to be standardized (the target is deliberately left out).
num_val = [
    "duration",
    "amount",
    "installmentrate",
    "residencenum",
    "age",
    "creditsatbanks",
    "liablepeople",
]

StandardScaler:

In [191]:
# Standardize the numeric feature columns in place (zero mean, unit variance).
# NOTE(review): the scaler is fit on the full dataset, before the train/test split
# performed later in the notebook, so test rows contribute to the mean/std.
# Consider fitting on the training split only.
df[num_val] = StandardScaler().fit_transform(df[num_val])
In [192]:
# Display the frame after scaling; the numeric feature columns now hold standardized values.
df
Out[192]:
existingchecking duration history purpose amount savings employment installmentrate statussex debtors ... property age installmentplan housing creditsatbanks job liablepeople telephone foreign classification
0 A11 -1.236478 A34 A43 -0.745131 A65 A75 0.918477 A93 A101 ... A121 2.766456 A143 A152 1.027079 A173 -0.428290 A192 A201 1
1 A12 2.248194 A32 A43 0.949817 A61 A73 -0.870183 A92 A101 ... A121 -1.191404 A143 A152 -0.704926 A173 -0.428290 A191 A201 2
2 A14 -0.738668 A34 A46 -0.416562 A61 A74 -0.870183 A93 A101 ... A121 1.183312 A143 A152 -0.704926 A172 2.334869 A191 A201 1
3 A11 1.750384 A32 A42 1.634247 A61 A74 -0.870183 A93 A103 ... A122 0.831502 A143 A153 -0.704926 A173 2.334869 A191 A201 1
4 A11 0.256953 A33 A40 0.566664 A61 A73 0.024147 A93 A101 ... A124 1.535122 A143 A153 1.027079 A173 2.334869 A191 A201 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 A14 -0.738668 A32 A42 -0.544162 A61 A74 0.024147 A92 A101 ... A121 -0.399832 A143 A152 -0.704926 A172 -0.428290 A191 A201 1
996 A11 0.754763 A32 A41 0.207612 A61 A73 0.918477 A91 A101 ... A122 0.391740 A143 A152 -0.704926 A174 -0.428290 A192 A201 1
997 A14 -0.738668 A32 A43 -0.874503 A61 A75 0.918477 A93 A101 ... A123 0.215835 A143 A152 -0.704926 A173 -0.428290 A191 A201 1
998 A11 1.999289 A32 A43 -0.505528 A61 A73 0.918477 A93 A101 ... A124 -1.103451 A143 A153 -0.704926 A173 -0.428290 A192 A201 2
999 A12 1.999289 A34 A41 0.462457 A62 A71 0.024147 A93 A101 ... A123 -0.751642 A143 A152 -0.704926 A173 -0.428290 A191 A201 1

1000 rows × 21 columns

In [193]:
# Scaling replaces values only; the frame keeps its (rows, columns) shape.
print(df.shape)
(1000, 21)

Encoding Categorical Features:

Binary Encoding of the Classification Label:

Classification is not included in the feature encoding, as it is the target and will be mapped to a binary 0/1 label.
1: (Good) will remain 1
2: (Bad) will be transformed into 0.

In [194]:
# Recode the target: 1 (Good) stays 1, 2 (Bad) becomes 0.
# Assign the result back to the column instead of calling replace(inplace=True)
# on the attribute-accessed Series: that form is a chained in-place mutation,
# which newer pandas warns about and which does not update `df` under
# Copy-on-Write. The mapping also leaves already-recoded values untouched,
# so re-running the cell is harmless.
df["classification"] = df["classification"].replace({1: 1, 2: 0})
df["classification"].head(3)
Out[194]:
0    1
1    0
2    1
Name: classification, dtype: int64

Encoding the Categorical Features:

In [195]:
# Categorical feature columns (string codes such as "A11") to be integer-encoded.
qual_val = [
    'existingchecking',
    'history',
    'purpose',
    'savings',
    'employment',
    'statussex',
    'debtors',
    'property',
    'installmentplan',
    'housing',
    'job',
    'telephone',
    'foreign',
]
In [196]:
# Replace each categorical code string with an integer label, one column at a time.
# The single encoder is refit for every column, so afterwards `le` only holds
# the classes of the last column in `qual_val`.
le = LabelEncoder()
for column_name in qual_val:
    df[column_name] = le.fit_transform(df[column_name])
print(df.head(3))
print(df.shape)
   existingchecking  duration  history  purpose    amount  savings  \
0                 0 -1.236478        4        4 -0.745131        4   
1                 1  2.248194        2        4  0.949817        0   
2                 3 -0.738668        4        7 -0.416562        0   

   employment  installmentrate  statussex  debtors  ...  property       age  \
0           4         0.918477          2        0  ...         0  2.766456   
1           2        -0.870183          1        0  ...         0 -1.191404   
2           3        -0.870183          2        0  ...         0  1.183312   

   installmentplan  housing  creditsatbanks  job  liablepeople  telephone  \
0                2        1        1.027079    2     -0.428290          1   
1                2        1       -0.704926    2     -0.428290          0   
2                2        1       -0.704926    1      2.334869          0   

   foreign  classification  
0        0               1  
1        0               0  
2        0               1  

[3 rows x 21 columns]
(1000, 21)

Splitting Dataset:

Splitting Data 80/20:

In [197]:
# Separate the target from the predictors, then hold out 20% of the rows for testing.
y = df['classification']
X = df.drop(columns=['classification'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
Checking the Class Balance of the Training Set (before SMOTE):

241 Bad - 559 Good

In [198]:
# Distinct labels in the training target and how many rows carry each (0 = Bad, 1 = Good).
np.unique(y_train, return_counts=True)
Out[198]:
(array([0, 1], dtype=int64), array([241, 559], dtype=int64))
Checking the validity of train/test splitting:
In [199]:
# Report the shapes of the full data and of each split in one line.
shape_report = "X: {}, y:{}, X_train:{}, y_train:{}, X_test: {}, y_test: {} "
shapes = [part.shape for part in (X, y, X_train, y_train, X_test, y_test)]
print(shape_report.format(*shapes))
X: (1000, 20), y:(1000,), X_train:(800, 20), y_train:(800,), X_test: (200, 20), y_test: (200,) 

Applying SMOTE to create Balanced Data:

SMOTE (Synthetic Minority Oversampling Technique) is a data-augmentation method that generates synthetic samples of the minority class.

In [200]:
# SMOTE was used without being imported anywhere in the notebook, which fails
# on a fresh kernel; import it next to its only use.
from imblearn.over_sampling import SMOTE

# `sampling_strategy` / `fit_resample` are the current names of the former
# `ratio` / `fit_sample`, which newer imbalanced-learn releases no longer accept.
# A fixed random_state makes the synthetic samples (and every downstream
# metric) reproducible across runs.
sm = SMOTE(sampling_strategy='auto', random_state=1)
# Oversample the training split only; the test split is left untouched.
X_train, y_train = sm.fit_resample(X_train, y_train)
C:\Users\Joahn\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:86: FutureWarning: Function safe_indexing is deprecated; safe_indexing is deprecated in version 0.22 and will be removed in version 0.24.
  warnings.warn(msg, category=FutureWarning)
Checking the shapes after oversampling (only the training set grows):
In [201]:
# Report the shapes again now that the training split has been oversampled.
shape_report = "X: {}, y:{}, X_train:{}, y_train:{}, X_test: {}, y_test: {} "
shapes = [part.shape for part in (X, y, X_train, y_train, X_test, y_test)]
print(shape_report.format(*shapes))
X: (1000, 20), y:(1000,), X_train:(1118, 20), y_train:(1118,), X_test: (200, 20), y_test: (200,) 
Checking the Balanced Dataset:

559 Bad - 559 Good

In [202]:
# Label counts in the training target after oversampling (0 = Bad, 1 = Good).
np.unique(y_train, return_counts=True)
Out[202]:
(array([0, 1], dtype=int64), array([559, 559], dtype=int64))

Creating Models:

Binary Classification Problem:

Model Library:

In [203]:
import os
from xgboost import XGBClassifier
from xgboost import plot_importance
import xgboost as xgb
from matplotlib import pyplot
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.tree import export_graphviz

Model 1 :XGBoost (gbtree) Default Parameters:

Feature Selection and Training:

In [204]:
# Keep only the features that a default XGBoost model ranks above
# SelectFromModel's default threshold, then project both splits onto them.
selector = SelectFromModel(estimator=xgb.XGBClassifier())
selector.fit(X_train, y_train)
X_train_XG, X_test_XG = selector.transform(X_train), selector.transform(X_test)

selected_mask = selector.get_support()
print("Selected Columns:", X.columns[selected_mask].tolist(), "\n")

# Two boosted-tree classifiers on the reduced feature set:
# library defaults versus hand-picked hyperparameters.
model1 = xgb.XGBClassifier()
model2 = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.5,
)

# The values returned by fit() are what the later cells use for prediction.
train_model1 = model1.fit(X_train_XG, y_train)
train_model2 = model2.fit(X_train_XG, y_train)
Selected Columns: ['existingchecking', 'duration', 'history', 'savings', 'installmentrate', 'statussex', 'installmentplan', 'telephone'] 

Testing Model:

In [205]:
# Hard class predictions on the held-out split from both XGBoost models.
pred1, pred2 = (
    fitted.predict(X_test_XG) for fitted in (train_model1, train_model2)
)

Model Evaluation:

In [206]:
# Display names for the encoded target, in label order: 0 -> 'Bad', 1 -> 'Good'.
labels = ['Bad', 'Good']
In [207]:
# Per-class precision/recall/F1 plus overall accuracy (as a percentage) for Model 1.
model1_accuracy_pct = accuracy_score(y_test, pred1) * 100
model1_report = classification_report(y_test, pred1, target_names=labels)
print("Model 1: XGBClassifier (with default parameters)")
print(model1_report)
print("Accuracy for Model 1: %.2f" % (model1_accuracy_pct,))
Model 1: XGBClassifier (with default parameters)
              precision    recall  f1-score   support

         Bad       0.63      0.53      0.57        59
        Good       0.81      0.87      0.84       141

    accuracy                           0.77       200
   macro avg       0.72      0.70      0.71       200
weighted avg       0.76      0.77      0.76       200

Accuracy for Model 1: 77.00
In [208]:
# ROC curve and AUC for Model 1 on the held-out split.
# The AUC must be computed from the same positive-class scores that draw the
# curve; feeding hard 0/1 predictions to roc_auc_score reports the area of a
# single operating point, not of the plotted curve.
y_score = train_model1.predict_proba(X_test_XG)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='Model 1: XGBClassifier (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Own file name: the logistic-regression plot also saves to 'Log_ROC' and
# would otherwise overwrite this figure.
plt.savefig('XGB_default_ROC')
plt.show()

Model 2: XGBClassifier gbTree With Parameters

In [209]:
# Per-class precision/recall/F1 plus overall accuracy (as a percentage) for Model 2.
model2_accuracy_pct = accuracy_score(y_test, pred2) * 100
model2_report = classification_report(y_test, pred2, target_names=labels)
print("Model 2: XGBClassifier (with parameters)")
print(model2_report)
print("Accuracy for Model 2: %.2f" % (model2_accuracy_pct,))
Model 2: XGBClassifier (with parameters)
              precision    recall  f1-score   support

         Bad       0.57      0.44      0.50        59
        Good       0.79      0.86      0.82       141

    accuracy                           0.73       200
   macro avg       0.68      0.65      0.66       200
weighted avg       0.72      0.73      0.72       200

Accuracy for Model 2: 73.50
In [210]:
# ROC curve and AUC for Model 2 on the held-out split.
# The AUC is computed from the positive-class probabilities so that it matches
# the plotted curve (hard 0/1 predictions would misreport the area).
y_score = train_model2.predict_proba(X_test_XG)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
# The legend previously said "Model 1" although this curve belongs to Model 2.
plt.plot(fpr, tpr, label='Model 2: XGBClassifier (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Own file name: the logistic-regression plot also saves to 'Log_ROC' and
# would otherwise overwrite this figure.
plt.savefig('XGB_tuned_ROC')
plt.show()

Model 3: Logistic Regression:

Feature Selection and Training:

In [211]:
# Feature selection driven by a default logistic regression: keep the columns
# that pass SelectFromModel's default threshold.
# Note that this rebinds `selector`, which earlier held the XGBoost-based selector.
selector = SelectFromModel(estimator=LogisticRegression())
selector.fit(X_train, y_train)
X_train_log = selector.transform(X_train)
X_test_log = selector.transform(X_test)
selected_mask = selector.get_support()
print("Selected Columns:", X.columns[selected_mask].tolist(), "\n")
# Coefficients of the fitted selection model, one per input column.
print(selector.estimator_.coef_)
Selected Columns: ['existingchecking', 'history', 'amount', 'savings', 'installmentrate', 'telephone', 'foreign'] 

[[ 0.53602362 -0.28434835  0.48173625  0.01097887 -0.35975949  0.30054941
   0.15146281 -0.52741737  0.25310772  0.28527416 -0.02596656 -0.16146728
   0.19540711  0.22014108  0.1777848  -0.07283421  0.03242496 -0.17741165
   0.53308671  1.21383814]]
In [212]:
# Fit a default logistic regression on the selected columns and predict the held-out labels.
logreg = LogisticRegression().fit(X_train_log, y_train)
pred3 = logreg.predict(X_test_log)

Model Evaluation:

In [213]:
# Per-class precision/recall/F1 plus overall accuracy (as a percentage) for Model 3.
model3_accuracy_pct = accuracy_score(y_test, pred3) * 100
model3_report = classification_report(y_test, pred3, target_names=labels)
print("Model 3: Logistic Regression")
print(model3_report)
print("Accuracy for Model 3: %.2f" % (model3_accuracy_pct,))
Model 3: Logistic Regression
              precision    recall  f1-score   support

         Bad       0.48      0.66      0.56        59
        Good       0.83      0.70      0.76       141

    accuracy                           0.69       200
   macro avg       0.66      0.68      0.66       200
weighted avg       0.73      0.69      0.70       200

Accuracy for Model 3: 69.00
In [214]:
# ROC curve and AUC for the logistic regression on the held-out split.
# The AUC is computed from the positive-class probabilities so that it matches
# the plotted curve; hard 0/1 predictions would report the area of a single
# operating point instead.
y_score = logreg.predict_proba(X_test_log)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

Model 4 :Random Forest Classifier:

In [215]:
# Random forest trained on all feature columns (no feature-selection step).
# max_features: 'auto' meant sqrt(n_features) for classifiers, but it was
# deprecated in scikit-learn 1.1 and removed in 1.3; 'sqrt' is the equivalent
# setting and is accepted by both old and new releases.
rf = RandomForestClassifier(criterion='gini',
                            n_estimators=700,
                            min_samples_split=10,
                            min_samples_leaf=1,
                            max_features='sqrt',
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1)
rf.fit(X_train, y_train)
pred4 = rf.predict(X_test)
# One fitted tree from the ensemble; it is not used in the cells shown here
# (presumably kept for export_graphviz, which is imported; TODO confirm).
estimator = rf.estimators_[5]

Model Evaluation:

In [216]:
# Per-class precision/recall/F1 plus overall accuracy (as a percentage) for Model 4.
model4_accuracy_pct = accuracy_score(y_test, pred4) * 100
model4_report = classification_report(y_test, pred4, target_names=labels)
print("Model 4: Random Forest Classifier")
print(model4_report)
print("Accuracy for Model 4: %.2f" % (model4_accuracy_pct,))
Model 4: Random Forest Classifier
              precision    recall  f1-score   support

         Bad       0.66      0.46      0.54        59
        Good       0.80      0.90      0.85       141

    accuracy                           0.77       200
   macro avg       0.73      0.68      0.69       200
weighted avg       0.76      0.77      0.76       200

Accuracy for Model 4: 77.00

Model 5: XGBoost (binary:logistic)

In [217]:
# XGBoost with an explicit binary-logistic objective and fixed seed, trained on
# the XGBoost-selected feature subset; then predict the held-out labels.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
).fit(X_train_XG, y_train)

pred5 = xgb_model.predict(X_test_XG)
In [218]:
# Per-class precision/recall/F1 plus overall accuracy (as a percentage) for Model 5.
model5_accuracy_pct = accuracy_score(y_test, pred5) * 100
model5_report = classification_report(y_test, pred5, target_names=labels)
print("Model 5: XGBoost(binary:logistic)")
print(model5_report)
print("Accuracy for Model 5: %.2f" % (model5_accuracy_pct,))
Model 5: XGBoost(binary:logistic)
              precision    recall  f1-score   support

         Bad       0.63      0.53      0.57        59
        Good       0.81      0.87      0.84       141

    accuracy                           0.77       200
   macro avg       0.72      0.70      0.71       200
weighted avg       0.76      0.77      0.76       200

Accuracy for Model 5: 77.00
In [219]:
# ROC curve and AUC for Model 5 on the held-out split.
# The AUC is computed from the positive-class probabilities so that it matches
# the plotted curve; hard 0/1 predictions would report the area of a single
# operating point instead.
y_score = xgb_model.predict_proba(X_test_XG)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='XGBoost(binary:logistic) (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
In [ ]:
 
In [ ]: