Instructions
Requirements and Specifications
- Predict whether a liability customer will buy a loan or not.
- Identify which variables are most significant for making the prediction.
- Determine which segment of customers should be targeted more.
Source Code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing, tree
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
### Read Data
df = pd.read_csv('My_Bank.csv')
df.head(10)
print(f"This dataset has {len(df)} rows")
### Show the number of NaN values in each column
df.isnull().sum()
### Remove non-useful columns
df = df.drop(columns = ['CUST_ID'])
### Convert ACC_OP_DATE to Numeric
df['ACC_OP_DATE'] = pd.to_datetime(df['ACC_OP_DATE']).dt.strftime("%m%d%Y").astype(int)
df.head(5)
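### Note on the date encoding (sketch)
The conversion above turns each date into an MMDDYYYY integer, which is simple but not monotonic in time. A possible alternative, shown only as a sketch and not used in the rest of the notebook, encodes each date as the number of days since the earliest account-opening date:
# Sketch of an alternative encoding; it would replace the conversion above
# and must be applied to the raw ACC_OP_DATE column, before the int cast
dates = pd.to_datetime(df['ACC_OP_DATE'])
df['ACC_OP_DATE'] = (dates - dates.min()).dt.days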
### Categorize object columns
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    values = df[col].unique()
    values_dict = {value: code for code, value in enumerate(values)}
    df[col] = df[col].map(values_dict)
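### Equivalent encoding with pandas factorize (sketch)
For reference, the same kind of integer encoding can be obtained in a single call with pandas' factorize; this is only an equivalent sketch, not a change to the pipeline above:
# Equivalent integer encoding of the object columns
for col in df.select_dtypes(include=['object']).columns:
    df[col], _ = pd.factorize(df[col])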
### Normalize data
df_norm = (df-df.min())/(df.max()-df.min())
df_norm.head()
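### Leakage-free scaling (sketch)
Normalizing the whole dataframe before the train/test split lets the test rows influence the per-column min and max. The effect is usually small for min-max scaling, but a leakage-free variant, sketched here with scikit-learn's MinMaxScaler, fits the scaler on the training rows only; the rest of the notebook keeps the simpler whole-frame normalization:
# Sketch: fit the scaler on the training split only
from sklearn.preprocessing import MinMaxScaler

X_raw = df.drop(columns=['TARGET'])
Y_raw = df['TARGET']
X_tr, X_te, Y_tr, Y_te = train_test_split(X_raw, Y_raw, test_size=0.3, random_state=42)
scaler = MinMaxScaler().fit(X_tr)      # min/max computed from the training rows only
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)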
### Extract target column
Y = df_norm['TARGET']
X = df_norm.drop(columns=['TARGET'])
X.head()
print(f"There are {len(X.columns)} variables and {len(X)} records")
### Display correlation map to see the relation between variables
f = plt.figure(figsize = (10,10))
plt.matshow(df_norm.corr(), fignum = f.number)
plt.colorbar()
plt.xticks(range(len(df_norm.columns)), df_norm.columns, rotation=90);
plt.yticks(range(len(df_norm.columns)), df_norm.columns);
plt.show()
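### Alternative rendering with seaborn (sketch)
seaborn is imported at the top but never used; the same correlation matrix can also be drawn as a heatmap with it, which is often easier to read. A minimal equivalent sketch:
# Alternative rendering of the same correlation matrix using seaborn
plt.figure(figsize=(12, 10))
sns.heatmap(df_norm.corr(), cmap='coolwarm', center=0)
plt.title('Correlation matrix of the normalized variables')
plt.show()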
### Split data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42)
### Build LogisticRegression Model
model = LogisticRegression()
model.fit(X_train, Y_train)
### Score
model.score(X_test, Y_test)
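### Per-class evaluation (sketch)
Accuracy alone can be misleading if only a small fraction of customers actually buy the loan. As a supplementary check, shown only as a sketch, scikit-learn's confusion matrix and classification report give precision and recall for the positive class:
# Per-class view of the logistic regression predictions
from sklearn.metrics import confusion_matrix, classification_report
Y_pred = model.predict(X_test)
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))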
### Create a plot of model's accuracy vs. K best features
scores = []
for k in range(1, len(X.columns)):
    # chi2 requires non-negative features, which the min-max normalization guarantees
    X_new = SelectKBest(chi2, k=k).fit_transform(X, Y)
    X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X_new, Y, test_size=0.3, random_state=42)
    model = LogisticRegression()
    model.fit(X_train2, Y_train2)
    score = model.score(X_test2, Y_test2)
    scores.append(score)
plt.plot(range(1, len(X.columns)), scores)
plt.grid(True)
plt.xlabel('Number of Features')
plt.ylabel("Model's Accuracy")
### Pick optimal number of features
kopt = range(1, len(X.columns))[np.argmax(scores)]
print(f"The optimal number of features is {kopt}, giving a model accuracy of {max(scores)*100.0}%")
# Build a new model, but only with the best features
### Select best features
Xopt_lr = SelectKBest(chi2, k=kopt).fit_transform(X, Y)
### Split into Train and Test with the selected features
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(Xopt_lr, Y, test_size=0.3, random_state=42)
### Build Model
model2 = LogisticRegression()
model2.fit(X_train2, Y_train2)
model2.score(X_test2, Y_test2)
# Decision Tree
treeClf = tree.DecisionTreeClassifier()
treeClf.fit(X_train, Y_train)
treeClf.score(X_test, Y_test)
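### Ranking the variables by importance (sketch)
One of the requirements is to identify the most significant variables. Since this tree was fit on all features, its feature_importances_ attribute can be paired with the column names to rank them; a minimal sketch:
# Rank variables by the decision tree's impurity-based importances
importances = pd.Series(treeClf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10))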
### Select K best features and run again the decision tree
scoresTree = []
for k in range(1, len(X.columns)):
    X_new = SelectKBest(chi2, k=k).fit_transform(X, Y)
    X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X_new, Y, test_size=0.3, random_state=42)
    treeClf = tree.DecisionTreeClassifier()
    treeClf.fit(X_train3, Y_train3)
    score = treeClf.score(X_test3, Y_test3)
    scoresTree.append(score)
plt.plot(range(1, len(X.columns)), scoresTree)
plt.grid(True)
plt.xlabel('Number of Features')
plt.ylabel("Model's Accuracy")
koptTree = range(1, len(X.columns))[np.argmax(scoresTree)]
print(f"The optimal number of features for Decision Tree is {koptTree}, giving a model accuracy of {max(scoresTree)*100.0}%")
Xopt_tree = SelectKBest(chi2, k = koptTree).fit_transform(X, Y)
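### Names of the selected features (sketch)
fit_transform returns a plain array and drops the column names, so to see which variables SelectKBest actually kept (i.e. the most significant ones from the requirements), the selector's get_support mask can be applied to X.columns. A short sketch:
# Names of the variables chosen by SelectKBest for the decision tree
selector = SelectKBest(chi2, k=koptTree).fit(X, Y)
print(list(X.columns[selector.get_support()]))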
### Plot Scores of both LogisticRegression and DecisionTree vs. Number of features
plt.plot(range(1, len(X.columns)), scores, label = 'LogisticRegression')
plt.plot(range(1, len(X.columns)), scoresTree, label = 'DecisionTree')
plt.legend()
plt.grid(True)
plt.xlabel('Number of Features')
plt.ylabel("Model's Accuracy")
plt.show()
From the plot we can see that the Decision Tree reaches a higher accuracy than the Logistic Regression model.
For the Decision Tree the optimal number of features is 19, while for Logistic Regression it is 29.
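### Which segment to target (sketch)
For the remaining requirement, which segment of customers to target, a simple starting point (a sketch, assuming TARGET is the usual 0/1 purchase flag) is to compare the average normalized feature values of buyers and non-buyers; the features whose means differ most between the two groups hint at the segment worth targeting:
# Compare average (normalized) feature values for buyers vs. non-buyers
group_means = df_norm.groupby('TARGET').mean()
print(group_means.T)
# Features with the largest mean difference between the two classes
print((group_means.loc[1] - group_means.loc[0]).abs().sort_values(ascending=False).head(10))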