## Instructions

**Objective**

## Requirements and Specifications

**Additional Project : Bancassurance**

**Description**

**Background and Context**

**Objective**

- To predict whether a liability customer will buy a loan or not.
- To identify which variables are most significant for making predictions.
- To determine which segment of customers should be targeted more.

**Source Code**

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn import preprocessing, tree

import seaborn as sns

from sklearn.feature_selection import SelectKBest

from sklearn.feature_selection import chi2

### Read Data

# Load the raw bank data set into a DataFrame.
df = pd.read_csv('My_Bank.csv')

# Preview the first ten rows and report how many rows were loaded.
df.head(10)
print(f"This dataset has {len(df)} rows")

### Show the number of NaN values in each column
df.isna().sum()

### Remove non-useful columns
# CUST_ID is dropped here so it is not carried into the feature matrix.
df = df.drop('CUST_ID', axis='columns')

### Convert ACC_OP_DATE to Numeric

# Encode the account-opening date as a YYYYMMDD integer.
# Year-first ordering keeps the integers in chronological order; a
# month-first layout (MMDDYYYY) would rank 12/31/2019 above 01/01/2020,
# which makes the value meaningless as a numeric feature once it is scaled.
df['ACC_OP_DATE'] = pd.to_datetime(df['ACC_OP_DATE']).dt.strftime("%Y%m%d").astype(int)

# Preview the converted column alongside the rest of the data.
df.head(5)

### Categorize object columns

# Replace every object-dtype column with integer codes. Codes are handed out
# in the order in which Series.unique() returns the distinct values.
object_columns = df.select_dtypes(include=['object']).columns

for col in object_columns:
    values = df[col].unique()
    # value -> position of that value in the list of distinct values
    values_dict = {value: code for code, value in enumerate(values)}
    df[col] = df[col].map(values_dict)

### Normalize data

# Min-max scale each column: (value - column min) / (column max - column min).
# NOTE: a column whose max equals its min divides by zero here.
column_min = df.min()
column_range = df.max() - column_min
df_norm = (df - column_min) / column_range

df_norm.head()

### Extract target column
# Split the scaled frame into the predictors (X) and the label column (Y).
X = df_norm.drop(columns=['TARGET'])
Y = df_norm['TARGET']

X.head()

print(f"There are {len(X.columns)} variables and {len(X)} records")

### Display correlation map to see the relation between variables

# Heat-map of the pairwise correlations between all scaled columns
# (TARGET included), with the column names as tick labels on both axes.
corr_matrix = df_norm.corr()
column_names = df_norm.columns
tick_positions = range(len(column_names))

f = plt.figure(figsize=(10, 10))
plt.matshow(corr_matrix, fignum=f.number)
plt.colorbar()
plt.xticks(tick_positions, column_names, rotation=90)
plt.yticks(tick_positions, column_names)
plt.show()

### Split data into train and test

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible. (A stray "..." REPL continuation prompt in front of the
# arguments was removed - it made this call a syntax error.)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42)

### Build LogisticRegression Model

# Baseline: logistic regression with default settings, trained on all features.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression().fit(X_train, Y_train)

### Score
# Accuracy of the baseline model on the held-out test rows.
model.score(X_test, Y_test)

### Create a plot of model's accuracy vs. K best features

# Test accuracy of a logistic regression trained on the k best chi-squared
# features, for every k from 1 up to len(X.columns) - 1.
# NOTE: the selector is fitted on all rows of X and Y before the split, so
# the held-out rows take part in the feature selection.
feature_counts = range(1, len(X.columns))
scores = []

for k in feature_counts:
    selector = SelectKBest(chi2, k=k)
    X_new = selector.fit_transform(X, Y)
    X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
        X_new, Y, test_size=0.3, random_state=42)
    model = LogisticRegression().fit(X_train2, Y_train2)
    score = model.score(X_test2, Y_test2)
    scores.append(score)

# Accuracy as a function of the number of selected features.
plt.plot(feature_counts, scores)
plt.xlabel('Number of Features')
plt.ylabel("Model's Accuracy")
plt.grid(True)

### Pick optimal number of features

# scores[i] was measured with k = i + 1 features, so the best k is the
# position of the highest score plus one.
kopt = int(np.argmax(scores)) + 1

print(f"The optimal number of features is {kopt}, giving a model accuracy of {max(scores)*100.0}%")

# Feature matrix reduced to the kopt best chi-squared features.
best_selector = SelectKBest(chi2, k=kopt)
Xopt_lr = best_selector.fit_transform(X, Y)

# Build a new model but only with best features

### Select best features

# Keep only the kopt highest-scoring features according to the chi-squared test.
X_new = SelectKBest(chi2, k=kopt).fit_transform(X, Y)

### Split into Train and Test with new X values
# Split the reduced matrix X_new, not the full X, so that the model below is
# trained and scored on the selected features only.
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X_new, Y, test_size=0.3, random_state=42)

### Build Model
# Logistic regression restricted to the selected features.
model2 = LogisticRegression()
model2.fit(X_train2, Y_train2)

# Accuracy on the held-out rows of the reduced feature matrix.
model2.score(X_test2, Y_test2)

# Decision Tree

# Baseline decision tree trained on all features.
# The tree is seeded with the same value used for the train/test split:
# scikit-learn permutes the candidate features randomly at each split, so an
# unseeded tree can report a different score on every run.
treeClf = tree.DecisionTreeClassifier(random_state=42)
treeClf.fit(X_train, Y_train)

# Accuracy of the baseline tree on the held-out test rows.
treeClf.score(X_test, Y_test)

### Select K best features and run again the decision tree

# Test accuracy of a decision tree trained on the k best chi-squared
# features, for every k from 1 up to len(X.columns) - 1.
# NOTE: the selector is fitted on all rows of X and Y before the split, so
# the held-out rows take part in the feature selection.
scoresTree = []

for k in range(1, len(X.columns)):
    X_new = SelectKBest(chi2, k=k).fit_transform(X, Y)
    X_train3, X_test3, Y_train3, Y_test3 = train_test_split(
        X_new, Y, test_size=0.3, random_state=42)
    # Seeded so the accuracy curve (and the k chosen from it) is reproducible;
    # the tree builder is otherwise randomised between runs.
    treeClf = tree.DecisionTreeClassifier(random_state=42)
    treeClf.fit(X_train3, Y_train3)
    score = treeClf.score(X_test3, Y_test3)
    scoresTree.append(score)

# Plot the decision-tree scores collected above (scoresTree), not the
# logistic-regression scores from the earlier section.
plt.plot(range(1, len(X.columns)), scoresTree)
plt.grid(True)
plt.xlabel('Number of Features')
plt.ylabel("Model's Accuracy")

# scoresTree[i] was measured with k = i + 1 features, so the best k is the
# position of the highest score plus one.
koptTree = int(np.argmax(scoresTree)) + 1

print(f"The optimal number of features for Decision Tree is {koptTree}, giving a model accuracy of {max(scoresTree)*100.0}%")

# Feature matrix reduced to the koptTree best chi-squared features.
best_tree_selector = SelectKBest(chi2, k=koptTree)
Xopt_tree = best_tree_selector.fit_transform(X, Y)

### Plot Scores of both LogisticRegression and DecisionTree vs. Number of features

# Overlay both accuracy curves on one set of axes so the two models can be
# compared for every number of selected features.
feature_counts = range(1, len(X.columns))
curves = (
    ('LogisticRegression', scores),
    ('DecisionTree', scoresTree),
)

for curve_label, accuracies in curves:
    plt.plot(feature_counts, accuracies, label=curve_label)

plt.xlabel('Number of Features')
plt.ylabel("Model's Accuracy")
plt.grid(True)
plt.legend()
plt.show()

The plot shows that the Decision Tree achieves a higher accuracy than the Logistic Regression.

For the Decision Tree, the optimal number of features is 19, while for Logistic Regression it is 29.