Exercise - Multivariate Analysis - Jupyter Notebook
1 Import Libraries
In [51]: import pandas as pd
import statistics
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
Out[53]:
NO Customer_ID Response Sex Age Job questionnaire1 questionnaire2 questionnaire3 questionnaire4 questionnaire5 avg_charge charge_avg_per_mon charge_Monday contraction_day contraction_day_JP
0 1 80000018 no reply F 35.0 NaN 0.0 0.0 0.0 0.0 0.0 28656.0 28656.0 NaN 7.0 Saturday
1 2 80000042 reply M 39.0 general payer1 0.0 0.0 0.0 0.0 0.0 22815.0 29660.0 29320.0 3.0 Tuesday
2 3 80000234 no reply F 43.0 NaN 0.0 0.0 0.0 0.0 0.0 11912.0 11912.0 13920.0 1.0 Sunday
3 4 80000273 no reply F 45.0 NaN 0.0 0.0 0.0 0.0 0.0 3860.0 3860.0 NaN 4.0 Wednesday
4 5 80000529 no reply M 33.0 general payer1 0.0 0.0 0.0 0.0 0.0 9483.0 9483.0 NaN 1.0 Sunday
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1293 entries, 0 to 1292
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 NO 1293 non-null int64
1 Customer_ID 1293 non-null int64
2 Response 1003 non-null object
3 Sex 1232 non-null object
4 Age 1232 non-null float64
5 Job 771 non-null object
6 questionnaire1 1095 non-null float64
7 questionnaire2 1095 non-null float64
8 questionnaire3 1095 non-null float64
9 questionnaire4 1095 non-null float64
10 questionnaire5 1095 non-null float64
11 avg_charge 1033 non-null float64
12 charge_avg_per_mon 1033 non-null float64
13 charge_Monday 567 non-null float64
14 contraction_day 1086 non-null float64
15 contraction_day_JP 1086 non-null object
dtypes: float64(10), int64(2), object(4)
memory usage: 161.8+ KB
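The missing-value percentages below (Out[55]) were presumably produced by an `isnull` one-liner; a minimal sketch, assuming the DataFrame is named `df` (shown here on a toy frame, not the real data):

```python
import pandas as pd

# Toy frame standing in for the real data; Response has 1 of 4 values missing.
df = pd.DataFrame({"Response": ["reply", None, "no reply", "no reply"],
                   "Age": [35.0, 39.0, 43.0, 45.0]})

# Percentage of missing values per column.
missing_pct = df.isnull().sum() / len(df) * 100
print(missing_pct)
```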
Out[55]: NO 0.000000
Customer_ID 0.000000
Response 22.428461
Sex 4.717711
Age 4.717711
Job 40.371230
questionnaire1 15.313225
questionnaire2 15.313225
questionnaire3 15.313225
questionnaire4 15.313225
questionnaire5 15.313225
avg_charge 20.108275
charge_avg_per_mon 20.108275
charge_Monday 56.148492
contraction_day 16.009281
contraction_day_JP 16.009281
dtype: float64
Out[56]: 1
Out[57]: 0
In [58]: # Drop the unused columns (NO and Customer_ID): each row already represents a single customer and the data contains no duplicates
df.drop(['NO', 'Customer_ID'], axis=1, inplace=True)
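The `numerical` and `categorical` column lists used in the cells below are not shown in this export; a plausible reconstruction splits the columns by dtype (illustrated on a toy frame, with hypothetical values):

```python
import pandas as pd

# Toy frame mirroring the dtypes in the real data.
df = pd.DataFrame({"Age": [35.0, 39.0], "avg_charge": [28656.0, 22815.0],
                   "Sex": ["F", "M"], "Job": [None, "general payer1"]})

# Numeric dtypes go into `numerical`, object (string) dtypes into `categorical`.
numerical = df.select_dtypes(include="number").columns.tolist()
categorical = df.select_dtypes(include="object").columns.tolist()
```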
3.1 Numerical Feature
In [60]: #Statistical Summary of Numerical Column
df[numerical].describe()
Out[60]:
Age questionnaire1 questionnaire2 questionnaire3 questionnaire4 questionnaire5 avg_charge charge_avg_per_mon charge_Monday contraction_day
count 1231.000000 1094.000000 1094.000000 1094.000000 1094.000000 1094.000000 1032.000000 1032.000000 566.000000 1085.000000
mean 35.593014 0.259598 0.255941 0.267824 0.247715 0.250457 23944.236434 26352.431202 43289.169611 4.024885
std 12.294327 0.943158 0.936373 0.975701 0.914391 0.926076 18245.408867 20544.741974 55657.258269 2.044326
min 14.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2110.000000 2110.000000 2040.000000 1.000000
25% 28.000000 0.000000 0.000000 0.000000 0.000000 0.000000 9837.250000 10404.500000 8377.500000 2.000000
50% 35.000000 0.000000 0.000000 0.000000 0.000000 0.000000 19804.500000 21635.500000 24205.000000 4.000000
75% 42.000000 0.000000 0.000000 0.000000 0.000000 0.000000 32113.750000 35390.250000 52312.500000 6.000000
max 200.000000 5.000000 5.000000 5.000000 5.000000 5.000000 110525.000000 122530.000000 361110.000000 7.000000
In [61]: #Numerical Feature Distribution (first five columns)
fig = plt.figure(figsize=(15, 6))
ax_value = []
for i in range(len(numerical[0:5])):
    ax_value.append(fig.add_subplot(1, len(numerical[0:5]), i + 1))
    df[numerical[i]].plot(kind='hist', density=True, color='red', ax=ax_value[i])
    df[numerical[i]].plot(kind='kde', color='blue', ax=ax_value[i])
    ax_value[i].set_title(f'{numerical[i]} feature distribution', fontweight='bold', pad=20)
    ax_value[i].set_xlabel(f'{numerical[i]} value')
    ax_value[i].set_ylabel('Density')
plt.tight_layout()
In [62]: #Numerical Feature Distribution (remaining columns)
fig = plt.figure(figsize=(15, 6))
ax_value = []
for i in range(len(numerical[5:])):
    ax_value.append(fig.add_subplot(1, len(numerical[5:]), i + 1))
    df[numerical[i+5]].plot(kind='hist', density=True, color='red', ax=ax_value[i])
    df[numerical[i+5]].plot(kind='kde', color='blue', ax=ax_value[i])
    ax_value[i].set_title(f'{numerical[i+5]} feature distribution', fontweight='bold', pad=20)
    ax_value[i].set_xlabel(f'{numerical[i+5]} value')
    ax_value[i].set_ylabel('Density')
plt.tight_layout()
3.2 Categorical Feature
In [63]: #Categorical Feature Distribution
fig = plt.figure(figsize=(15,6))
ax_value = []
for i in range(len(categorical)):
    ax_value.append(fig.add_subplot(1, len(categorical), i + 1))
    df[categorical[i]].value_counts().plot(kind='bar', color='orange', ax=ax_value[i])
    ax_value[i].set_title(f'{categorical[i]} column value count', fontweight='bold')
    ax_value[i].set_xlabel(f'{categorical[i]} value')
    ax_value[i].set_ylabel('Count Value')
plt.tight_layout()
4.1 Response Feature
Since Mr. Matsui recommended ignoring the missing values in Response, we simply drop every row with a missing value in the Response feature.
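Dropping those rows can be sketched as follows (a minimal example, assuming the DataFrame is named `df`; toy values only):

```python
import pandas as pd

df = pd.DataFrame({"Response": ["reply", None, "no reply"],
                   "Age": [35.0, 39.0, 43.0]})

# Drop every row whose Response is missing, as recommended in the hearing.
df = df.dropna(subset=["Response"]).reset_index(drop=True)
```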
4.2 Job Feature
Since the Job feature has missing values, we fill them with the catch-all 'other' job type.
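The fill itself is a one-liner; a sketch on toy data, assuming the DataFrame is named `df`:

```python
import pandas as pd

df = pd.DataFrame({"Job": ["general payer1", None, "doctor"]})

# Replace missing Job entries with the catch-all 'other' category.
df["Job"] = df["Job"].fillna("other")
```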
4.3 Charge_Monday Feature
Handling the missing values in the Charge_Monday feature is trickier, since it is continuous data with a skewed distribution. For this feature we first check its correlation with the other charge features.
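The correlation matrix below (Out[66]) is presumably `df[numerical].corr()`; a minimal sketch on two toy charge columns:

```python
import pandas as pd

df = pd.DataFrame({"avg_charge": [22815.0, 11912.0, 9483.0, 57432.0],
                   "charge_Monday": [29320.0, 13920.0, 11000.0, 69930.0]})

# Pairwise Pearson correlation; pandas ignores NaNs pairwise, so missing
# charge_Monday values do not have to be filled first.
corr = df.corr()
```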
Out[66]:
Age questionnaire1 questionnaire2 questionnaire3 questionnaire4 questionnaire5 avg_charge charge_avg_per_mon charge_Monday contraction_day
Age 1.000000 0.032906 0.014135 0.006136 -0.001323 0.015016 0.396975 0.405574 0.349913 -0.021527
questionnaire1 0.032906 1.000000 0.843211 0.829832 0.824646 0.824504 0.058995 0.055955 0.024961 0.037177
questionnaire2 0.014135 0.843211 1.000000 0.838714 0.813646 0.847733 0.059102 0.064618 0.013562 0.047116
questionnaire3 0.006136 0.829832 0.838714 1.000000 0.859590 0.767813 0.052321 0.054370 0.010840 0.025837
questionnaire4 -0.001323 0.824646 0.813646 0.859590 1.000000 0.795184 0.034849 0.041772 0.004101 0.064088
questionnaire5 0.015016 0.824504 0.847733 0.767813 0.795184 1.000000 0.037809 0.045932 -0.012615 0.028988
avg_charge 0.396975 0.058995 0.059102 0.052321 0.034849 0.037809 1.000000 0.979626 0.647098 -0.008232
charge_avg_per_mon 0.405574 0.055955 0.064618 0.054370 0.041772 0.045932 0.979626 1.000000 0.671252 -0.004326
charge_Monday 0.349913 0.024961 0.013562 0.010840 0.004101 -0.012615 0.647098 0.671252 1.000000 0.007808
contraction_day -0.021527 0.037177 0.047116 0.025837 0.064088 0.028988 -0.008232 -0.004326 0.007808 1.000000
Since no feature correlates strongly with Charge_Monday, we fill its missing values with the median, after splitting the data into train and test sets to prevent data leakage.
Out[67]:
Response Sex Age Job questionnaire1 questionnaire2 questionnaire3 questionnaire4 questionnaire5 avg_charge charge_avg_per_mon charge_Monday contraction_day contraction_day_JP
2 no reply F 43.0 other 0.0 0.0 0.0 0.0 0.0 11912.0 11912.0 13920.0 1.0 Sunday
7 no reply F 33.0 other 0.0 0.0 0.0 0.0 0.0 17200.0 17200.0 22490.0 1.0 Sunday
18 no reply F 34.0 general payer1 0.0 0.0 0.0 0.0 0.0 18572.0 18572.0 44060.0 5.0 Thursday
23 no reply F 34.0 part time job 0.0 0.0 0.0 0.0 0.0 16873.0 16873.0 26030.0 5.0 Thursday
25 no reply F 31.0 general payer1 0.0 0.0 0.0 0.0 0.0 57432.0 57432.0 69930.0 6.0 Friday
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
974 no reply M 32.0 other 0.0 0.0 0.0 0.0 0.0 34597.0 34597.0 42130.0 1.0 Sunday
984 reply F 24.0 other 2.0 4.0 1.0 2.0 4.0 13183.0 13183.0 13810.0 3.0 Tuesday
986 no reply F 27.0 general payer3 0.0 0.0 0.0 0.0 0.0 44138.0 44138.0 83560.0 5.0 Thursday
996 no reply F 61.0 general payer3 0.0 0.0 0.0 0.0 0.0 33713.0 33713.0 42990.0 3.0 Tuesday
999 no reply M 44.0 sole proprietorship 0.0 0.0 0.0 0.0 0.0 43745.0 43745.0 71090.0 3.0 Tuesday
Out[68]:
Response Sex Age Job questionnaire1 questionnaire2 questionnaire3 questionnaire4 questionnaire5 avg_charge charge_avg_per_mon charge_Monday contraction_day contraction_day_JP
0 no reply F 35.0 other 0.0 0.0 0.0 0.0 0.0 28656.0 28656.0 NaN 7.0 Saturday
3 no reply F 45.0 other 0.0 0.0 0.0 0.0 0.0 3860.0 3860.0 NaN 4.0 Wednesday
4 no reply M 33.0 general payer1 0.0 0.0 0.0 0.0 0.0 9483.0 9483.0 NaN 1.0 Sunday
6 reply M 38.0 general payer1 0.0 0.0 0.0 0.0 0.0 28581.0 28581.0 NaN 2.0 Monday
8 no reply F 26.0 general payer1 0.0 0.0 0.0 0.0 0.0 28854.0 28854.0 NaN 5.0 Thursday
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
994 no reply F 39.0 other 0.0 0.0 0.0 0.0 0.0 26886.0 26886.0 NaN 2.0 Monday
995 no reply M 23.0 general payer3 0.0 0.0 0.0 0.0 0.0 5700.0 5700.0 NaN 6.0 Friday
997 no reply F 71.0 doctor 0.0 0.0 0.0 0.0 0.0 104266.0 104266.0 NaN 2.0 Monday
998 no reply M 50.0 general payer3 0.0 0.0 0.0 0.0 0.0 34014.0 34014.0 NaN 1.0 Sunday
1002 no reply M 200.0 general payer3 0.0 0.0 0.0 0.0 0.0 74310.0 74310.0 NaN 7.0 Saturday
5.1 Job Feature
Since general payer1, general payer2, and general payer3 have the same meaning, we replace all three values with general payer.
Since scikit-learn's decision tree cannot take string input, we encode the Job feature using one-hot encoding.
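Both steps can be sketched as follows (toy data; the `is_` column prefix matches the dummy columns shown below):

```python
import pandas as pd

df = pd.DataFrame({"Job": ["general payer1", "general payer2",
                           "doctor", "general payer3"]})

# Collapse the three payer variants into one category...
df["Job"] = df["Job"].replace({"general payer1": "general payer",
                               "general payer2": "general payer",
                               "general payer3": "general payer"})

# ...then one-hot encode with an 'is_' prefix, matching columns like
# 'is_doctor' and 'is_general payer' in the output below.
dummies = pd.get_dummies(df["Job"], prefix="is", prefix_sep="_")
df = pd.concat([df, dummies], axis=1)
```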
Out[73]:
is_doctor is_general payer is_lawyer is_other is_part time job is_sole proprietorship is_student
0 0 0 0 1 0 0 0
1 0 1 0 0 0 0 0
2 0 0 0 1 0 0 0
3 0 0 0 1 0 0 0
4 0 1 0 0 0 0 0
997 1 0 0 0 0 0 0
998 0 1 0 0 0 0 0
999 0 0 0 0 0 1 0
1001 1 0 0 0 0 0 0
1002 0 1 0 0 0 0 0
Out[74]:
Response Sex Age Job questionnaire1 questionnaire2 questionnaire3 questionnaire4 questionnaire5 avg_charge charge_avg_per_mon charge_Monday contraction_day contraction_day_JP is_doctor is_general payer is_lawyer is_other is_part time job is_sole proprietorship is_student
0 no reply F 35.0 other 0.0 0.0 0.0 0.0 0.0 28656.0 28656.0 NaN 7.0 Saturday 0 0 0 1 0 0 0
1 reply M 39.0 general payer 0.0 0.0 0.0 0.0 0.0 22815.0 29660.0 29320.0 3.0 Tuesday 0 1 0 0 0 0 0
2 no reply F 43.0 other 0.0 0.0 0.0 0.0 0.0 11912.0 11912.0 13920.0 1.0 Sunday 0 0 0 1 0 0 0
3 no reply F 45.0 other 0.0 0.0 0.0 0.0 0.0 3860.0 3860.0 NaN 4.0 Wednesday 0 0 0 1 0 0 0
4 no reply M 33.0 general payer 0.0 0.0 0.0 0.0 0.0 9483.0 9483.0 NaN 1.0 Sunday 0 1 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
997 no reply F 71.0 doctor 0.0 0.0 0.0 0.0 0.0 104266.0 104266.0 NaN 2.0 Monday 1 0 0 0 0 0 0
998 no reply M 50.0 general payer 0.0 0.0 0.0 0.0 0.0 34014.0 34014.0 NaN 1.0 Sunday 0 1 0 0 0 0 0
999 no reply M 44.0 sole proprietorship 0.0 0.0 0.0 0.0 0.0 43745.0 43745.0 71090.0 3.0 Tuesday 0 0 0 0 0 1 0
1001 no reply F 20.0 doctor 0.0 0.0 0.0 0.0 0.0 57064.0 74623.0 255230.0 4.0 Wednesday 1 0 0 0 0 0 0
1002 no reply M 20.0 general payer 0.0 0.0 0.0 0.0 0.0 74310.0 74310.0 NaN 7.0 Saturday 0 1 0 0 0 0 0
5.2 Age Feature
There is one obvious outlier: a customer recorded as 200 years old. Since Mr. Matsui confirmed this is a typo, we replace the value with 20.
Out[71]: 30.0 44
36.0 42
39.0 40
35.0 39
33.0 39
38.0 39
34.0 37
31.0 37
29.0 36
37.0 36
42.0 35
32.0 33
28.0 32
26.0 31
43.0 31
40.0 30
41.0 29
27.0 28
23.0 24
22.0 23
In [72]: #Change 200 to 20
df['Age'] = df['Age'].apply(lambda x: 20 if x == 200 else x)
Mr. Matsui mentioned in the hearing result that Contraction_Day and Contraction_Day_JP have the same meaning, so one of them should be dropped. He also mentioned that the questionnaire targets customers who already bought the product, so it cannot contribute to predicting the customer response.
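Dropping the redundant columns can be sketched as follows (toy frame, assuming the DataFrame is named `df`):

```python
import pandas as pd

df = pd.DataFrame({"contraction_day": [7.0], "contraction_day_JP": ["Saturday"],
                   "questionnaire1": [0.0], "questionnaire2": [0.0],
                   "questionnaire3": [0.0], "questionnaire4": [0.0],
                   "questionnaire5": [0.0], "Age": [35.0]})

# Drop the duplicate day column and the five questionnaire columns.
drop_cols = ["contraction_day_JP"] + [f"questionnaire{i}" for i in range(1, 6)]
df = df.drop(columns=drop_cols)
```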
Out[78]:
Response Sex Age avg_charge charge_avg_per_mon charge_Monday contraction_day is_doctor is_general payer is_lawyer is_other is_part time job is_sole proprietorship is_student
For model interpretation purposes, the Response and Sex features are transformed into numerical features.
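The encoding implied by the `is_female` and `is_responding` columns below can be sketched as two binary flags (toy data, assuming the DataFrame is named `df`):

```python
import pandas as pd

df = pd.DataFrame({"Response": ["reply", "no reply"], "Sex": ["F", "M"]})

# Binary flags: is_female = 1 for 'F', is_responding = 1 for 'reply'.
df["is_female"] = (df["Sex"] == "F").astype(int)
df["is_responding"] = (df["Response"] == "reply").astype(int)
```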
Out[79]:
Response Sex Age avg_charge charge_avg_per_mon charge_Monday contraction_day is_doctor is_general payer is_lawyer is_other is_part time job is_sole proprietorship is_student is_female
Out[80]:
Response Sex Age avg_charge charge_avg_per_mon charge_Monday contraction_day is_doctor is_general payer is_lawyer is_other is_part time job is_sole proprietorship is_student is_female is_responding
Out[82]:
Age avg_charge charge_avg_per_mon charge_Monday contraction_day is_doctor is_general payer is_lawyer is_other is_part time job is_sole proprietorship is_student is_female is_responding
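The train/test split assumed by the model-building cells below (`X_train`, `X_test`, `Y_train`, `Y_test`) is not shown in this export; a plausible sketch using scikit-learn on a toy frame, with `is_responding` as the target:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy data: 10 rows, alternating target labels.
df = pd.DataFrame({"Age": range(10), "is_responding": [0, 1] * 5})
X = df.drop(columns=["is_responding"])
Y = df["is_responding"]

# Hold out 20% for testing; fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)
```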
6 Model Building
In [84]: #Fill charge_Monday missing values in both sets with the training-set median, to avoid leaking test-set information
monday_median = X_train['charge_Monday'].median()
X_train['charge_Monday'].fillna(monday_median, inplace=True)
X_test['charge_Monday'].fillna(monday_median, inplace=True)
Out[85]: (Age 0
avg_charge 0
charge_avg_per_mon 0
charge_Monday 0
contraction_day 0
is_doctor 0
is_general payer 0
is_lawyer 0
is_other 0
is_part time job 0
is_sole proprietorship 0
is_student 0
is_female 0
dtype: int64,
Age 0
avg_charge 0
charge_avg_per_mon 0
charge_Monday 0
contraction_day 0
is_doctor 0
is_general payer 0
is_lawyer 0
is_other 0
is_part time job 0
is_sole proprietorship 0
is_student 0
is_female 0
dtype: int64)
Out[86]: ▾ DecisionTreeClassifier
DecisionTreeClassifier(random_state=42)
1.0 0.2769230769230769
From this evaluation we can see the model is overfitting: a perfect score on the training data but a very poor score on the test data. We therefore tune the decision tree to find a better fit.
#Tune the tree with cost-complexity pruning; the alpha values are assumed to come
#from the unpruned tree's pruning path (dtc is the fitted tree from the cell above),
#and f1_score is assumed imported from sklearn.metrics
f1_train = []
f1_test = []
alpha = dtc.cost_complexity_pruning_path(X_train, Y_train).ccp_alphas
for a in alpha:
    model = DecisionTreeClassifier(ccp_alpha=a, random_state=42)
    model.fit(X_train, Y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    f1_train.append(f1_score(Y_train, y_pred_train))
    f1_test.append(f1_score(Y_test, y_pred_test))
sns.set()
plt.figure(figsize=(15,10))
sns.lineplot(y=f1_train, x=alpha, label='Train F1 Score')
sns.lineplot(y=f1_test, x=alpha, label='Test F1 Score')
plt.xlabel('Alpha Parameter', fontweight='bold')
plt.ylabel('F1 Score', fontweight='bold')
plt.xticks(ticks=np.arange(0.0, round(max(alpha), 4), 0.001))
plt.show()
From the plot we can conclude that an alpha of around 0.0022 is the best value to use, since it yields the highest F1 score on the test data.
The F1 score and precision for identifying responding customers are still very low, so we need to improve the model with better feature engineering or additional predictive features.
Out[117]: [decision tree plot]