Python Analysis
Python Analysis
Python Analysis
import pandas as pd
In [2]:
df = pd.read_csv('survey.csv')
In [3]:
df.head()
Out[3]:
27-08-2014 United
0 37 Female IL NaN No Yes Of
11:29 States
27-08-2014 United
1 44 M IN NaN No No Ra
11:29 States
27-08-2014
2 32 Male Canada NaN NaN No No Ra
11:29
27-08-2014 United
3 31 Male NaN NaN Yes Yes Of
11:29 Kingdom
27-08-2014 United
4 31 Male TX NaN No No Ne
11:30 States
5 rows × 27 columns
In [4]:
df.tail()
Out[4]:
12-09-2015 United
1254 26 male NaN No No Yes
11:17 Kingdom
26-09-2015 United
1255 32 Male IL No Yes Yes
01:07 States
07-11-2015 United
1256 34 male CA No Yes Yes Som
12:36 States
30-11-2015 United
1257 46 f NC No No No
21:25 States
01-02-2016 United
1258 25 Male IL No Yes Yes Som
23:04 States
5 rows × 27 columns
In [5]:
df.shape
Out[5]:
(1259, 27)
In [6]:
df.columns
Out[6]:
In [7]:
df.duplicated().sum()
Out[7]:
In [8]:
df = df.drop_duplicates()
In [9]:
df.isnull().sum()
Out[9]:
Timestamp 0
Age 0
Gender 0
Country 0
state 514
self_employed 18
family_history 0
treatment 0
work_interfere 264
no_employees 0
remote_work 0
tech_company 0
benefits 0
care_options 0
wellness_program 0
seek_help 0
anonymity 0
leave 0
mental_health_consequence 0
phys_health_consequence 0
coworkers 0
supervisor 0
mental_health_interview 0
phys_health_interview 0
mental_vs_physical 0
obs_consequence 0
comments 1094
dtype: int64
In [10]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1258 entries, 0 to 1258
Data columns (total 27 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Timestamp 1258 non-null object
1 Age 1258 non-null int64
2 Gender 1258 non-null object
3 Country 1258 non-null object
4 state 744 non-null object
5 self_employed 1240 non-null object
6 family_history 1258 non-null object
7 treatment 1258 non-null object
8 work_interfere 994 non-null object
9 no_employees 1258 non-null object
10 remote_work 1258 non-null object
11 tech_company 1258 non-null object
12 benefits 1258 non-null object
13 care_options 1258 non-null object
14 wellness_program 1258 non-null object
15 seek_help 1258 non-null object
16 anonymity 1258 non-null object
17 leave 1258 non-null object
18 mental_health_consequence 1258 non-null object
19 phys_health_consequence 1258 non-null object
20 coworkers 1258 non-null object
21 supervisor 1258 non-null object
22 mental_health_interview 1258 non-null object
23 phys_health_interview 1258 non-null object
24 mental_vs_physical 1258 non-null object
25 obs_consequence 1258 non-null object
26 comments 164 non-null object
dtypes: int64(1), object(26)
memory usage: 275.2+ KB
In [11]:
df.describe()
Out[11]:
Age
count 1.258000e+03
mean 7.949129e+07
std 2.819419e+09
min -1.726000e+03
25% 2.700000e+01
50% 3.100000e+01
75% 3.600000e+01
max 1.000000e+11
In [12]:
df.nunique()
Out[12]:
Timestamp 884
Age 53
Gender 49
Country 48
state 45
self_employed 2
family_history 2
treatment 2
work_interfere 4
no_employees 6
remote_work 2
tech_company 2
benefits 3
care_options 3
wellness_program 3
seek_help 3
anonymity 3
leave 5
mental_health_consequence 3
phys_health_consequence 3
coworkers 3
supervisor 3
mental_health_interview 3
phys_health_interview 3
mental_vs_physical 3
obs_consequence 2
comments 160
dtype: int64
In [13]:
In [14]:
import warnings
warnings.filterwarnings('ignore')
In [15]:
for i in df_cat.columns:
print(df_cat[i].unique())
for i in df_cat.columns:
print(df_cat[i].value_counts())
No 1094
Yes 146
Name: self_employed, dtype: int64
No 767
Yes 491
Name: family_history, dtype: int64
Yes 636
No 622
Name: treatment, dtype: int64
Sometimes 465
Never 213
Rarely 172
Often 144
Name: work_interfere, dtype: int64
Jun-25 289
26-100 289
More than 1000 282
100-500 176
01-May 162
500-1000 60
Name: no_employees, dtype: int64
No 882
Yes 376
Name: remote_work, dtype: int64
Yes 1031
No 227
Name: tech_company, dtype: int64
Yes 477
Don't know 408
No 373
Name: benefits, dtype: int64
No 500
Yes 444
Not sure 314
Name: care_options, dtype: int64
No 841
Yes 229
Don't know 188
Name: wellness_program, dtype: int64
No 645
Don't know 363
Yes 250
Name: seek_help, dtype: int64
Don't know 819
Yes 375
No 64
Name: anonymity, dtype: int64
Don't know 562
Somewhat easy 266
Very easy 206
Somewhat difficult 126
Very difficult 98
Name: leave, dtype: int64
No 490
Maybe 477
Yes 291
Name: mental_health_consequence, dtype: int64
No 925
Maybe 272
Yes 61
Name: phys_health_consequence, dtype: int64
Some of them 773
No 260
Yes 225
Name: coworkers, dtype: int64
Yes 516
No 392
Some of them 350
Name: supervisor, dtype: int64
No 1007
Maybe 207
Yes 44
Name: mental_health_interview, dtype: int64
Maybe 557
No 499
Yes 202
Name: phys_health_interview, dtype: int64
Don't know 576
Yes 343
No 339
Name: mental_vs_physical, dtype: int64
No 1074
Yes 184
Name: obs_consequence, dtype: int64
In [18]:
for i in df_cat.columns:
plt.figure(figsize = (14,6))
sns.countplot(df_cat[i],data=df_cat, palette = 'hls')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [19]:
print(df['Country'].value_counts())
print(df['state'].unique())
['IL' 'IN' nan 'TX' 'TN' 'MI' 'OH' 'CA' 'CT' 'MD' 'NY' 'NC' 'MA' 'IA' 'PA'
'WA' 'WI' 'UT' 'NM' 'OR' 'FL' 'MN' 'MO' 'AZ' 'CO' 'GA' 'DC' 'NE' 'WV'
'OK' 'KS' 'VA' 'NH' 'KY' 'AL' 'NV' 'NJ' 'SC' 'VT' 'SD' 'ID' 'MS' 'RI'
'WY' 'LA' 'ME']
In [21]:
In [22]:
plt.figure(figsize = (14,6))
sns.boxplot(df['Age'],data=df, palette = 'hls')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [23]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
Age 9.0
dtype: float64
In [24]:
df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
Out[24]:
More than
1 44 M NaN No No Rarely
1000
More than
1256 34 male No Yes Yes Sometimes
1000
In [25]:
df.shape
Out[25]:
(1258, 23)
In [26]:
df['Age'].unique()
Out[27]:
array([37, 44, 32, 31, 33, 35, 39, 42, 23, 29, 36, 27, 46, 41, 34, 30, 40,
38, 50, 24, 18, 28, 26, 22, 19, 25, 45, 21, 43, 56, 60, 54, 55, 48,
20, 57, 58, 47, 62, 51, 65, 49, 5, 53, 61, 8, 11, 72],
dtype=int64)
In [28]:
plt.figure(figsize = (14,6))
sns.histplot(df['Age'], kde = True, palette = 'hls')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [29]:
df['Gender'].unique()
Out[30]:
In [31]:
df['Gender'].value_counts()
Out[31]:
Male 987
Female 247
Other 19
Name: Gender, dtype: int64
In [32]:
plt.figure(figsize = (14,6))
sns.countplot(df['Gender'],data=df, palette = 'hls')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [33]:
gender_data = df['Gender'].value_counts()
centre_circle = plt.Circle((0,0),0.40,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.show()
In [34]:
plt.figure(figsize = (14,6))
plt.title('Employement Type of the Employees who are seeking Treatment',
fontsize=18, fontweight='bold')
sns.countplot(df['self_employed'], hue = df['treatment'],data=df,
palette = 'hls')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [35]:
plt.figure(figsize = (20,6))
sns.countplot(df['family_history'], hue = df['treatment'], palette='hls')
plt.title('Family History of Survey Respondents', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [36]:
plt.figure(figsize = (20,6))
sns.countplot(df['work_interfere'], hue = df['treatment'],
palette = 'hls')
plt.title('Work Interference of Survey Respondents', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [37]:
plt.figure(figsize = (20,6))
sns.countplot(df['remote_work'], hue = df['treatment'], palette='hls')
plt.title('Working Type of Survey Respondents', fontsize=18, fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [38]:
plt.figure(figsize = (20,6))
sns.countplot(df['tech_company'], hue = df['treatment'],
palette='hls')
plt.title('Company Category of Survey Respondents', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [39]:
plt.figure(figsize = (20,6))
sns.countplot(df['benefits'], hue = df['treatment'], palette='hls')
plt.title('Benefits provided to the Employees', fontsize=18, fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [40]:
plt.figure(figsize = (20,6))
sns.countplot(df['care_options'], hue = df['treatment'], palette='hls')
plt.title('Mental Heathcare provided to the Employees', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [41]:
plt.figure(figsize = (20,6))
sns.countplot(df['wellness_program'], hue = df['treatment'],
palette='hls')
plt.title('Mental Health in Wellness Program provided to the Employees',
fontsize=16, fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [42]:
plt.figure(figsize = (20,6))
sns.countplot(df['anonymity'], hue = df['treatment'],
palette='hls')
plt.title('Anonymity for Mental Health provided to the Employees', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [43]:
plt.figure(figsize = (20,6))
sns.countplot(df['leave'], hue = df['treatment'],
palette='hls')
plt.title('Leave for Mental Health provided to the Employees', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [44]:
plt.figure(figsize = (20,6))
sns.countplot(df['mental_health_consequence'], hue = df['treatment'],
palette='hls')
plt.title('Consequence of discussing Mental Health with Employer', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [45]:
plt.figure(figsize = (20,6))
sns.countplot(df['phys_health_consequence'], hue = df['treatment'],
palette='hls')
plt.title('Consequence of discussing Physical Health with Employer',
fontsize=18, fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [46]:
plt.figure(figsize = (20,6))
sns.countplot(df['coworkers'], hue = df['treatment'],
palette='hls')
plt.title('Discussing Mental Health with the Coworkers', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [47]:
plt.figure(figsize = (20,6))
sns.countplot(df['supervisor'], hue = df['treatment'],
palette='hls')
plt.title('Discussing Mental Health with the Supervisor', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [48]:
plt.figure(figsize = (20,6))
sns.countplot(df['mental_health_interview'], hue = df['treatment'],
palette='hls')
plt.title('Discussing Mental Health with the Interviewer', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [49]:
plt.figure(figsize = (20,6))
sns.countplot(df['phys_health_interview'], hue = df['treatment'],
palette='hls')
plt.title('Discussing Physical Health with the Interviewer', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [50]:
plt.figure(figsize = (20,6))
sns.countplot(df['mental_vs_physical'], hue = df['treatment'],
palette='hls')
plt.title('Mental or Physical Health', fontsize=18, fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [51]:
plt.figure(figsize = (20,6))
sns.countplot(df['obs_consequence'], hue = df['treatment'],
palette='hls')
plt.title('Consequence observed for disclosing the Mental Health', fontsize=18,
fontweight='bold')
plt.xticks(rotation = 90)
plt.yticks(rotation = 90)
plt.show()
In [52]:
In [53]:
df['self_employed'] = df['self_employed'].fillna('No')
In [54]:
df.isnull().sum()
Out[54]:
Age 0
Gender 0
self_employed 0
family_history 0
treatment 0
work_interfere 0
no_employees 0
remote_work 0
tech_company 0
benefits 0
care_options 0
wellness_program 0
seek_help 0
anonymity 0
leave 0
mental_health_consequence 0
phys_health_consequence 0
coworkers 0
supervisor 0
mental_health_interview 0
phys_health_interview 0
mental_vs_physical 0
obs_consequence 0
dtype: int64
In [55]:
In [56]:
In [57]:
label_encoder = LabelEncoder()
for col in object_cols:
label_encoder.fit(df[col])
df[col] = label_encoder.transform(df[col])
In [58]:
corr = df.corr()
In [59]:
plt.figure(figsize = (20,10))
sns.heatmap(corr, annot = True)
plt.show()
In [60]:
In [61]:
key = ['LogisticRegression','KNeighborsClassifier','DecisionTreeClassifier','RandomForestCla
value = [LogisticRegression(), KNeighborsClassifier(n_neighbors = 2, weights ='uniform'), De
models = dict(zip(key,value))
models
Out[62]:
{'LogisticRegression': LogisticRegression(),
'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=2),
'DecisionTreeClassifier': DecisionTreeClassifier(random_state=10),
'RandomForestClassifier': RandomForestClassifier(n_estimators=60, random_stat
e=0),
'GradientBoostingClassifier': GradientBoostingClassifier(random_state=20),
'AdaBoostClassifier': AdaBoostClassifier(),
'XGBClassifier': XGBClassifier(base_score=None, booster='gbtree', callbacks=N
one,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=0,
reg_alpha=None, reg_lambda=None, ...)}
In [63]:
predicted =[]
for name,algo in models.items():
model=algo
model.fit(X_train,y_train)
predict = model.predict(X_test)
acc = accuracy_score(y_test, predict)
predicted.append(acc)
print(name,acc)
LogisticRegression 0.7929936305732485
KNeighborsClassifier 0.7006369426751592
DecisionTreeClassifier 0.7261146496815286
RandomForestClassifier 0.8152866242038217
GradientBoostingClassifier 0.8089171974522293
AdaBoostClassifier 0.7929936305732485
XGBClassifier 0.7993630573248408
In [64]:
plt.figure(figsize = (10,5))
ax = sns.barplot(x = predicted, y = key, palette='hls', order=predicted.sort())
plt.title("Plotting the Model Accuracies", fontsize=16, fontweight="bold")
plt.show()
In [65]:
import numpy as np
In [66]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train,y_train)
pred = gbc.predict(X_test)
cf_matrix = confusion_matrix(y_test, pred)
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True,
fmt='.2%')
plt.title('Confusion Matrix of Gradient Boost Classifier', fontweight='bold', fontsize=16)
plt.xlabel('Predicted', fontweight='bold', fontsize=12)
plt.ylabel('Actual', fontweight='bold', fontsize=12)
plt.show()
In [67]: