Predict Heart Disease

Download as pdf or txt
Download as pdf or txt
You are on page 1of 55

Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

Data Analysis - Final Project-Sena


SenaKaya
2023-06-09
#read the dataset #first 5 row

library(readxl)
heartd<- read.csv("/Users/senakaya/Desktop/UCSC/data analysis/final-Personal Key Indi
cators of Heart Disease/heart_2020_cleaned.csv")
head(heartd)

## HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth


## 1 No 16.60 Yes No No 3 30
## 2 No 20.34 No No Yes 0 0
## 3 No 26.58 Yes No No 20 30
## 4 No 24.21 No No No 0 0
## 5 No 23.71 No No No 28 0
## 6 Yes 28.87 Yes No No 6 0
## DiffWalking Sex AgeCategory Race Diabetic PhysicalActivity GenHealth
## 1 No Female 55-59 White Yes Yes Very good
## 2 No Female 80 or older White No Yes Very good
## 3 No Male 65-69 White Yes Yes Fair
## 4 No Female 75-79 White No No Good
## 5 Yes Female 40-44 White No Yes Very good
## 6 Yes Female 75-79 Black No No Fair
## SleepTime Asthma KidneyDisease SkinCancer
## 1 5 Yes No Yes
## 2 7 No No No
## 3 8 Yes No No
## 4 6 No No Yes
## 5 8 No No No
## 6 12 No No No

#dimension of dataset

dim(heartd)

## [1] 319795 18

#to checks number of null values

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 1 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

sum(heartd.stringsAsFactors=FALSE)

## [1] 0

sum(is.na(heartd))

## [1] 0

#a list of objects and their structure

str(heartd)

## 'data.frame': 319795 obs. of 18 variables:


## $ HeartDisease : chr "No" "No" "No" "No" ...
## $ BMI : num 16.6 20.3 26.6 24.2 23.7 ...
## $ Smoking : chr "Yes" "No" "Yes" "No" ...
## $ AlcoholDrinking : chr "No" "No" "No" "No" ...
## $ Stroke : chr "No" "Yes" "No" "No" ...
## $ PhysicalHealth : num 3 0 20 0 28 6 15 5 0 0 ...
## $ MentalHealth : num 30 0 30 0 0 0 0 0 0 0 ...
## $ DiffWalking : chr "No" "No" "No" "No" ...
## $ Sex : chr "Female" "Female" "Male" "Female" ...
## $ AgeCategory : chr "55-59" "80 or older" "65-69" "75-79" ...
## $ Race : chr "White" "White" "White" "White" ...
## $ Diabetic : chr "Yes" "No" "Yes" "No" ...
## $ PhysicalActivity: chr "Yes" "Yes" "Yes" "No" ...
## $ GenHealth : chr "Very good" "Very good" "Fair" "Good" ...
## $ SleepTime : num 5 7 8 6 8 12 4 9 5 10 ...
## $ Asthma : chr "Yes" "No" "Yes" "No" ...
## $ KidneyDisease : chr "No" "No" "No" "No" ...
## $ SkinCancer : chr "Yes" "No" "No" "Yes" ...

#summary of the dataset

summary(heartd)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 2 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## HeartDisease BMI Smoking AlcoholDrinking


## Length:319795 Min. :12.02 Length:319795 Length:319795
## Class :character 1st Qu.:24.03 Class :character Class :character
## Mode :character Median :27.34 Mode :character Mode :character
## Mean :28.33
## 3rd Qu.:31.42
## Max. :94.85
## Stroke PhysicalHealth MentalHealth DiffWalking
## Length:319795 Min. : 0.000 Min. : 0.000 Length:319795
## Class :character 1st Qu.: 0.000 1st Qu.: 0.000 Class :character
## Mode :character Median : 0.000 Median : 0.000 Mode :character
## Mean : 3.372 Mean : 3.898
## 3rd Qu.: 2.000 3rd Qu.: 3.000
## Max. :30.000 Max. :30.000
## Sex AgeCategory Race Diabetic
## Length:319795 Length:319795 Length:319795 Length:319795
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## PhysicalActivity GenHealth SleepTime Asthma
## Length:319795 Length:319795 Min. : 1.000 Length:319795
## Class :character Class :character 1st Qu.: 6.000 Class :character
## Mode :character Mode :character Median : 7.000 Mode :character
## Mean : 7.097
## 3rd Qu.: 8.000
## Max. :24.000
## KidneyDisease SkinCancer
## Length:319795 Length:319795
## Class :character Class :character
## Mode :character Mode :character
##
##
##

#table of HeartDisease

table(heartd$HeartDisease)

##
## No Yes
## 292422 27373

#the number of different values for each column

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 3 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

library(dplyr)

##
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':


##
## filter, lag

## The following objects are masked from 'package:base':


##
## intersect, setdiff, setequal, union

heartd %>% summarise_all(n_distinct)

## HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth


## 1 2 3604 2 2 2 31 31
## DiffWalking Sex AgeCategory Race Diabetic PhysicalActivity GenHealth
## 1 2 2 13 6 4 2 5
## SleepTime Asthma KidneyDisease SkinCancer
## 1 24 2 2 2

#EDA (EXPLORATORY DATA ANALYSIS) #histogram of BMI(Body Mass Index)

library(ggplot2)

p<-ggplot(data = heartd, aes(x = BMI)) + geom_histogram(binwidth = 3,fill='blue',colo


ur="red")+ coord_cartesian(xlim = c(10, 60))
p+ geom_vline(aes(xintercept=mean(BMI)), color="yellow", linetype="dashed", linewidth
=1)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 4 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#histogram of Physical Health

p<-ggplot(data = heartd, aes(x = heartd$PhysicalHealth)) + geom_histogram(binwidth =


3,fill='blue',colour="red")+ coord_cartesian(xlim = c(0, 30))
p+ geom_vline(aes(xintercept=mean(heartd$PhysicalHealth)), color="yellow", linetype="
dashed", size=1)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Use of `heartd$PhysicalHealth` is discouraged.


## ℹ Use `PhysicalHealth` instead.
## Use of `heartd$PhysicalHealth` is discouraged.
## ℹ Use `PhysicalHealth` instead.

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 5 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#histogram of Mental Health

p<-ggplot(data = heartd, aes(x = MentalHealth )) + geom_histogram(binwidth = 3,fill='


blue',colour="red")+ coord_cartesian(xlim = c(0, 30))
p+ geom_vline(aes(xintercept=mean(MentalHealth)), color="yellow", linetype="dashed",
size=1)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 6 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#histogram of Sleep Time

p<-ggplot(data = heartd, aes(x = SleepTime)) + geom_histogram(binwidth = 1,fill='blue


',colour="red")+ coord_cartesian(xlim = c(2, 12))
p+ geom_vline(aes(xintercept=mean(SleepTime)), color="yellow", linetype="dashed", siz
e=1)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 7 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#remove the scientific notations from the axis

options(scipen = 999)

#Create the stacked bar plot for two categorical variable heart disease and smoking

ggplot(heartd, aes(x = HeartDisease, fill = Smoking)) +


geom_bar() +
xlab("HeartDisease") +
ylab("Count") +
coord_cartesian(ylim = c(0, 300000))

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 8 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

ggtitle("Bar Plot of Heart Disease and Smoking")

## $title
## [1] "Bar Plot of Heart Disease and Smoking"
##
## attr(,"class")
## [1] "labels"

#Create the stacked bar plot for two categorical variable heart disease and alcohol drinking

ggplot(heartd, aes(x = HeartDisease, fill = AlcoholDrinking)) +


geom_bar() +
xlab("HeartDisease") +
ylab("Count") +
coord_cartesian(ylim = c(0, 300000))+
ggtitle("Bar Plot of Heart Disease and Alcohol Drinking")

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 9 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#Create the bar stacked plot for two categorical variable heart disease and sex

ggplot(heartd, aes(x = HeartDisease, fill = Sex)) +


geom_bar() +
xlab("HeartDisease") +
ylab("Count") +
coord_cartesian(ylim = c(0, 300000))+
ggtitle("Bar Plot of Heart Disease and Sex")

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 10 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#Create the stacked bar plot for two categorical variable heart disease and asthma

ggplot(heartd, aes(x = HeartDisease, fill = Asthma)) +


geom_bar() +
xlab("HeartDisease") +
ylab("Count") +
coord_cartesian(ylim = c(0, 300000))+
ggtitle("Bar Plot of Heart Disease and Asthma")

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 11 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#Create the stacked bar plot for two categorical variable heart disease and kidney disease

ggplot(heartd, aes(x = HeartDisease, fill = KidneyDisease)) +


geom_bar() +
xlab("HeartDisease") +
ylab("Count") +
coord_cartesian(ylim = c(0, 300000))+
ggtitle("Bar Plot of Heart Disease and Kidney Disease")

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 12 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#Create the stacked bar plot for two categorical variable heart disease and skin cancer

ggplot(heartd, aes(x = HeartDisease, fill = SkinCancer)) +


geom_bar() +
xlab("HeartDisease") +
ylab("Count") +
coord_cartesian(ylim = c(0, 300000))+
ggtitle("Bar Plot of Heart Disease and Skin Cancer")

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 13 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#Count the frequencies of each combination of the two variables #Create the bar chart

counts <- table(heartd$HeartDisease, heartd$Smoking)

barplot(counts, beside = TRUE, legend = TRUE,


xlab = "HeartDisease", ylab = "Count",
main = "Bar Chart of Heart Disease and Smoking")

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 14 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#Count the frequencies of each combination of the two variables #Define colors for the bars #Create the bar
chart

counts <- table(heartd$HeartDisease, heartd$AgeCategory)

colors <- c("yellow", "green")

barplot(counts, beside = TRUE, legend = TRUE,


xlab = "HeartDisease", ylab = "Count",
main = "Bar Chart of Heart Disease and Age Category", col = colors)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 15 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#Count the frequencies of each combination of the two variables

counts <- table(heartd$HeartDisease, heartd$Race)

colors <- c("yellow", "green")

barplot(counts, beside = TRUE, legend = TRUE,


xlab = "HeartDisease", ylab = "Count",
main = "Bar Chart of HeartDisease and Race", col = colors)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 16 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#Count the frequencies of each combination of the two variables

counts <- table(heartd$HeartDisease, heartd$GenHealth)

colors <- c("yellow", "green")

barplot(counts, beside = TRUE, legend = TRUE,


xlab = "HeartDisease", ylab = "Count",
main = "Bar Chart of HeartDisease and GenHealth", col = colors)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 17 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

Count the frequencies of each combination


of the two variables
counts <- table(heartd$HeartDisease, heartd$SleepTime)

colors <- c("yellow", "green")

barplot(counts, beside = TRUE, legend = TRUE,


xlab = "HeartDisease", ylab = "Count",
main = "Bar Chart of HeartDisease and Sleep Time", col = colors)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 18 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

Count the frequencies of each combination


of the two variables
counts <- table(heartd$HeartDisease, heartd$PhysicalHealth)

colors <- c("yellow", "green")

barplot(counts, beside = TRUE, legend = TRUE,


xlab = "HeartDisease", ylab = "Count",
main = "Bar Chart of HeartDisease and Physical Health", col = colors)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 19 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

Replace values in the data frame


heartd <- replace(heartd, heartd == 'Yes', 1)
heartd <- replace(heartd, heartd == 'No', 0)
heartd <- replace(heartd, heartd == 'Male', 1)
heartd <- replace(heartd, heartd == 'Female', 0)
heartd <- replace(heartd, heartd == 'No, borderline diabetes', 0)
heartd <- replace(heartd, heartd == 'Yes (during pregnancy)', 1)
#ordinal
heartd <- replace(heartd, heartd == 'Excellent', 5)
heartd <- replace(heartd, heartd == 'Very good', 4)
heartd <- replace(heartd, heartd == 'Good', 3)
heartd <- replace(heartd, heartd == 'Fair', 2)
heartd <- replace(heartd, heartd == 'Poor', 1)

Convert ‘Diabetic’ column to integer


file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 20 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

heartd$Diabetic <- as.numeric(heartd$Diabetic)

Extract the first two characters from the


“Age” column
heartd$AgeCategory <- substr(heartd$AgeCategory, 1, 2)

Print the first five row of modified “Age”


column
print(heartd$AgeCategory[1:5])

## [1] "55" "80" "65" "75" "40"

#convert all “0”s and “1”s to integer except c( “Race”)

num_columns <- setdiff(colnames(heartd), c( "Race"))

heartd_num <- as.data.frame(lapply(heartd[num_columns], as.numeric))

str(heartd_num)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 21 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## 'data.frame': 319795 obs. of 17 variables:


## $ HeartDisease : num 0 0 0 0 0 1 0 0 0 0 ...
## $ BMI : num 16.6 20.3 26.6 24.2 23.7 ...
## $ Smoking : num 1 0 1 0 0 1 0 1 0 0 ...
## $ AlcoholDrinking : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Stroke : num 0 1 0 0 0 0 0 0 0 0 ...
## $ PhysicalHealth : num 3 0 20 0 28 6 15 5 0 0 ...
## $ MentalHealth : num 30 0 30 0 0 0 0 0 0 0 ...
## $ DiffWalking : num 0 0 0 0 1 1 0 1 0 1 ...
## $ Sex : num 0 0 1 0 0 0 0 0 0 1 ...
## $ AgeCategory : num 55 80 65 75 40 75 70 80 80 65 ...
## $ Diabetic : num 1 0 1 0 0 0 0 1 0 0 ...
## $ PhysicalActivity: num 1 1 1 0 1 0 1 0 0 1 ...
## $ GenHealth : num 4 4 2 3 4 2 2 3 2 3 ...
## $ SleepTime : num 5 7 8 6 8 12 4 9 5 10 ...
## $ Asthma : num 1 0 1 0 0 0 1 1 0 0 ...
## $ KidneyDisease : num 0 0 0 0 0 0 0 0 1 0 ...
## $ SkinCancer : num 1 0 0 1 0 0 1 0 0 0 ...

library(reshape2)
library(RColorBrewer)
library(corrplot)

## corrplot 0.92 loaded

library(ggcorrplot)

#Create correlation matrix #Plot correlation matrix #Calculate the correlation matrix

cor_matrix <- cor(heartd_num)

ggcorrplot(cor_matrix, type = "lower") +


geom_tile() +
scale_fill_gradientn(colours = brewer.pal(11, "RdBu"),
limits = c(-1, 1),
breaks = seq(-1, 1, by = 0.2)) +

labs(x = "", y = "") +


theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

## Scale for fill is already present.


## Adding another scale for fill, which will replace the existing scale.

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 22 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

cor_matrix <- cor(heartd_num)

print(cor_matrix)

## HeartDisease BMI Smoking AlcoholDrinking


## HeartDisease 1.000000000 0.051803191 0.10776416 -0.032079743
## BMI 0.051803191 1.000000000 0.02311811 -0.038816223
## Smoking 0.107764156 0.023118112 1.00000000 0.111767520
## AlcoholDrinking -0.032079743 -0.038816223 0.11176752 1.000000000
## Stroke 0.196835299 0.019732982 0.06122604 -0.019857914
## PhysicalHealth 0.170720972 0.109787544 0.11535241 -0.017254288
## MentalHealth 0.028590715 0.064130569 0.08515729 0.051281973
## DiffWalking 0.201258049 0.181678264 0.12007416 -0.035327583
## Sex 0.070040476 0.026939645 0.08505249 0.004200142
## AgeCategory 0.232324637 -0.001740121 0.13038380 -0.058836643
## Diabetic 0.174782320 0.200442834 0.05554379 -0.057786229
## PhysicalActivity -0.100029934 -0.150615994 -0.09717377 0.017486983
## GenHealth -0.243182458 -0.230719780 -0.17471733 0.029780388
## SleepTime 0.008326647 -0.051822254 -0.03033564 -0.005065451

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 23 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## Asthma 0.041444151 0.092345019 0.02414853 -0.002202100


## KidneyDisease 0.145197099 0.050767533 0.03491969 -0.028280092
## SkinCancer 0.093316878 -0.033643619 0.03397739 -0.005702370
## Stroke PhysicalHealth MentalHealth DiffWalking
## HeartDisease 0.196835299 0.17072097 0.02859071 0.20125805
## BMI 0.019732982 0.10978754 0.06413057 0.18167826
## Smoking 0.061226040 0.11535241 0.08515729 0.12007416
## AlcoholDrinking -0.019857914 -0.01725429 0.05128197 -0.03532758
## Stroke 1.000000000 0.13701383 0.04646706 0.17414321
## PhysicalHealth 0.137013827 1.00000000 0.28798667 0.42837280
## MentalHealth 0.046467061 0.28798667 1.00000000 0.15223467
## DiffWalking 0.174143214 0.42837280 0.15223467 1.00000000
## Sex -0.003091055 -0.04090384 -0.10005847 -0.06885956
## AgeCategory 0.137279527 0.11078930 -0.15545468 0.24255230
## Diabetic 0.104466795 0.15397533 0.02970857 0.20925469
## PhysicalActivity -0.079455195 -0.23228318 -0.09580810 -0.27852396
## GenHealth -0.168090049 -0.48269718 -0.24162528 -0.41379735
## SleepTime 0.011899981 -0.06138663 -0.11971679 -0.02221636
## Asthma 0.038866140 0.11790658 0.11400817 0.10322205
## KidneyDisease 0.091166841 0.14219718 0.03728113 0.15306375
## SkinCancer 0.048116104 0.04169969 -0.03341219 0.06484040
## Sex AgeCategory Diabetic PhysicalActivity
## HeartDisease 0.070040476 0.232324637 0.174782320 -0.100029934
## BMI 0.026939645 -0.001740121 0.200442834 -0.150615994
## Smoking 0.085052486 0.130383805 0.055543794 -0.097173766
## AlcoholDrinking 0.004200142 -0.058836643 -0.057786229 0.017486983
## Stroke -0.003091055 0.137279527 0.104466795 -0.079455195
## PhysicalHealth -0.040903839 0.110789301 0.153975332 -0.232283177
## MentalHealth -0.100058473 -0.155454678 0.029708569 -0.095808105
## DiffWalking -0.068859559 0.242552299 0.209254691 -0.278523964
## Sex 1.000000000 -0.067681898 -0.002054385 0.048246846
## AgeCategory -0.067681898 1.000000000 0.196801318 -0.121465289
## Diabetic -0.002054385 0.196801318 1.000000000 -0.136599474
## PhysicalActivity 0.048246846 -0.121465289 -0.136599474 1.000000000
## GenHealth 0.025947385 -0.188235591 -0.270245831 0.281248774
## SleepTime -0.015703748 0.103071031 0.003452185 0.003848841
## Asthma -0.069191116 -0.058234926 0.046886332 -0.041525882
## KidneyDisease -0.009083858 0.122696590 0.149112620 -0.081827321
## SkinCancer 0.013433800 0.262207609 0.034077309 -0.001327810
## GenHealth SleepTime Asthma KidneyDisease
## HeartDisease -0.24318246 0.008326647 0.0414441511 0.145197099
## BMI -0.23071978 -0.051822254 0.0923450190 0.050767533
## Smoking -0.17471733 -0.030335635 0.0241485320 0.034919686
## AlcoholDrinking 0.02978039 -0.005065451 -0.0022021002 -0.028280092
## Stroke -0.16809005 0.011899981 0.0388661403 0.091166841
## PhysicalHealth -0.48269718 -0.061386632 0.1179065802 0.142197185
## MentalHealth -0.24162528 -0.119716788 0.1140081744 0.037281128

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 24 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## DiffWalking -0.41379735 -0.022216357 0.1032220488 0.153063752


## Sex 0.02594739 -0.015703748 -0.0691911164 -0.009083858
## AgeCategory -0.18823559 0.103071031 -0.0582349260 0.122696590
## Diabetic -0.27024583 0.003452185 0.0468863320 0.149112620
## PhysicalActivity 0.28124877 0.003848841 -0.0415258818 -0.081827321
## GenHealth 1.00000000 0.063071012 -0.1373424390 -0.174642269
## SleepTime 0.06307101 1.000000000 -0.0482452803 0.006237934
## Asthma -0.13734244 -0.048245280 1.0000000000 0.039707000
## KidneyDisease -0.17464227 0.006237934 0.0397070003 1.000000000
## SkinCancer -0.05064137 0.041266167 -0.0003964769 0.061816217
## SkinCancer
## HeartDisease 0.0933168777
## BMI -0.0336436185
## Smoking 0.0339773873
## AlcoholDrinking -0.0057023705
## Stroke 0.0481161045
## PhysicalHealth 0.0416996855
## MentalHealth -0.0334121907
## DiffWalking 0.0648404018
## Sex 0.0134337997
## AgeCategory 0.2622076094
## Diabetic 0.0340773087
## PhysicalActivity -0.0013278105
## GenHealth -0.0506413674
## SleepTime 0.0412661675
## Asthma -0.0003964769
## KidneyDisease 0.0618162165
## SkinCancer 1.0000000000

#Find correlations above 0.75 #Check if there are any high correlations

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 25 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

high_corr <- which(cor_matrix > 0.75 & cor_matrix < 1, arr.ind = TRUE)

if (nrow(high_corr) == 0) {
cat("No correlations above 0.75 found.\n")
} else {
# Print the pairs of variables with high correlation
for (i in 1:nrow(high_corr)) {
row_num <- high_corr[i, "row"]
col_num <- high_corr[i, "col"]
row_name <- rownames(cor_matrix)[row_num]
col_name <- colnames(cor_matrix)[col_num]
correlation <- cor_matrix[row_num, col_num]
cat("High correlation (", correlation, ") between", row_name, "and", col_name, "\
n")
}
}

## No correlations above 0.75 found.

#Create a copy of the original data frame

df_encoded <- heartd


str(df_encoded)

## 'data.frame': 319795 obs. of 18 variables:


## $ HeartDisease : chr "0" "0" "0" "0" ...
## $ BMI : num 16.6 20.3 26.6 24.2 23.7 ...
## $ Smoking : chr "1" "0" "1" "0" ...
## $ AlcoholDrinking : chr "0" "0" "0" "0" ...
## $ Stroke : chr "0" "1" "0" "0" ...
## $ PhysicalHealth : num 3 0 20 0 28 6 15 5 0 0 ...
## $ MentalHealth : num 30 0 30 0 0 0 0 0 0 0 ...
## $ DiffWalking : chr "0" "0" "0" "0" ...
## $ Sex : chr "0" "0" "1" "0" ...
## $ AgeCategory : chr "55" "80" "65" "75" ...
## $ Race : chr "White" "White" "White" "White" ...
## $ Diabetic : num 1 0 1 0 0 0 0 1 0 0 ...
## $ PhysicalActivity: chr "1" "1" "1" "0" ...
## $ GenHealth : chr "4" "4" "2" "3" ...
## $ SleepTime : num 5 7 8 6 8 12 4 9 5 10 ...
## $ Asthma : chr "1" "0" "1" "0" ...
## $ KidneyDisease : chr "0" "0" "0" "0" ...
## $ SkinCancer : chr "1" "0" "0" "1" ...

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 26 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#ONE-HOT ENCODING #Perform one-hot encoding for the “Race” column

df_encoded <- heartd

#Extract the “Race” column

race_column <- df_encoded$Race

#Perform one-hot encoding for the “Race” column

encoded_cols <- model.matrix(~ 0 + factor(race_column))


colnames(encoded_cols) <- paste("Race", colnames(encoded_cols), sep = "_")

#Add the encoded columns to the data set

df_encoded <- cbind(df_encoded, encoded_cols)

#Remove the original “Race” column

df_encoded <- df_encoded[, !names(df_encoded) %in% "Race"]

#Print the encoded data set

print(head(df_encoded))

## HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth


## 1 0 16.60 1 0 0 3 30
## 2 0 20.34 0 0 1 0 0
## 3 0 26.58 1 0 0 20 30
## 4 0 24.21 0 0 0 0 0
## 5 0 23.71 0 0 0 28 0
## 6 1 28.87 1 0 0 6 0
## DiffWalking Sex AgeCategory Diabetic PhysicalActivity GenHealth SleepTime
## 1 0 0 55 1 1 4 5
## 2 0 0 80 0 1 4 7
## 3 0 1 65 1 1 2 8
## 4 0 0 75 0 0 3 6
## 5 1 0 40 0 1 4 8
## 6 1 0 75 0 0 2 12
## Asthma KidneyDisease SkinCancer
## 1 1 0 1
## 2 0 0 0
## 3 1 0 0
## 4 0 0 1

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 27 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## 5 0 0 0
## 6 0 0 0
## Race_factor(race_column)American Indian/Alaskan Native
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## Race_factor(race_column)Asian Race_factor(race_column)Black
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
## Race_factor(race_column)Hispanic Race_factor(race_column)Other
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Race_factor(race_column)White
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 0

str(df_encoded)

## 'data.frame': 319795 obs. of 23 variables:


## $ HeartDisease : chr "0" "0" "0" "0" ..
.
## $ BMI : num 16.6 20.3 26.6 24.
2 23.7 ...
## $ Smoking : chr "1" "0" "1" "0" ..
.
## $ AlcoholDrinking : chr "0" "0" "0" "0" ..
.
## $ Stroke : chr "0" "1" "0" "0" ..
.
## $ PhysicalHealth : num 3 0 20 0 28 6 15 5
0 0 ...

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 28 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## $ MentalHealth : num 30 0 30 0 0 0 0 0
0 0 ...
## $ DiffWalking : chr "0" "0" "0" "0" ..
.
## $ Sex : chr "0" "0" "1" "0" ..
.
## $ AgeCategory : chr "55" "80" "65" "75
" ...
## $ Diabetic : num 1 0 1 0 0 0 0 1 0
0 ...
## $ PhysicalActivity : chr "1" "1" "1" "0" ..
.
## $ GenHealth : chr "4" "4" "2" "3" ..
.
## $ SleepTime : num 5 7 8 6 8 12 4 9 5
10 ...
## $ Asthma : chr "1" "0" "1" "0" ..
.
## $ KidneyDisease : chr "0" "0" "0" "0" ..
.
## $ SkinCancer : chr "1" "0" "0" "1" ..
.
## $ Race_factor(race_column)American Indian/Alaskan Native: num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Asian : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Black : num 0 0 0 0 0 1 0 0 0
0 ...
## $ Race_factor(race_column)Hispanic : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Other : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)White : num 1 1 1 1 1 0 1 1 1
1 ...

#Convert specific columns to integers (only independent variables - dependent variable must be a factor)

columns_to_convert <- c("Smoking", "SkinCancer", "KidneyDisease", "Asthma", "GenHealt


h",
"AgeCategory","Stroke", "AlcoholDrinking","Sex", "DiffWalking
", "PhysicalActivity")
df_encoded[columns_to_convert] <- lapply(df_encoded[columns_to_convert], as.numeric)

#Identify numeric variables in the data frame

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 29 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

numeric_vars <- sapply(df_encoded, is.numeric)

str(df_encoded)

## 'data.frame': 319795 obs. of 23 variables:


## $ HeartDisease : chr "0" "0" "0" "0" ..
.
## $ BMI : num 16.6 20.3 26.6 24.
2 23.7 ...
## $ Smoking : num 1 0 1 0 0 1 0 1 0
0 ...
## $ AlcoholDrinking : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Stroke : num 0 1 0 0 0 0 0 0 0
0 ...
## $ PhysicalHealth : num 3 0 20 0 28 6 15 5
0 0 ...
## $ MentalHealth : num 30 0 30 0 0 0 0 0
0 0 ...
## $ DiffWalking : num 0 0 0 0 1 1 0 1 0
1 ...
## $ Sex : num 0 0 1 0 0 0 0 0 0
1 ...
## $ AgeCategory : num 55 80 65 75 40 75
70 80 80 65 ...
## $ Diabetic : num 1 0 1 0 0 0 0 1 0
0 ...
## $ PhysicalActivity : num 1 1 1 0 1 0 1 0 0
1 ...
## $ GenHealth : num 4 4 2 3 4 2 2 3 2
3 ...
## $ SleepTime : num 5 7 8 6 8 12 4 9 5
10 ...
## $ Asthma : num 1 0 1 0 0 0 1 1 0
0 ...
## $ KidneyDisease : num 0 0 0 0 0 0 0 0 1
0 ...
## $ SkinCancer : num 1 0 0 1 0 0 1 0 0
0 ...
## $ Race_factor(race_column)American Indian/Alaskan Native: num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Asian : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Black : num 0 0 0 0 0 1 0 0 0
0 ...
## $ Race_factor(race_column)Hispanic : num 0 0 0 0 0 0 0 0 0

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 30 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

0 ...
## $ Race_factor(race_column)Other : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)White : num 1 1 1 1 1 0 1 1 1
1 ...

SCALING #Apply min-max scaling to the numerical variables

scaled_data <- apply(df_encoded[, numeric_vars], 2, function(x) (x - min(x)) / (max(x


) - min(x)))

#Print the scaled data

print(head(scaled_data))

## BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth


## 1 0.05529398 1 0 0 0.1000000 1
## 2 0.10044670 0 0 1 0.0000000 0
## 3 0.17578172 1 0 0 0.6666667 1
## 4 0.14716890 0 0 0 0.0000000 0
## 5 0.14113244 0 0 0 0.9333333 0
## 6 0.20342871 1 0 0 0.2000000 0
## DiffWalking Sex AgeCategory Diabetic PhysicalActivity GenHealth SleepTime
## 1 0 0 0.5967742 1 1 0.75 0.1739130
## 2 0 0 1.0000000 0 1 0.75 0.2608696
## 3 0 1 0.7580645 1 1 0.25 0.3043478
## 4 0 0 0.9193548 0 0 0.50 0.2173913
## 5 1 0 0.3548387 0 1 0.75 0.3043478
## 6 1 0 0.9193548 0 0 0.25 0.4782609
## Asthma KidneyDisease SkinCancer
## 1 1 0 1
## 2 0 0 0
## 3 1 0 0
## 4 0 0 1
## 5 0 0 0
## 6 0 0 0
## Race_factor(race_column)American Indian/Alaskan Native
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## Race_factor(race_column)Asian Race_factor(race_column)Black
## 1 0 0
## 2 0 0

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 31 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
## Race_factor(race_column)Hispanic Race_factor(race_column)Other
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Race_factor(race_column)White
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 0

#Combine the scaled numeric variables with the non-numeric variables

scaled_dataframe <- cbind(scaled_data, df_encoded[!numeric_vars])


str(scaled_dataframe)

## 'data.frame': 319795 obs. of 23 variables:


## $ BMI : num 0.0553 0.1004 0.17
58 0.1472 0.1411 ...
## $ Smoking : num 1 0 1 0 0 1 0 1 0
0 ...
## $ AlcoholDrinking : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Stroke : num 0 1 0 0 0 0 0 0 0
0 ...
## $ PhysicalHealth : num 0.1 0 0.667 0 0.93
3 ...
## $ MentalHealth : num 1 0 1 0 0 0 0 0 0
0 ...
## $ DiffWalking : num 0 0 0 0 1 1 0 1 0
1 ...
## $ Sex : num 0 0 1 0 0 0 0 0 0
1 ...
## $ AgeCategory : num 0.597 1 0.758 0.91
9 0.355 ...
## $ Diabetic : num 1 0 1 0 0 0 0 1 0
0 ...
## $ PhysicalActivity : num 1 1 1 0 1 0 1 0 0

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 32 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

1 ...
## $ GenHealth : num 0.75 0.75 0.25 0.5
0.75 0.25 0.25 0.5 0.25 0.5 ...
## $ SleepTime : num 0.174 0.261 0.304
0.217 0.304 ...
## $ Asthma : num 1 0 1 0 0 0 1 1 0
0 ...
## $ KidneyDisease : num 0 0 0 0 0 0 0 0 1
0 ...
## $ SkinCancer : num 1 0 0 1 0 0 1 0 0
0 ...
## $ Race_factor(race_column)American Indian/Alaskan Native: num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Asian : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Black : num 0 0 0 0 0 1 0 0 0
0 ...
## $ Race_factor(race_column)Hispanic : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Other : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)White : num 1 1 1 1 1 0 1 1 1
1 ...
## $ HeartDisease : chr "0" "0" "0" "0" ..
.

#Print the scaled data frame

print(head(scaled_dataframe))

## BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth


## 1 0.05529398 1 0 0 0.1000000 1
## 2 0.10044670 0 0 1 0.0000000 0
## 3 0.17578172 1 0 0 0.6666667 1
## 4 0.14716890 0 0 0 0.0000000 0
## 5 0.14113244 0 0 0 0.9333333 0
## 6 0.20342871 1 0 0 0.2000000 0
## DiffWalking Sex AgeCategory Diabetic PhysicalActivity GenHealth SleepTime
## 1 0 0 0.5967742 1 1 0.75 0.1739130
## 2 0 0 1.0000000 0 1 0.75 0.2608696
## 3 0 1 0.7580645 1 1 0.25 0.3043478
## 4 0 0 0.9193548 0 0 0.50 0.2173913
## 5 1 0 0.3548387 0 1 0.75 0.3043478
## 6 1 0 0.9193548 0 0 0.25 0.4782609
## Asthma KidneyDisease SkinCancer
## 1 1 0 1

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 33 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## 2 0 0 0
## 3 1 0 0
## 4 0 0 1
## 5 0 0 0
## 6 0 0 0
## Race_factor(race_column)American Indian/Alaskan Native
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## Race_factor(race_column)Asian Race_factor(race_column)Black
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
## Race_factor(race_column)Hispanic Race_factor(race_column)Other
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Race_factor(race_column)White HeartDisease
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 0 1

max(scaled_dataframe$BMI)

## [1] 1

min(scaled_dataframe$BMI)

## [1] 0

# install.packages("caret", dependencies = c("Depends", "Suggests"))

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 34 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#install.packages('gower')

# install.packages('hardhat')

# install.packages('timechange')

# install.packages('ModelMetrics')

library(ggplot2)
library(lattice)
library(caret)

#SPLITING TEST AND TRAIN DATA SET

#Set the random seed for reproducibility #Specify the proportion of data to be allocated for the test set
#Create the train and test split

set.seed(123)

test_size <- 0.2

split <- createDataPartition(scaled_dataframe$HeartDisease, p = test_size, list = FAL


SE)

#Create the train set #Create the test set

test_set<- scaled_dataframe[split, ]

train_set <- scaled_dataframe[-split, ]

#dimension of test and train set

dim(test_set)

## [1] 63960 23

dim(train_set)

## [1] 255835 23

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 35 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

MODELLING Logistic Regression

#Create the formula for logistic regression

formula <- as.formula( HeartDisease ~ .)

#Convert the “HeartDisease” column to factor in both train_set and test_set

train_set$HeartDisease <- factor(train_set$HeartDisease)


test_set$HeartDisease <- factor(test_set$HeartDisease)

#Train the logistic regression model

weight = 0.8
model <- glm(formula, data = train_set,
family = "binomial",
weights = ifelse(train_set$HeartDisease == 1, weight, 1 - weight)
)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

#Make predictions on the test set

predictions <- predict(model, newdata = test_set, type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :


## prediction from a rank-deficient fit may be misleading

#Convert the predictions to binary values

binary_predictions <- ifelse(predictions > 0.5, 1, 0)

#Calculate accuracy

accuracy <- mean(binary_predictions == test_set$HeartDisease)


cat("Accuracy:", accuracy, "\n")

## Accuracy: 0.8757661

TP <- sum(binary_predictions == 1 & test_set$HeartDisease == 1)


FN <- sum(binary_predictions == 0 & test_set$HeartDisease == 1)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 36 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#recall <- sensitivity(as.factor(test_set$HeartDisease), as.factor(predictions))

recall <- TP / (TP + FN)


cat("Recall:", recall, "\n")

## Recall: 0.4789041

#Calculate precision

precision <- sum(binary_predictions == 1 & test_set$HeartDisease == 1) / sum(binary_p


redictions == 1)
cat("Precision:", precision, "\n")

## Precision: 0.3398574

#Calculate F1 score

f1_score <- 2 * (precision * recall) / (precision + recall)


cat("F1 Score:", f1_score, "\n")

## F1 Score: 0.3975739

#Confusion Matrix

confusionMatrix(data=as.factor(binary_predictions), reference = as.factor(test_set$He


artDisease))

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 37 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## Confusion Matrix and Statistics


##
## Reference
## Prediction 0 1
## 0 53392 2853
## 1 5093 2622
##
## Accuracy : 0.8758
## 95% CI : (0.8732, 0.8783)
## No Information Rate : 0.9144
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3305
##
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.9129
## Specificity : 0.4789
## Pos Pred Value : 0.9493
## Neg Pred Value : 0.3399
## Prevalence : 0.9144
## Detection Rate : 0.8348
## Detection Prevalence : 0.8794
## Balanced Accuracy : 0.6959
##
## 'Positive' Class : 0
##

KNN Classifier

#Train the KNN classifier

library(caret)
k <- 5
formula <- as.formula(HeartDisease ~ .)
model <- train(formula, data = train_set, method = "knn", trControl = trainControl(me
thod = "none"), tuneGrid = data.frame(k = k))

#Make predictions on the test an training set

predictions <- predict(model, newdata = test_set)


predictions_train <- predict(model, newdata = train_set)

#Calculate accuracy

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 38 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

accuracy <- sum(predictions == test_set$HeartDisease) / length(test_set$HeartDisease)


cat("Accuracy:", accuracy, "\n")

## Accuracy: 0.9059412

accuracy_train <- sum(predictions_train == train_set$HeartDisease) / length(train_set


$HeartDisease)
cat("Accuracy_Train:", accuracy_train, "\n")

## Accuracy_Train: 0.9247523

#Calculate true positives

true_positives <- sum(predictions == 1 & test_set$HeartDisease == 1)


print(true_positives)

## [1] 749

#Calculate false positives

false_positives <- sum(predictions == 1 & test_set$HeartDisease == 0)


print(false_positives)

## [1] 1290

#Calculate false negatives

false_negatives <- sum(predictions == 0 & test_set$HeartDisease == 1)


print(false_negatives)

## [1] 4726

#Calculate recall

recall <- true_positives / (true_positives + false_negatives)


cat("Recall:", recall, "\n")

## Recall: 0.1368037

#Calculate precision

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 39 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

precision <- true_positives / (true_positives + false_positives)


cat("Precision:", precision, "\n")

## Precision: 0.3673369

#Calculate F1 score

f1_score <- 2 * (precision * recall) / (precision + recall)


cat("F1 Score:", f1_score, "\n")

## F1 Score: 0.1993612

#Confusion Matrix of the test set

confusionMatrix(data=as.factor(predictions), reference = as.factor(test_set$HeartDise


ase))

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 40 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## Confusion Matrix and Statistics


##
## Reference
## Prediction 0 1
## 0 57195 4726
## 1 1290 749
##
## Accuracy : 0.9059
## 95% CI : (0.9037, 0.9082)
## No Information Rate : 0.9144
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1604
##
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.9779
## Specificity : 0.1368
## Pos Pred Value : 0.9237
## Neg Pred Value : 0.3673
## Prevalence : 0.9144
## Detection Rate : 0.8942
## Detection Prevalence : 0.9681
## Balanced Accuracy : 0.5574
##
## 'Positive' Class : 0
##

#Confusion Matrix of the train set

confusionMatrix(data=as.factor(predictions_train), reference = as.factor(train_set$He


artDisease))

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 41 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## Confusion Matrix and Statistics


##
## Reference
## Prediction 0 1
## 0 231069 16383
## 1 2868 5515
##
## Accuracy : 0.9248
## 95% CI : (0.9237, 0.9258)
## No Information Rate : 0.9144
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.3326
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9877
## Specificity : 0.2518
## Pos Pred Value : 0.9338
## Neg Pred Value : 0.6579
## Prevalence : 0.9144
## Detection Rate : 0.9032
## Detection Prevalence : 0.9672
## Balanced Accuracy : 0.6198
##
## 'Positive' Class : 0
##

#RANDOM FOREST

library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

##
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':


##
## margin

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 42 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## The following object is masked from 'package:dplyr':


##
## combine

#Get the current column names

column_names <- names(train_set)


column_names_test <- names(test_set)

#Replace “/” and ” ” with “_”

new_column_names <- gsub("/", "_", column_names)


new_column_names_test <- gsub("/", "_", column_names_test)

new_column_names <- gsub(" ", "_", new_column_names)


new_column_names_test <- gsub(" ", "_", new_column_names_test)

#Rename columns with similar issues

new_column_names <- sub(".*\\)", "Race_", new_column_names)


new_column_names_test <- sub(".*\\)", "Race_", new_column_names_test)

#Assign the new column names to the dataset

names(train_set) <- new_column_names


names(test_set) <- new_column_names_test

library(dplyr)

#Define the parameter grid for tuning

ntree_vals <- c(25,30,35,40,45,50,100)


mtry_vals <- c(sqrt(ncol(train_set)))
nodesize_vals <- c(20)

#Initialize empty dataframe to store results

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 43 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

results <- data.frame(ntree = integer(),


mtry = numeric(),
nodesize = integer(),
class_weight = character(),
accuracy = numeric(),
recall = numeric(),
precision = numeric(),
f1_score = numeric(),
stringsAsFactors = FALSE)

#Define the best metric value and class weights

best_metric <- 0
best_class_weights <- NULL
best_ntree <- 0
best_mtry <- 0
best_nodesize <- 0

#Iterate over parameter combinations

for (ntree in ntree_vals) {


for (mtry in mtry_vals) {
for (nodesize in nodesize_vals) {
# Iterate over class weight combinations
for (weight_1 in seq(0.3, 0.6, 0.4)) {
weight_0 <- 1 - weight_1
class_weights <- list(weight_0, weight_1)

# Train the Random Forest model with current parameter combination and class
weight
model <- randomForest(HeartDisease ~ .,
data = train_set,
ntree = ntree,
mtry = mtry,
importance = TRUE,
nodesize = nodesize,
classwt = class_weights)

# Make predictions on the test set


predictions <- predict(model, newdata = test_set)

# Calculate metrics
TP <- sum(predictions == 1 & test_set$HeartDisease == 1)
FN <- sum(predictions == 0 & test_set$HeartDisease == 1)

accuracy <- mean(predictions == test_set$HeartDisease)

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 44 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

recall <- TP / (TP + FN)


precision <- sum(predictions == 1 & test_set$HeartDisease == 1) / sum(predict
ions == 1)
f1_score <- 2 * (precision * recall) / (precision + recall)

# Append results to dataframe


results <- results %>%
add_row(ntree = ntree,
mtry = mtry,
nodesize = nodesize,
class_weight = paste(class_weights, collapse = ":"),
accuracy = accuracy,
recall = recall,
precision = precision,
f1_score = f1_score)
# Check if the current class weight combination improves the best metric
if (f1_score > best_metric) {
best_metric <- f1_score
best_class_weights <- class_weights
best_ntree <- ntree
best_mtry <- mtry
best_nodesize <- nodesize
}
}
}
}
}

#View the results

print(results)

## ntree mtry nodesize class_weight accuracy recall precision f1_score


## 1 25 4.795832 20 0.7:0.3 0.8687930 0.4832877 0.3223292 0.3867290
## 2 30 4.795832 20 0.7:0.3 0.8711538 0.4745205 0.3262999 0.3866935
## 3 35 4.795832 20 0.7:0.3 0.8708255 0.4737900 0.3252665 0.3857249
## 4 40 4.795832 20 0.7:0.3 0.8730926 0.4661187 0.3294604 0.3860525
## 5 45 4.795832 20 0.7:0.3 0.8712946 0.4789041 0.3277090 0.3891362
## 6 50 4.795832 20 0.7:0.3 0.8714665 0.4816438 0.3288030 0.3908114
## 7 100 4.795832 20 0.7:0.3 0.8727955 0.4821918 0.3324518 0.3935599

cat('\n')

#Print the best class weights and metric value

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 45 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

cat("Best Class Weights:", paste(best_class_weights, collapse = ":"), "\n")

## Best Class Weights: 0.7:0.3

cat("best_ntree:", best_ntree, "\n")

## best_ntree: 100

cat("best_mtry:", best_mtry, "\n")

## best_mtry: 4.795832

cat("best_nodesize:", best_nodesize, "\n")

## best_nodesize: 20

cat("Best F1 Score:", best_metric, "\n")

## Best F1 Score: 0.3935599

#BEST RANDOM FOREST MODEL

model <- randomForest(HeartDisease ~ .,


data = train_set,
ntree = best_ntree, # Number of trees in the forest
mtry = best_mtry, # Number of variables to consider
at each split
importance = TRUE, # Calculate variable importance
nodesize = best_nodesize, # Minimum size of terminal nodes
classwt = best_class_weights # Assign class weights
)

#Make predictions on the test set

predictions <- predict(model, newdata = test_set)


predictions_train <- predict(model, newdata = train_set)

#Calculate accuracy

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 46 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

accuracy <- mean(predictions == test_set$HeartDisease)


cat("Accuracy for test_set:", accuracy, "\n")

## Accuracy for test_set: 0.8727799

accuracy_train <- mean(predictions_train == train_set$HeartDisease)


cat("Accuracy Training:", accuracy_train, "\n")

## Accuracy Training: 0.9063576

#Recall

TP <- sum(predictions == 1 & test_set$HeartDisease == 1)


FN <- sum(predictions == 0 & test_set$HeartDisease == 1)
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")

## Recall: 0.4785388

TP <- sum(predictions_train == 1 & train_set$HeartDisease == 1)


FN <- sum(predictions_train == 0 & train_set$HeartDisease == 1)
recall_train <- TP / (TP + FN)
cat("Recall_Train:", recall_train, "\n")

## Recall_Train: 0.7005206

#Calculate precision

precision <- sum(predictions == 1 & test_set$HeartDisease == 1) / sum(predictions ==


1)
cat("Precision:", precision, "\n")

## Precision: 0.3315616

precision_train <- sum(predictions_train == 1 & train_set$HeartDisease == 1) / sum(pr


edictions_train == 1)
cat("Precision_train:", precision_train, "\n")

## Precision_train: 0.4685543

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 47 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

#Calculate F1 score

f1_score <- 2 * (precision * recall) / (precision + recall)


cat("F1 Score:", f1_score, "\n")

## F1 Score: 0.3917171

f1_score_train <- 2 * (precision_train * recall_train) / (precision_train + recall_tr


ain)
cat("F1 Score Train:", f1_score_train, "\n")

## F1 Score Train: 0.5615242

#Confusion matrix for test set

confusionMatrix(data=as.factor(predictions), reference = as.factor(test_set$HeartDise


ase))

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 48 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## Confusion Matrix and Statistics


##
## Reference
## Prediction 0 1
## 0 53203 2855
## 1 5282 2620
##
## Accuracy : 0.8728
## 95% CI : (0.8702, 0.8754)
## No Information Rate : 0.9144
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3233
##
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.9097
## Specificity : 0.4785
## Pos Pred Value : 0.9491
## Neg Pred Value : 0.3316
## Prevalence : 0.9144
## Detection Rate : 0.8318
## Detection Prevalence : 0.8765
## Balanced Accuracy : 0.6941
##
## 'Positive' Class : 0
##

#Confusion Matrix for train set

confusionMatrix(data=as.factor(predictions_train), reference = as.factor(train_set$He


artDisease))

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 49 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## Confusion Matrix and Statistics


##
## Reference
## Prediction 0 1
## 0 216538 6558
## 1 17399 15340
##
## Accuracy : 0.9064
## 95% CI : (0.9052, 0.9075)
## No Information Rate : 0.9144
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.5114
##
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.9256
## Specificity : 0.7005
## Pos Pred Value : 0.9706
## Neg Pred Value : 0.4686
## Prevalence : 0.9144
## Detection Rate : 0.8464
## Detection Prevalence : 0.8720
## Balanced Accuracy : 0.8131
##
## 'Positive' Class : 0
##

#Feature Importances

library(caret)
library(ggplot2)

#Calculate feature importances

importances <- varImp(model)

#Sort the importances in descending order

sorted_importances <- importances[order(importances$`0`, decreasing = TRUE), ]


sorted_importances = subset(sorted_importances, select = -c(1) )

#Print feature importances

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 50 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

print(sorted_importances)

## 1
## AgeCategory 57.17185052
## Stroke 48.55089641
## GenHealth 46.21302100
## Sex 38.19345449
## Diabetic 15.65786760
## PhysicalHealth 14.33448787
## KidneyDisease 13.83797069
## DiffWalking 10.79500337
## MentalHealth 10.00377272
## BMI 9.11022265
## Race_White 6.74038712
## AlcoholDrinking 6.29347386
## Race_Black 4.97982135
## Asthma 4.88838298
## Race_Asian 4.00014224
## Race_Hispanic 3.89790718
## PhysicalActivity 3.89514501
## SkinCancer 3.74235929
## SleepTime 3.17222214
## Smoking 2.65329054
## Race_Other 0.05098671
## Race_American_Indian_Alaskan_Native -0.78333496

#Create a dataframe for plotting

importance_df <- data.frame(variable = row.names(sorted_importances),


importance = sorted_importances)

#Plot the feature importances in a horizontal bar chart

ggplot(importance_df, aes(y = reorder(variable, X1), x = X1)) +


geom_bar(stat = "identity", fill = "blue") +
geom_text(aes(label = round(X1,1)), hjust = -0.1, size = 3, color = "black") +
labs(y = "Variable", x = "Importance") +
ggtitle("Feature Importances") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1))

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 51 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

library(caret)

#Define the range of weights to be tested

weight_range <- seq(0.1, 1, by = 0.1)

#Initialize an empty dataframe to store results

results_LR <- data.frame(weights = character(),


accuracy = numeric(),
recall = numeric(),
precision = numeric(),
f1_score = numeric(),
stringsAsFactors = FALSE)

#Perform parameter tuning with different class weights

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 52 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

model <- glm(HeartDisease ~ .,


data = train_set,
family = "binomial")
class_weights <- 'Default'

for (weight in weight_range) {


# Train the logistic regression model with class weights

# Make predictions on the test set


predictions <- predict(model, newdata = test_set, type = "response")

# Convert predictions to factors


predictions <- as.factor(ifelse(predictions > 0.5, 1, 0))

# Calculate evaluation metrics


TP <- sum(predictions == 1 & test_set$HeartDisease == 1)
FN <- sum(predictions == 0 & test_set$HeartDisease == 1)

accuracy <- mean(predictions == test_set$HeartDisease)


#recall <- sensitivity(as.factor(test_set$HeartDisease), as.factor(predictions))
recall <- TP / (TP + FN)
precision <- sum(predictions == 1 & test_set$HeartDisease == 1) / sum(predictions =
= 1)
f1_score <- 2 * (precision * recall) / (precision + recall)

# Append results to the dataframe

results_LR <- rbind(results_LR, data.frame(weights = class_weights,


accuracy = accuracy,
recall = recall,
precision = precision,
f1_score = f1_score,
stringsAsFactors = FALSE))

class_weights <- paste(1 - weight, weight, sep = ":")


model <- glm(HeartDisease ~ .,
data = train_set,
family = "binomial",
weights = ifelse(train_set$HeartDisease == 1, weight, 1 - weight))
}

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :


## prediction from a rank-deficient fit may be misleading

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 53 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :


## prediction from a rank-deficient fit may be misleading

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :


## prediction from a rank-deficient fit may be misleading

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :


## prediction from a rank-deficient fit may be misleading

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :


## prediction from a rank-deficient fit may be misleading

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :


## prediction from a rank-deficient fit may be misleading

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :


## prediction from a rank-deficient fit may be misleading

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :


## prediction from a rank-deficient fit may be misleading

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 54 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :


## prediction from a rank-deficient fit may be misleading

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :


## prediction from a rank-deficient fit may be misleading

## Warning: glm.fit: algorithm did not converge

# Print the results


print(results_LR)

## weights accuracy recall precision f1_score


## 1 Default 0.9161507 0.1132420091 0.5496454 0.1877934272
## 2 0.9:0.1 0.9144153 0.0001826484 1.0000000 0.0003652301
## 3 0.8:0.2 0.9147905 0.0056621005 0.8378378 0.0112481858
## 4 0.7:0.3 0.9154472 0.0235616438 0.6753927 0.0455347688
## 5 0.6:0.4 0.9161664 0.0578995434 0.6084453 0.1057371581
## 6 0.5:0.5 0.9161507 0.1132420091 0.5496454 0.1877934272
## 7 0.4:0.6 0.9140869 0.1961643836 0.4953875 0.2810414759
## 8 0.3:0.7 0.9039556 0.3081278539 0.4173676 0.3545234843
## 9 0.2:0.8 0.8757661 0.4789041096 0.3398574 0.3975739196
## 10 0.1:0.9 0.7791119 0.7364383562 0.2411916 0.3633741889

file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 55 of 55

You might also like