Predict Heart Disease
Predict Heart Disease
Predict Heart Disease
library(readxl)
# Load the heart disease data set (heart_2020_cleaned.csv) into `heartd`.
# The path is assembled with file.path() so it stays one unbroken string:
# a line break inside the quoted path would become part of the file name
# and read.csv() would fail to open the file.
heartd <- read.csv(
  file.path(
    "/Users/senakaya/Desktop/UCSC/data analysis",
    "final-Personal Key Indicators of Heart Disease",
    "heart_2020_cleaned.csv"
  )
)
# Preview the first rows of the data
head(heartd)
#dimension of dataset
dim(heartd)
## [1] 319795 18
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 1 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
# NOTE(review): this passes FALSE to sum() as an argument named
# `heartd.stringsAsFactors`, so it adds up a single FALSE and prints 0
# regardless of what `heartd` contains; it does not inspect the data frame.
# Intent unclear - confirm whether a different check was meant here.
sum(heartd.stringsAsFactors=FALSE)
## [1] 0
sum(is.na(heartd))
## [1] 0
str(heartd)
summary(heartd)
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 2 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#table of HeartDisease
table(heartd$HeartDisease)
##
## No Yes
## 292422 27373
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 3 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
library(dplyr)
##
## Attaching package: 'dplyr'
library(ggplot2)
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 4 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 5 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 6 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 7 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
options(scipen = 999)
#Create the stacked bar plot for two categorical variable heart disease and smoking
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 8 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
## $title
## [1] "Bar Plot of Heart Disease and Smoking"
##
## attr(,"class")
## [1] "labels"
#Create the stacked bar plot for two categorical variable heart disease and alcohol drinking
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 9 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#Create the bar stacked plot for two categorical variable heart disease and sex
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 10 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#Create the stacked bar plot for two categorical variable heart disease and asthma
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 11 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#Create the stacked bar plot for two categorical variable heart disease and kidney disease
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 12 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#Create the stacked bar plot for two categorical variable heart disease and skin cancer
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 13 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#Count the frequencies of each combination of the two variables #Create the bar chart
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 14 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#Count the frequencies of each combination of the two variables #Define colors for the bars #Create the bar chart
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 15 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 16 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 17 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 18 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 19 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
str(heartd_num)
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 21 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
library(reshape2)
library(RColorBrewer)
library(corrplot)
library(ggcorrplot)
#Create correlation matrix #Plot correlation matrix #Calculate the correlation matrix
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 22 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
print(cor_matrix)
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 23 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 24 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#Find correlations above 0.75 #Check if there are any high correlations
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 25 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
# Locate the cells of `cor_matrix` whose correlation is above 0.75 but below 1
# (the `< 1` bound presumably skips the diagonal, where a variable is compared
# with itself). `arr.ind = TRUE` returns one (row, col) index pair per hit.
high_corr <- which(cor_matrix > 0.75 & cor_matrix < 1, arr.ind = TRUE)
if (nrow(high_corr) == 0) {
  cat("No correlations above 0.75 found.\n")
} else {
  # Print the pairs of variables with high correlation.
  # NOTE(review): if cor_matrix is symmetric, every pair is listed twice
  # (once as [i, j] and once as [j, i]).
  for (i in seq_len(nrow(high_corr))) {
    row_num <- high_corr[i, "row"]
    col_num <- high_corr[i, "col"]
    row_name <- rownames(cor_matrix)[row_num]
    col_name <- colnames(cor_matrix)[col_num]
    correlation <- cor_matrix[row_num, col_num]
    # The trailing "\n" must stay on one line; split across lines it prints
    # a newline followed by a stray "n".
    cat("High correlation (", correlation, ") between", row_name, "and", col_name, "\n")
  }
}
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 26 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
print(head(df_encoded))
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 27 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
## 5 0 0 0
## 6 0 0 0
## Race_factor(race_column)American Indian/Alaskan Native
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## Race_factor(race_column)Asian Race_factor(race_column)Black
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
## Race_factor(race_column)Hispanic Race_factor(race_column)Other
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Race_factor(race_column)White
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 0
str(df_encoded)
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 28 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
## $ MentalHealth : num 30 0 30 0 0 0 0 0
0 0 ...
## $ DiffWalking : chr "0" "0" "0" "0" ..
.
## $ Sex : chr "0" "0" "1" "0" ..
.
## $ AgeCategory : chr "55" "80" "65" "75
" ...
## $ Diabetic : num 1 0 1 0 0 0 0 1 0
0 ...
## $ PhysicalActivity : chr "1" "1" "1" "0" ..
.
## $ GenHealth : chr "4" "4" "2" "3" ..
.
## $ SleepTime : num 5 7 8 6 8 12 4 9 5
10 ...
## $ Asthma : chr "1" "0" "1" "0" ..
.
## $ KidneyDisease : chr "0" "0" "0" "0" ..
.
## $ SkinCancer : chr "1" "0" "0" "1" ..
.
## $ Race_factor(race_column)American Indian/Alaskan Native: num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Asian : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Black : num 0 0 0 0 0 1 0 0 0
0 ...
## $ Race_factor(race_column)Hispanic : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Other : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)White : num 1 1 1 1 1 0 1 1 1
1 ...
#Convert specific columns to integers (only independent variables - dependent variable must be a factor)
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 29 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
str(df_encoded)
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 30 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
0 ...
## $ Race_factor(race_column)Other : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)White : num 1 1 1 1 1 0 1 1 1
1 ...
print(head(scaled_data))
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 31 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
## Race_factor(race_column)Hispanic Race_factor(race_column)Other
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Race_factor(race_column)White
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 0
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 32 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
1 ...
## $ GenHealth : num 0.75 0.75 0.25 0.5
0.75 0.25 0.25 0.5 0.25 0.5 ...
## $ SleepTime : num 0.174 0.261 0.304
0.217 0.304 ...
## $ Asthma : num 1 0 1 0 0 0 1 1 0
0 ...
## $ KidneyDisease : num 0 0 0 0 0 0 0 0 1
0 ...
## $ SkinCancer : num 1 0 0 1 0 0 1 0 0
0 ...
## $ Race_factor(race_column)American Indian/Alaskan Native: num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Asian : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Black : num 0 0 0 0 0 1 0 0 0
0 ...
## $ Race_factor(race_column)Hispanic : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Other : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)White : num 1 1 1 1 1 0 1 1 1
1 ...
## $ HeartDisease : chr "0" "0" "0" "0" ..
.
print(head(scaled_dataframe))
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 33 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
## 2 0 0 0
## 3 1 0 0
## 4 0 0 1
## 5 0 0 0
## 6 0 0 0
## Race_factor(race_column)American Indian/Alaskan Native
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## Race_factor(race_column)Asian Race_factor(race_column)Black
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
## Race_factor(race_column)Hispanic Race_factor(race_column)Other
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Race_factor(race_column)White HeartDisease
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 0 1
max(scaled_dataframe$BMI)
## [1] 1
min(scaled_dataframe$BMI)
## [1] 0
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 34 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#install.packages('gower')
# install.packages('hardhat')
# install.packages('timechange')
# install.packages('ModelMetrics')
library(ggplot2)
library(lattice)
library(caret)
#Set the random seed for reproducibility #Specify the proportion of data to be allocated for the test set
#Create the train and test split
set.seed(123)
test_set<- scaled_dataframe[split, ]
dim(test_set)
## [1] 63960 23
dim(train_set)
## [1] 255835 23
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 35 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
# Weighted logistic regression on the training set.
# Rows with HeartDisease == 1 receive case weight `weight`; every other row
# receives 1 - weight.
weight <- 0.8
model <- glm(
  formula,
  data = train_set,
  family = "binomial",
  weights = ifelse(train_set$HeartDisease == 1, weight, 1 - weight)
)
#Calculate accuracy
## Accuracy: 0.8757661
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 36 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
## Recall: 0.4789041
#Calculate precision
## Precision: 0.3398574
#Calculate F1 score
## F1 Score: 0.3975739
#Confusion Matrix
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 37 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
KNN Classifier
library(caret)
# Fit a k-nearest-neighbours classifier with caret using a single, fixed k.
# trainControl(method = "none") together with a one-row tuneGrid makes train()
# fit exactly one model instead of resampling over candidate values.
k <- 5
formula <- as.formula(HeartDisease ~ .)
model <- train(
  formula,
  data = train_set,
  method = "knn",
  trControl = trainControl(method = "none"),
  tuneGrid = data.frame(k = k)
)
#Calculate accuracy
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 38 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
## Accuracy: 0.9059412
## Accuracy_Train: 0.9247523
## [1] 749
## [1] 1290
## [1] 4726
#Calculate recall
## Recall: 0.1368037
#Calculate precision
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 39 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
## Precision: 0.3673369
#Calculate F1 score
## F1 Score: 0.1993612
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 40 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 41 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#RANDOM FOREST
library(randomForest)
## randomForest 4.7-1.1
##
## Attaching package: 'randomForest'
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 42 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
library(dplyr)
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 43 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
best_metric <- 0
best_class_weights <- NULL
best_ntree <- 0
best_mtry <- 0
best_nodesize <- 0
# Train the Random Forest model with current parameter combination and class weight
model <- randomForest(HeartDisease ~ .,
data = train_set,
ntree = ntree,
mtry = mtry,
importance = TRUE,
nodesize = nodesize,
classwt = class_weights)
# Calculate metrics
TP <- sum(predictions == 1 & test_set$HeartDisease == 1)
FN <- sum(predictions == 0 & test_set$HeartDisease == 1)
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 44 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
print(results)
cat('\n')
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 45 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
## best_ntree: 100
## best_mtry: 4.795832
## best_nodesize: 20
#Calculate accuracy
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 46 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#Recall
## Recall: 0.4785388
## Recall_Train: 0.7005206
#Calculate precision
## Precision: 0.3315616
## Precision_train: 0.4685543
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 47 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#Calculate F1 score
## F1 Score: 0.3917171
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 48 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 49 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
#Feature Importances
library(caret)
library(ggplot2)
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 50 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
print(sorted_importances)
## 1
## AgeCategory 57.17185052
## Stroke 48.55089641
## GenHealth 46.21302100
## Sex 38.19345449
## Diabetic 15.65786760
## PhysicalHealth 14.33448787
## KidneyDisease 13.83797069
## DiffWalking 10.79500337
## MentalHealth 10.00377272
## BMI 9.11022265
## Race_White 6.74038712
## AlcoholDrinking 6.29347386
## Race_Black 4.97982135
## Asthma 4.88838298
## Race_Asian 4.00014224
## Race_Hispanic 3.89790718
## PhysicalActivity 3.89514501
## SkinCancer 3.74235929
## SleepTime 3.17222214
## Smoking 2.65329054
## Race_Other 0.05098671
## Race_American_Indian_Alaskan_Native -0.78333496
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 51 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
library(caret)
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 52 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 53 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 54 of 55
Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 55 of 55