Predict Heart Disease

Data Analysis - Final Project-Sena 6/24/23, 10:00 PM
Data Analysis - Final Project-Sena

SenaKaya
2023-06-09
#read the dataset #first 5 row
library(readxl)
heartd<- read.csv("/Users/senakaya/Desktop/UCSC/data analysis/final-Personal Key Indi
cators of Heart Disease/heart_2020_cleaned.csv")
head(heartd)
## HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth

## 1 No 16.60 Yes No No 3 30
## 2 No 20.34 No No Yes 0 0
## 3 No 26.58 Yes No No 20 30
## 4 No 24.21 No No No 0 0
## 5 No 23.71 No No No 28 0
## 6 Yes 28.87 Yes No No 6 0
## DiffWalking Sex AgeCategory Race Diabetic PhysicalActivity GenHealth
## 1 No Female 55-59 White Yes Yes Very good
## 2 No Female 80 or older White No Yes Very good
## 3 No Male 65-69 White Yes Yes Fair
## 4 No Female 75-79 White No No Good
## 5 Yes Female 40-44 White No Yes Very good
## 6 Yes Female 75-79 Black No No Fair
## SleepTime Asthma KidneyDisease SkinCancer
## 1 5 Yes No Yes
## 2 7 No No No
## 3 8 Yes No No
## 4 6 No No Yes
## 5 8 No No No
## 6 12 No No No
#dimension of dataset
dim(heartd)
## [1] 319795 18
#to checks number of null values
file:///Users/senakaya/Desktop/UCSC/data%20analysis/PredictHeartDiseasefor18indicator-R%20Project-Sena.html Page 1 of 55
sum(heartd.stringsAsFactors=FALSE)
## [1] 0
sum(is.na(heartd))
## [1] 0
#a list of objects and their structure
str(heartd)
## 'data.frame': 319795 obs. of 18 variables:

## $ HeartDisease : chr "No" "No" "No" "No" ...
## $ BMI : num 16.6 20.3 26.6 24.2 23.7 ...
## $ Smoking : chr "Yes" "No" "Yes" "No" ...
## $ AlcoholDrinking : chr "No" "No" "No" "No" ...
## $ Stroke : chr "No" "Yes" "No" "No" ...
## $ PhysicalHealth : num 3 0 20 0 28 6 15 5 0 0 ...
## $ MentalHealth : num 30 0 30 0 0 0 0 0 0 0 ...
## $ DiffWalking : chr "No" "No" "No" "No" ...
## $ Sex : chr "Female" "Female" "Male" "Female" ...
## $ AgeCategory : chr "55-59" "80 or older" "65-69" "75-79" ...
## $ Race : chr "White" "White" "White" "White" ...
## $ Diabetic : chr "Yes" "No" "Yes" "No" ...
## $ PhysicalActivity: chr "Yes" "Yes" "Yes" "No" ...
## $ GenHealth : chr "Very good" "Very good" "Fair" "Good" ...
## $ SleepTime : num 5 7 8 6 8 12 4 9 5 10 ...
## $ Asthma : chr "Yes" "No" "Yes" "No" ...
## $ KidneyDisease : chr "No" "No" "No" "No" ...
## $ SkinCancer : chr "Yes" "No" "No" "Yes" ...
#summary of the dataset
summary(heartd)
## HeartDisease BMI Smoking AlcoholDrinking

## Length:319795 Min. :12.02 Length:319795 Length:319795
## Class :character 1st Qu.:24.03 Class :character Class :character
## Mode :character Median :27.34 Mode :character Mode :character
## Mean :28.33
## 3rd Qu.:31.42
## Max. :94.85
## Stroke PhysicalHealth MentalHealth DiffWalking
## Length:319795 Min. : 0.000 Min. : 0.000 Length:319795
## Class :character 1st Qu.: 0.000 1st Qu.: 0.000 Class :character
## Mode :character Median : 0.000 Median : 0.000 Mode :character
## Mean : 3.372 Mean : 3.898
## 3rd Qu.: 2.000 3rd Qu.: 3.000
## Max. :30.000 Max. :30.000
## Sex AgeCategory Race Diabetic
## Length:319795 Length:319795 Length:319795 Length:319795
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## PhysicalActivity GenHealth SleepTime Asthma
## Length:319795 Length:319795 Min. : 1.000 Length:319795
## Class :character Class :character 1st Qu.: 6.000 Class :character
## Mode :character Mode :character Median : 7.000 Mode :character
## Mean : 7.097
## 3rd Qu.: 8.000
## Max. :24.000
## KidneyDisease SkinCancer
## Length:319795 Length:319795
## Class :character Class :character
## Mode :character Mode :character
##
##
##
#table of HeartDisease
table(heartd$HeartDisease)
##
## No Yes
## 292422 27373
#the number of diﬀerent values for each column
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':

##
## filter, lag
## The following objects are masked from 'package:base':

##
## intersect, setdiff, setequal, union
heartd %>% summarise_all(n_distinct)

## 1 2 3604 2 2 2 31 31
## DiffWalking Sex AgeCategory Race Diabetic PhysicalActivity GenHealth
## 1 2 2 13 6 4 2 5
## SleepTime Asthma KidneyDisease SkinCancer
## 1 24 2 2 2
#EDA (EXPLORATORY DATA ANALYSIS) #histogram of BMI(Body Mass Index)
library(ggplot2)
p<-ggplot(data = heartd, aes(x = BMI)) + geom_histogram(binwidth = 3,fill='blue',colo

ur="red")+ coord_cartesian(xlim = c(10, 60))
p+ geom_vline(aes(xintercept=mean(BMI)), color="yellow", linetype="dashed", linewidth
=1)
#histogram of Physical Health
p<-ggplot(data = heartd, aes(x = heartd$PhysicalHealth)) + geom_histogram(binwidth =

3,fill='blue',colour="red")+ coord_cartesian(xlim = c(0, 30))
p+ geom_vline(aes(xintercept=mean(heartd$PhysicalHealth)), color="yellow", linetype="
dashed", size=1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Use of `heartd$PhysicalHealth` is discouraged.

## ℹ Use `PhysicalHealth` instead.
## Use of `heartd$PhysicalHealth` is discouraged.
## ℹ Use `PhysicalHealth` instead.
#histogram of Mental Health
p<-ggplot(data = heartd, aes(x = MentalHealth )) + geom_histogram(binwidth = 3,fill='

blue',colour="red")+ coord_cartesian(xlim = c(0, 30))
p+ geom_vline(aes(xintercept=mean(MentalHealth)), color="yellow", linetype="dashed",
size=1)
#histogram of Sleep Time
p<-ggplot(data = heartd, aes(x = SleepTime)) + geom_histogram(binwidth = 1,fill='blue

',colour="red")+ coord_cartesian(xlim = c(2, 12))
p+ geom_vline(aes(xintercept=mean(SleepTime)), color="yellow", linetype="dashed", siz
e=1)
#remove the scientific notations from the axis
options(scipen = 999)
#Create the stacked bar plot for two categorical variable heart disease and smoking
ggplot(heartd, aes(x = HeartDisease, fill = Smoking)) +

geom_bar() +
xlab("HeartDisease") +
ylab("Count") +
coord_cartesian(ylim = c(0, 300000))
ggtitle("Bar Plot of Heart Disease and Smoking")
## $title
## [1] "Bar Plot of Heart Disease and Smoking"
##
## attr(,"class")
## [1] "labels"
#Create the stacked bar plot for two categorical variable heart disease and alcohol drinking
ggplot(heartd, aes(x = HeartDisease, fill = AlcoholDrinking)) +

geom_bar() +
ylab("Count") +
coord_cartesian(ylim = c(0, 300000))+
ggtitle("Bar Plot of Heart Disease and Alcohol Drinking")
#Create the bar stacked plot for two categorical variable heart disease and sex
ggplot(heartd, aes(x = HeartDisease, fill = Sex)) +

geom_bar() +
ylab("Count") +
ggtitle("Bar Plot of Heart Disease and Sex")
#Create the stacked bar plot for two categorical variable heart disease and asthma
ggplot(heartd, aes(x = HeartDisease, fill = Asthma)) +

geom_bar() +
ylab("Count") +
ggtitle("Bar Plot of Heart Disease and Asthma")
#Create the stacked bar plot for two categorical variable heart disease and kidney disease
ggplot(heartd, aes(x = HeartDisease, fill = KidneyDisease)) +

geom_bar() +
ylab("Count") +
ggtitle("Bar Plot of Heart Disease and Kidney Disease")
#Create the stacked bar plot for two categorical variable heart disease and skin cancer
ggplot(heartd, aes(x = HeartDisease, fill = SkinCancer)) +

geom_bar() +
ylab("Count") +
ggtitle("Bar Plot of Heart Disease and Skin Cancer")
#Count the frequencies of each combination of the two variables #Create the bar chart
counts <- table(heartd$HeartDisease, heartd$Smoking)
barplot(counts, beside = TRUE, legend = TRUE,

xlab = "HeartDisease", ylab = "Count",
main = "Bar Chart of Heart Disease and Smoking")
#Count the frequencies of each combination of the two variables #Define colors for the bars #Create the bar
chart
counts <- table(heartd$HeartDisease, heartd$AgeCategory)
colors <- c("yellow", "green")

main = "Bar Chart of Heart Disease and Age Category", col = colors)
#Count the frequencies of each combination of the two variables
counts <- table(heartd$HeartDisease, heartd$Race)

main = "Bar Chart of HeartDisease and Race", col = colors)
#Count the frequencies of each combination of the two variables
counts <- table(heartd$HeartDisease, heartd$GenHealth)

main = "Bar Chart of HeartDisease and GenHealth", col = colors)
Count the frequencies of each combination

of the two variables
counts <- table(heartd$HeartDisease, heartd$SleepTime)

main = "Bar Chart of HeartDisease and Sleep Time", col = colors)
Count the frequencies of each combination

of the two variables
counts <- table(heartd$HeartDisease, heartd$PhysicalHealth)

main = "Bar Chart of HeartDisease and Physical Health", col = colors)
Replace values in the data frame

heartd <- replace(heartd, heartd == 'Yes', 1)
heartd <- replace(heartd, heartd == 'No', 0)
heartd <- replace(heartd, heartd == 'Male', 1)
heartd <- replace(heartd, heartd == 'Female', 0)
heartd <- replace(heartd, heartd == 'No, borderline diabetes', 0)
heartd <- replace(heartd, heartd == 'Yes (during pregnancy)', 1)
#ordinal
heartd <- replace(heartd, heartd == 'Excellent', 5)
heartd <- replace(heartd, heartd == 'Very good', 4)
heartd <- replace(heartd, heartd == 'Good', 3)
heartd <- replace(heartd, heartd == 'Fair', 2)
heartd <- replace(heartd, heartd == 'Poor', 1)
Convert ‘Diabetic’ column to integer

heartd$Diabetic <- as.numeric(heartd$Diabetic)
Extract the first two characters from the

“Age” column
heartd$AgeCategory <- substr(heartd$AgeCategory, 1, 2)
Print the first five row of modified “Age”

column
print(heartd$AgeCategory[1:5])
## [1] "55" "80" "65" "75" "40"
#convert all “0”s and “1”s to integer except c( “Race”)
num_columns <- setdiff(colnames(heartd), c( "Race"))
heartd_num <- as.data.frame(lapply(heartd[num_columns], as.numeric))
str(heartd_num)

## $ HeartDisease : num 0 0 0 0 0 1 0 0 0 0 ...
## $ BMI : num 16.6 20.3 26.6 24.2 23.7 ...
## $ Smoking : num 1 0 1 0 0 1 0 1 0 0 ...
## $ AlcoholDrinking : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Stroke : num 0 1 0 0 0 0 0 0 0 0 ...
## $ DiffWalking : num 0 0 0 0 1 1 0 1 0 1 ...
## $ Sex : num 0 0 1 0 0 0 0 0 0 1 ...
## $ AgeCategory : num 55 80 65 75 40 75 70 80 80 65 ...
## $ Diabetic : num 1 0 1 0 0 0 0 1 0 0 ...
## $ PhysicalActivity: num 1 1 1 0 1 0 1 0 0 1 ...
## $ GenHealth : num 4 4 2 3 4 2 2 3 2 3 ...
## $ SleepTime : num 5 7 8 6 8 12 4 9 5 10 ...
## $ Asthma : num 1 0 1 0 0 0 1 1 0 0 ...
## $ KidneyDisease : num 0 0 0 0 0 0 0 0 1 0 ...
## $ SkinCancer : num 1 0 0 1 0 0 1 0 0 0 ...
library(reshape2)
library(RColorBrewer)
library(corrplot)
## corrplot 0.92 loaded
library(ggcorrplot)
#Create correlation matrix #Plot correlation matrix #Calculate the correlation matrix
cor_matrix <- cor(heartd_num)
ggcorrplot(cor_matrix, type = "lower") +

geom_tile() +
scale_fill_gradientn(colours = brewer.pal(11, "RdBu"),
limits = c(-1, 1),
breaks = seq(-1, 1, by = 0.2)) +
labs(x = "", y = "") +

theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## Scale for fill is already present.

## Adding another scale for fill, which will replace the existing scale.
cor_matrix <- cor(heartd_num)
print(cor_matrix)
## HeartDisease BMI Smoking AlcoholDrinking

## HeartDisease 1.000000000 0.051803191 0.10776416 -0.032079743
## BMI 0.051803191 1.000000000 0.02311811 -0.038816223
## Smoking 0.107764156 0.023118112 1.00000000 0.111767520
## AlcoholDrinking -0.032079743 -0.038816223 0.11176752 1.000000000
## Stroke 0.196835299 0.019732982 0.06122604 -0.019857914
## PhysicalHealth 0.170720972 0.109787544 0.11535241 -0.017254288
## MentalHealth 0.028590715 0.064130569 0.08515729 0.051281973
## DiffWalking 0.201258049 0.181678264 0.12007416 -0.035327583
## Sex 0.070040476 0.026939645 0.08505249 0.004200142
## AgeCategory 0.232324637 -0.001740121 0.13038380 -0.058836643
## Diabetic 0.174782320 0.200442834 0.05554379 -0.057786229
## PhysicalActivity -0.100029934 -0.150615994 -0.09717377 0.017486983
## GenHealth -0.243182458 -0.230719780 -0.17471733 0.029780388
## SleepTime 0.008326647 -0.051822254 -0.03033564 -0.005065451
## Asthma 0.041444151 0.092345019 0.02414853 -0.002202100

## KidneyDisease 0.145197099 0.050767533 0.03491969 -0.028280092
## SkinCancer 0.093316878 -0.033643619 0.03397739 -0.005702370
## Stroke PhysicalHealth MentalHealth DiffWalking
## HeartDisease 0.196835299 0.17072097 0.02859071 0.20125805
## BMI 0.019732982 0.10978754 0.06413057 0.18167826
## Smoking 0.061226040 0.11535241 0.08515729 0.12007416
## AlcoholDrinking -0.019857914 -0.01725429 0.05128197 -0.03532758
## Stroke 1.000000000 0.13701383 0.04646706 0.17414321
## PhysicalHealth 0.137013827 1.00000000 0.28798667 0.42837280
## MentalHealth 0.046467061 0.28798667 1.00000000 0.15223467
## DiffWalking 0.174143214 0.42837280 0.15223467 1.00000000
## Sex -0.003091055 -0.04090384 -0.10005847 -0.06885956
## AgeCategory 0.137279527 0.11078930 -0.15545468 0.24255230
## Diabetic 0.104466795 0.15397533 0.02970857 0.20925469
## PhysicalActivity -0.079455195 -0.23228318 -0.09580810 -0.27852396
## GenHealth -0.168090049 -0.48269718 -0.24162528 -0.41379735
## SleepTime 0.011899981 -0.06138663 -0.11971679 -0.02221636
## Asthma 0.038866140 0.11790658 0.11400817 0.10322205
## KidneyDisease 0.091166841 0.14219718 0.03728113 0.15306375
## SkinCancer 0.048116104 0.04169969 -0.03341219 0.06484040
## Sex AgeCategory Diabetic PhysicalActivity
## HeartDisease 0.070040476 0.232324637 0.174782320 -0.100029934
## BMI 0.026939645 -0.001740121 0.200442834 -0.150615994
## Smoking 0.085052486 0.130383805 0.055543794 -0.097173766
## AlcoholDrinking 0.004200142 -0.058836643 -0.057786229 0.017486983
## Stroke -0.003091055 0.137279527 0.104466795 -0.079455195
## PhysicalHealth -0.040903839 0.110789301 0.153975332 -0.232283177
## MentalHealth -0.100058473 -0.155454678 0.029708569 -0.095808105
## DiffWalking -0.068859559 0.242552299 0.209254691 -0.278523964
## Sex 1.000000000 -0.067681898 -0.002054385 0.048246846
## AgeCategory -0.067681898 1.000000000 0.196801318 -0.121465289
## Diabetic -0.002054385 0.196801318 1.000000000 -0.136599474
## PhysicalActivity 0.048246846 -0.121465289 -0.136599474 1.000000000
## GenHealth 0.025947385 -0.188235591 -0.270245831 0.281248774
## SleepTime -0.015703748 0.103071031 0.003452185 0.003848841
## Asthma -0.069191116 -0.058234926 0.046886332 -0.041525882
## KidneyDisease -0.009083858 0.122696590 0.149112620 -0.081827321
## SkinCancer 0.013433800 0.262207609 0.034077309 -0.001327810
## GenHealth SleepTime Asthma KidneyDisease
## HeartDisease -0.24318246 0.008326647 0.0414441511 0.145197099
## BMI -0.23071978 -0.051822254 0.0923450190 0.050767533
## Smoking -0.17471733 -0.030335635 0.0241485320 0.034919686
## AlcoholDrinking 0.02978039 -0.005065451 -0.0022021002 -0.028280092
## Stroke -0.16809005 0.011899981 0.0388661403 0.091166841
## PhysicalHealth -0.48269718 -0.061386632 0.1179065802 0.142197185
## MentalHealth -0.24162528 -0.119716788 0.1140081744 0.037281128
## DiffWalking -0.41379735 -0.022216357 0.1032220488 0.153063752

## Sex 0.02594739 -0.015703748 -0.0691911164 -0.009083858
## AgeCategory -0.18823559 0.103071031 -0.0582349260 0.122696590
## Diabetic -0.27024583 0.003452185 0.0468863320 0.149112620
## PhysicalActivity 0.28124877 0.003848841 -0.0415258818 -0.081827321
## GenHealth 1.00000000 0.063071012 -0.1373424390 -0.174642269
## SleepTime 0.06307101 1.000000000 -0.0482452803 0.006237934
## Asthma -0.13734244 -0.048245280 1.0000000000 0.039707000
## KidneyDisease -0.17464227 0.006237934 0.0397070003 1.000000000
## SkinCancer -0.05064137 0.041266167 -0.0003964769 0.061816217
## SkinCancer
## HeartDisease 0.0933168777
## BMI -0.0336436185
## Smoking 0.0339773873
## AlcoholDrinking -0.0057023705
## Stroke 0.0481161045
## PhysicalHealth 0.0416996855
## MentalHealth -0.0334121907
## DiffWalking 0.0648404018
## Sex 0.0134337997
## AgeCategory 0.2622076094
## Diabetic 0.0340773087
## PhysicalActivity -0.0013278105
## GenHealth -0.0506413674
## SleepTime 0.0412661675
## Asthma -0.0003964769
## KidneyDisease 0.0618162165
## SkinCancer 1.0000000000
#Find correlations above 0.75 #Check if there are any high correlations
high_corr <- which(cor_matrix > 0.75 & cor_matrix < 1, arr.ind = TRUE)
if (nrow(high_corr) == 0) {
cat("No correlations above 0.75 found.\n")
} else {
# Print the pairs of variables with high correlation
for (i in 1:nrow(high_corr)) {
row_num <- high_corr[i, "row"]
col_num <- high_corr[i, "col"]
row_name <- rownames(cor_matrix)[row_num]
col_name <- colnames(cor_matrix)[col_num]
correlation <- cor_matrix[row_num, col_num]
cat("High correlation (", correlation, ") between", row_name, "and", col_name, "\
n")
}
}
## No correlations above 0.75 found.
#Create a copy of the original data frame
df_encoded <- heartd

str(df_encoded)

## $ HeartDisease : chr "0" "0" "0" "0" ...
## $ BMI : num 16.6 20.3 26.6 24.2 23.7 ...
## $ Smoking : chr "1" "0" "1" "0" ...
## $ AlcoholDrinking : chr "0" "0" "0" "0" ...
## $ Stroke : chr "0" "1" "0" "0" ...
## $ DiffWalking : chr "0" "0" "0" "0" ...
## $ Sex : chr "0" "0" "1" "0" ...
## $ AgeCategory : chr "55" "80" "65" "75" ...
## $ Race : chr "White" "White" "White" "White" ...
## $ Diabetic : num 1 0 1 0 0 0 0 1 0 0 ...
## $ PhysicalActivity: chr "1" "1" "1" "0" ...
## $ GenHealth : chr "4" "4" "2" "3" ...
## $ SleepTime : num 5 7 8 6 8 12 4 9 5 10 ...
## $ Asthma : chr "1" "0" "1" "0" ...
## $ KidneyDisease : chr "0" "0" "0" "0" ...
## $ SkinCancer : chr "1" "0" "0" "1" ...
#ONE-HOT ENCODING #Perform one-hot encoding for the “Race” column
df_encoded <- heartd
#Extract the “Race” column
race_column <- df_encoded$Race
#Perform one-hot encoding for the “Race” column
encoded_cols <- model.matrix(~ 0 + factor(race_column))

colnames(encoded_cols) <- paste("Race", colnames(encoded_cols), sep = "_")
#Add the encoded columns to the data set
df_encoded <- cbind(df_encoded, encoded_cols)
#Remove the original “Race” column
df_encoded <- df_encoded[, !names(df_encoded) %in% "Race"]
#Print the encoded data set
print(head(df_encoded))

## 1 0 16.60 1 0 0 3 30
## 2 0 20.34 0 0 1 0 0
## 3 0 26.58 1 0 0 20 30
## 4 0 24.21 0 0 0 0 0
## 5 0 23.71 0 0 0 28 0
## 6 1 28.87 1 0 0 6 0
## DiffWalking Sex AgeCategory Diabetic PhysicalActivity GenHealth SleepTime
## 1 0 0 55 1 1 4 5
## 2 0 0 80 0 1 4 7
## 3 0 1 65 1 1 2 8
## 4 0 0 75 0 0 3 6
## 5 1 0 40 0 1 4 8
## 6 1 0 75 0 0 2 12
## Asthma KidneyDisease SkinCancer
## 1 1 0 1
## 2 0 0 0
## 3 1 0 0
## 4 0 0 1
## 5 0 0 0
## 6 0 0 0
## Race_factor(race_column)American Indian/Alaskan Native
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## Race_factor(race_column)Asian Race_factor(race_column)Black
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
## Race_factor(race_column)Hispanic Race_factor(race_column)Other
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Race_factor(race_column)White
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 0
str(df_encoded)

## $ HeartDisease : chr "0" "0" "0" "0" ..
.
## $ BMI : num 16.6 20.3 26.6 24.
2 23.7 ...
## $ Smoking : chr "1" "0" "1" "0" ..
.
## $ AlcoholDrinking : chr "0" "0" "0" "0" ..
.
## $ Stroke : chr "0" "1" "0" "0" ..
.
## $ PhysicalHealth : num 3 0 20 0 28 6 15 5
0 0 ...
## $ MentalHealth : num 30 0 30 0 0 0 0 0
0 0 ...
## $ DiffWalking : chr "0" "0" "0" "0" ..
.
## $ Sex : chr "0" "0" "1" "0" ..
.
## $ AgeCategory : chr "55" "80" "65" "75
" ...
## $ Diabetic : num 1 0 1 0 0 0 0 1 0
0 ...
## $ PhysicalActivity : chr "1" "1" "1" "0" ..
.
## $ GenHealth : chr "4" "4" "2" "3" ..
.
## $ SleepTime : num 5 7 8 6 8 12 4 9 5
10 ...
## $ Asthma : chr "1" "0" "1" "0" ..
.
## $ KidneyDisease : chr "0" "0" "0" "0" ..
.
## $ SkinCancer : chr "1" "0" "0" "1" ..
.
## $ Race_factor(race_column)American Indian/Alaskan Native: num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Asian : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Black : num 0 0 0 0 0 1 0 0 0
0 ...
## $ Race_factor(race_column)Hispanic : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)Other : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Race_factor(race_column)White : num 1 1 1 1 1 0 1 1 1
1 ...
#Convert specific columns to integers (only independent variables - dependent variable must be a factor)
columns_to_convert <- c("Smoking", "SkinCancer", "KidneyDisease", "Asthma", "GenHealt

h",
"AgeCategory","Stroke", "AlcoholDrinking","Sex", "DiffWalking
", "PhysicalActivity")
df_encoded[columns_to_convert] <- lapply(df_encoded[columns_to_convert], as.numeric)
#Identify numeric variables in the data frame
numeric_vars <- sapply(df_encoded, is.numeric)
str(df_encoded)

.
## $ BMI : num 16.6 20.3 26.6 24.
2 23.7 ...
## $ Smoking : num 1 0 1 0 0 1 0 1 0
0 ...
## $ AlcoholDrinking : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Stroke : num 0 1 0 0 0 0 0 0 0
0 ...
## $ PhysicalHealth : num 3 0 20 0 28 6 15 5
0 0 ...
## $ MentalHealth : num 30 0 30 0 0 0 0 0
0 0 ...
## $ DiffWalking : num 0 0 0 0 1 1 0 1 0
1 ...
## $ Sex : num 0 0 1 0 0 0 0 0 0
1 ...
## $ AgeCategory : num 55 80 65 75 40 75
70 80 80 65 ...
## $ Diabetic : num 1 0 1 0 0 0 0 1 0
0 ...
## $ PhysicalActivity : num 1 1 1 0 1 0 1 0 0
1 ...
## $ GenHealth : num 4 4 2 3 4 2 2 3 2
3 ...
## $ SleepTime : num 5 7 8 6 8 12 4 9 5
10 ...
## $ Asthma : num 1 0 1 0 0 0 1 1 0
0 ...
## $ KidneyDisease : num 0 0 0 0 0 0 0 0 1
0 ...
## $ SkinCancer : num 1 0 0 1 0 0 1 0 0
0 ...
0 ...
0 ...
0 ...
0 ...
0 ...
1 ...
SCALING #Apply min-max scaling to the numerical variables
scaled_data <- apply(df_encoded[, numeric_vars], 2, function(x) (x - min(x)) / (max(x

) - min(x)))
#Print the scaled data
print(head(scaled_data))
## BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth

## 1 0.05529398 1 0 0 0.1000000 1
## 2 0.10044670 0 0 1 0.0000000 0
## 3 0.17578172 1 0 0 0.6666667 1
## 4 0.14716890 0 0 0 0.0000000 0
## 5 0.14113244 0 0 0 0.9333333 0
## 6 0.20342871 1 0 0 0.2000000 0
## 1 0 0 0.5967742 1 1 0.75 0.1739130
## 2 0 0 1.0000000 0 1 0.75 0.2608696
## 3 0 1 0.7580645 1 1 0.25 0.3043478
## 4 0 0 0.9193548 0 0 0.50 0.2173913
## 5 1 0 0.3548387 0 1 0.75 0.3043478
## 6 1 0 0.9193548 0 0 0.25 0.4782609
## 1 1 0 1
## 2 0 0 0
## 3 1 0 0
## 4 0 0 1
## 5 0 0 0
## 6 0 0 0
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Race_factor(race_column)White
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 0
#Combine the scaled numeric variables with the non-numeric variables
scaled_dataframe <- cbind(scaled_data, df_encoded[!numeric_vars])

str(scaled_dataframe)

## $ BMI : num 0.0553 0.1004 0.17
58 0.1472 0.1411 ...
## $ Smoking : num 1 0 1 0 0 1 0 1 0
0 ...
## $ AlcoholDrinking : num 0 0 0 0 0 0 0 0 0
0 ...
## $ Stroke : num 0 1 0 0 0 0 0 0 0
0 ...
## $ PhysicalHealth : num 0.1 0 0.667 0 0.93
3 ...
## $ MentalHealth : num 1 0 1 0 0 0 0 0 0
0 ...
## $ DiffWalking : num 0 0 0 0 1 1 0 1 0
1 ...
## $ Sex : num 0 0 1 0 0 0 0 0 0
1 ...
## $ AgeCategory : num 0.597 1 0.758 0.91
9 0.355 ...
## $ Diabetic : num 1 0 1 0 0 0 0 1 0
0 ...
## $ PhysicalActivity : num 1 1 1 0 1 0 1 0 0
1 ...
## $ GenHealth : num 0.75 0.75 0.25 0.5
0.75 0.25 0.25 0.5 0.25 0.5 ...
## $ SleepTime : num 0.174 0.261 0.304
0.217 0.304 ...
## $ Asthma : num 1 0 1 0 0 0 1 1 0
0 ...
## $ KidneyDisease : num 0 0 0 0 0 0 0 0 1
0 ...
## $ SkinCancer : num 1 0 0 1 0 0 1 0 0
0 ...
0 ...
0 ...
0 ...
0 ...
0 ...
1 ...
.
#Print the scaled data frame
print(head(scaled_dataframe))
## BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth

## 1 0.05529398 1 0 0 0.1000000 1
## 2 0.10044670 0 0 1 0.0000000 0
## 3 0.17578172 1 0 0 0.6666667 1
## 4 0.14716890 0 0 0 0.0000000 0
## 5 0.14113244 0 0 0 0.9333333 0
## 6 0.20342871 1 0 0 0.2000000 0
## 1 0 0 0.5967742 1 1 0.75 0.1739130
## 2 0 0 1.0000000 0 1 0.75 0.2608696
## 3 0 1 0.7580645 1 1 0.25 0.3043478
## 4 0 0 0.9193548 0 0 0.50 0.2173913
## 5 1 0 0.3548387 0 1 0.75 0.3043478
## 6 1 0 0.9193548 0 0 0.25 0.4782609
## 1 1 0 1
## 2 0 0 0
## 3 1 0 0
## 4 0 0 1
## 5 0 0 0
## 6 0 0 0
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Race_factor(race_column)White HeartDisease
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 0 1
max(scaled_dataframe$BMI)
## [1] 1
min(scaled_dataframe$BMI)
## [1] 0
# install.packages("caret", dependencies = c("Depends", "Suggests"))
#install.packages('gower')
# install.packages('hardhat')
# install.packages('timechange')
# install.packages('ModelMetrics')
library(ggplot2)
library(lattice)
library(caret)
#SPLITING TEST AND TRAIN DATA SET
#Set the random seed for reproducibility #Specify the proportion of data to be allocated for the test set
#Create the train and test split
set.seed(123)
test_size <- 0.2
split <- createDataPartition(scaled_dataframe$HeartDisease, p = test_size, list = FAL

SE)
#Create the train set #Create the test set
test_set<- scaled_dataframe[split, ]
train_set <- scaled_dataframe[-split, ]
#dimension of test and train set
dim(test_set)
## [1] 63960 23
dim(train_set)
## [1] 255835 23
MODELLING Logistic Regression
#Create the formula for logistic regression
formula <- as.formula( HeartDisease ~ .)
#Convert the “HeartDisease” column to factor in both train_set and test_set
train_set$HeartDisease <- factor(train_set$HeartDisease)

test_set$HeartDisease <- factor(test_set$HeartDisease)
#Train the logistic regression model
weight = 0.8
model <- glm(formula, data = train_set,
family = "binomial",
weights = ifelse(train_set$HeartDisease == 1, weight, 1 - weight)
)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
#Make predictions on the test set
predictions <- predict(model, newdata = test_set, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :

## prediction from a rank-deficient fit may be misleading
#Convert the predictions to binary values
binary_predictions <- ifelse(predictions > 0.5, 1, 0)
#Calculate accuracy
accuracy <- mean(binary_predictions == test_set$HeartDisease)

cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.8757661
TP <- sum(binary_predictions == 1 & test_set$HeartDisease == 1)

FN <- sum(binary_predictions == 0 & test_set$HeartDisease == 1)
#recall <- sensitivity(as.factor(test_set$HeartDisease), as.factor(predictions))
recall <- TP / (TP + FN)

cat("Recall:", recall, "\n")
## Recall: 0.4789041
#Calculate precision
precision <- sum(binary_predictions == 1 & test_set$HeartDisease == 1) / sum(binary_p

redictions == 1)
cat("Precision:", precision, "\n")
## Precision: 0.3398574
#Calculate F1 score
f1_score <- 2 * (precision * recall) / (precision + recall)

cat("F1 Score:", f1_score, "\n")
## F1 Score: 0.3975739
#Confusion Matrix
confusionMatrix(data=as.factor(binary_predictions), reference = as.factor(test_set$He

artDisease))
## Confusion Matrix and Statistics

##
## Reference
## Prediction 0 1
## 0 53392 2853
## 1 5093 2622
##
## Accuracy : 0.8758
## 95% CI : (0.8732, 0.8783)
## No Information Rate : 0.9144
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3305
##
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.9129
## Specificity : 0.4789
## Pos Pred Value : 0.9493
## Neg Pred Value : 0.3399
## Prevalence : 0.9144
## Detection Rate : 0.8348
## Detection Prevalence : 0.8794
## Balanced Accuracy : 0.6959
##
## 'Positive' Class : 0
##
KNN Classifier
#Train the KNN classifier
library(caret)
k <- 5
formula <- as.formula(HeartDisease ~ .)
model <- train(formula, data = train_set, method = "knn", trControl = trainControl(me
thod = "none"), tuneGrid = data.frame(k = k))
#Make predictions on the test an training set
predictions <- predict(model, newdata = test_set)

predictions_train <- predict(model, newdata = train_set)
#Calculate accuracy
accuracy <- sum(predictions == test_set$HeartDisease) / length(test_set$HeartDisease)

cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.9059412
accuracy_train <- sum(predictions_train == train_set$HeartDisease) / length(train_set

$HeartDisease)
cat("Accuracy_Train:", accuracy_train, "\n")
## Accuracy_Train: 0.9247523
#Calculate true positives
true_positives <- sum(predictions == 1 & test_set$HeartDisease == 1)

print(true_positives)
## [1] 749
#Calculate false positives
false_positives <- sum(predictions == 1 & test_set$HeartDisease == 0)

print(false_positives)
## [1] 1290
#Calculate false negatives
false_negatives <- sum(predictions == 0 & test_set$HeartDisease == 1)

print(false_negatives)
## [1] 4726
#Calculate recall
recall <- true_positives / (true_positives + false_negatives)

## Recall: 0.1368037
precision <- true_positives / (true_positives + false_positives)

## Precision: 0.3673369
#Calculate F1 score

## F1 Score: 0.1993612
#Confusion Matrix of the test set
confusionMatrix(data=as.factor(predictions), reference = as.factor(test_set$HeartDise

ase))

##
## Reference
## Prediction 0 1
## 0 57195 4726
## 1 1290 749
##
## 95% CI : (0.9037, 0.9082)
##
## Kappa : 0.1604
##
##
##
##
#Confusion Matrix of the train set
confusionMatrix(data=as.factor(predictions_train), reference = as.factor(train_set$He

artDisease))

##
## Reference
## Prediction 0 1
## 0 231069 16383
## 1 2868 5515
##
## 95% CI : (0.9237, 0.9258)
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.3326
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
##
##
#RANDOM FOREST
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':

##
## margin
## The following object is masked from 'package:dplyr':

##
## combine
#Get the current column names
column_names <- names(train_set)

column_names_test <- names(test_set)
#Replace “/” and ” ” with “_”
new_column_names <- gsub("/", "_", column_names)

new_column_names_test <- gsub("/", "_", column_names_test)
new_column_names <- gsub(" ", "_", new_column_names)

new_column_names_test <- gsub(" ", "_", new_column_names_test)
#Rename columns with similar issues
new_column_names <- sub(".*\\)", "Race_", new_column_names)

new_column_names_test <- sub(".*\\)", "Race_", new_column_names_test)
#Assign the new column names to the dataset
names(train_set) <- new_column_names

names(test_set) <- new_column_names_test
library(dplyr)
#Define the parameter grid for tuning
ntree_vals <- c(25,30,35,40,45,50,100)

mtry_vals <- c(sqrt(ncol(train_set)))
nodesize_vals <- c(20)
#Initialize empty dataframe to store results
results <- data.frame(ntree = integer(),

mtry = numeric(),
nodesize = integer(),
class_weight = character(),
accuracy = numeric(),
recall = numeric(),
precision = numeric(),
f1_score = numeric(),
stringsAsFactors = FALSE)
#Define the best metric value and class weights
best_metric <- 0
best_class_weights <- NULL
best_ntree <- 0
best_mtry <- 0
best_nodesize <- 0
#Iterate over parameter combinations
for (ntree in ntree_vals) {

for (mtry in mtry_vals) {
for (nodesize in nodesize_vals) {
# Iterate over class weight combinations
for (weight_1 in seq(0.3, 0.6, 0.4)) {
weight_0 <- 1 - weight_1
class_weights <- list(weight_0, weight_1)
# Train the Random Forest model with current parameter combination and class
weight
model <- randomForest(HeartDisease ~ .,
data = train_set,
ntree = ntree,
mtry = mtry,
importance = TRUE,
nodesize = nodesize,
classwt = class_weights)
# Make predictions on the test set

# Calculate metrics
TP <- sum(predictions == 1 & test_set$HeartDisease == 1)
FN <- sum(predictions == 0 & test_set$HeartDisease == 1)
accuracy <- mean(predictions == test_set$HeartDisease)

precision <- sum(predictions == 1 & test_set$HeartDisease == 1) / sum(predict
ions == 1)
# Append results to dataframe

results <- results %>%
add_row(ntree = ntree,
mtry = mtry,
nodesize = nodesize,
class_weight = paste(class_weights, collapse = ":"),
accuracy = accuracy,
recall = recall,
precision = precision,
f1_score = f1_score)
# Check if the current class weight combination improves the best metric
if (f1_score > best_metric) {
best_metric <- f1_score
best_class_weights <- class_weights
best_ntree <- ntree
best_mtry <- mtry
best_nodesize <- nodesize
}
}
}
}
}
#View the results
print(results)
## ntree mtry nodesize class_weight accuracy recall precision f1_score

## 1 25 4.795832 20 0.7:0.3 0.8687930 0.4832877 0.3223292 0.3867290
## 2 30 4.795832 20 0.7:0.3 0.8711538 0.4745205 0.3262999 0.3866935
## 3 35 4.795832 20 0.7:0.3 0.8708255 0.4737900 0.3252665 0.3857249
## 4 40 4.795832 20 0.7:0.3 0.8730926 0.4661187 0.3294604 0.3860525
## 5 45 4.795832 20 0.7:0.3 0.8712946 0.4789041 0.3277090 0.3891362
## 6 50 4.795832 20 0.7:0.3 0.8714665 0.4816438 0.3288030 0.3908114
## 7 100 4.795832 20 0.7:0.3 0.8727955 0.4821918 0.3324518 0.3935599
cat('\n')
#Print the best class weights and metric value
cat("Best Class Weights:", paste(best_class_weights, collapse = ":"), "\n")
## Best Class Weights: 0.7:0.3
cat("best_ntree:", best_ntree, "\n")
## best_ntree: 100
cat("best_mtry:", best_mtry, "\n")
## best_mtry: 4.795832
cat("best_nodesize:", best_nodesize, "\n")
## best_nodesize: 20
cat("Best F1 Score:", best_metric, "\n")
## Best F1 Score: 0.3935599
#BEST RANDOM FOREST MODEL
model <- randomForest(HeartDisease ~ .,

data = train_set,
ntree = best_ntree, # Number of trees in the forest
mtry = best_mtry, # Number of variables to consider
at each split
importance = TRUE, # Calculate variable importance
nodesize = best_nodesize, # Minimum size of terminal nodes
classwt = best_class_weights # Assign class weights
)
#Make predictions on the test set

predictions_train <- predict(model, newdata = train_set)
#Calculate accuracy

cat("Accuracy for test_set:", accuracy, "\n")
## Accuracy for test_set: 0.8727799
accuracy_train <- mean(predictions_train == train_set$HeartDisease)

cat("Accuracy Training:", accuracy_train, "\n")
## Accuracy Training: 0.9063576
#Recall

## Recall: 0.4785388
TP <- sum(predictions_train == 1 & train_set$HeartDisease == 1)

FN <- sum(predictions_train == 0 & train_set$HeartDisease == 1)
recall_train <- TP / (TP + FN)
cat("Recall_Train:", recall_train, "\n")
## Recall_Train: 0.7005206
precision <- sum(predictions == 1 & test_set$HeartDisease == 1) / sum(predictions ==

1)
## Precision: 0.3315616
precision_train <- sum(predictions_train == 1 & train_set$HeartDisease == 1) / sum(pr

edictions_train == 1)
cat("Precision_train:", precision_train, "\n")
## Precision_train: 0.4685543
#Calculate F1 score

## F1 Score: 0.3917171
f1_score_train <- 2 * (precision_train * recall_train) / (precision_train + recall_tr

ain)
cat("F1 Score Train:", f1_score_train, "\n")
## F1 Score Train: 0.5615242
#Confusion matrix for test set
confusionMatrix(data=as.factor(predictions), reference = as.factor(test_set$HeartDise

ase))

##
## Reference
## Prediction 0 1
## 0 53203 2855
## 1 5282 2620
##
## 95% CI : (0.8702, 0.8754)
##
## Kappa : 0.3233
##
##
##
##
#Confusion Matrix for train set
confusionMatrix(data=as.factor(predictions_train), reference = as.factor(train_set$He

artDisease))

##
## Reference
## Prediction 0 1
## 0 216538 6558
## 1 17399 15340
##
## 95% CI : (0.9052, 0.9075)
##
## Kappa : 0.5114
##
##
##
##
#Feature Importances
library(caret)
library(ggplot2)
#Calculate feature importances
importances <- varImp(model)
#Sort the importances in descending order
sorted_importances <- importances[order(importances$`0`, decreasing = TRUE), ]

sorted_importances = subset(sorted_importances, select = -c(1) )
#Print feature importances
print(sorted_importances)
## 1
## AgeCategory 57.17185052
## Stroke 48.55089641
## GenHealth 46.21302100
## Sex 38.19345449
## Diabetic 15.65786760
## PhysicalHealth 14.33448787
## KidneyDisease 13.83797069
## DiffWalking 10.79500337
## MentalHealth 10.00377272
## BMI 9.11022265
## Race_White 6.74038712
## AlcoholDrinking 6.29347386
## Race_Black 4.97982135
## Asthma 4.88838298
## Race_Asian 4.00014224
## Race_Hispanic 3.89790718
## PhysicalActivity 3.89514501
## SkinCancer 3.74235929
## SleepTime 3.17222214
## Smoking 2.65329054
## Race_Other 0.05098671
## Race_American_Indian_Alaskan_Native -0.78333496
#Create a dataframe for plotting
importance_df <- data.frame(variable = row.names(sorted_importances),

importance = sorted_importances)
#Plot the feature importances in a horizontal bar chart
ggplot(importance_df, aes(y = reorder(variable, X1), x = X1)) +

geom_bar(stat = "identity", fill = "blue") +
geom_text(aes(label = round(X1,1)), hjust = -0.1, size = 3, color = "black") +
labs(y = "Variable", x = "Importance") +
ggtitle("Feature Importances") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1))
library(caret)
#Define the range of weights to be tested
weight_range <- seq(0.1, 1, by = 0.1)
#Initialize an empty dataframe to store results
results_LR <- data.frame(weights = character(),

accuracy = numeric(),
recall = numeric(),
precision = numeric(),
f1_score = numeric(),
stringsAsFactors = FALSE)
#Perform parameter tuning with diﬀerent class weights
model <- glm(HeartDisease ~ .,

data = train_set,
family = "binomial")
class_weights <- 'Default'
for (weight in weight_range) {

# Train the logistic regression model with class weights
# Make predictions on the test set

predictions <- predict(model, newdata = test_set, type = "response")
# Convert predictions to factors

predictions <- as.factor(ifelse(predictions > 0.5, 1, 0))
# Calculate evaluation metrics


#recall <- sensitivity(as.factor(test_set$HeartDisease), as.factor(predictions))
precision <- sum(predictions == 1 & test_set$HeartDisease == 1) / sum(predictions =
= 1)
# Append results to the dataframe
results_LR <- rbind(results_LR, data.frame(weights = class_weights,

accuracy = accuracy,
recall = recall,
precision = precision,
f1_score = f1_score,
stringsAsFactors = FALSE))
class_weights <- paste(1 - weight, weight, sep = ":")

model <- glm(HeartDisease ~ .,
data = train_set,
family = "binomial",
weights = ifelse(train_set$HeartDisease == 1, weight, 1 - weight))
}










## Warning: glm.fit: algorithm did not converge
# Print the results

print(results_LR)
## weights accuracy recall precision f1_score

## 1 Default 0.9161507 0.1132420091 0.5496454 0.1877934272
## 2 0.9:0.1 0.9144153 0.0001826484 1.0000000 0.0003652301
## 3 0.8:0.2 0.9147905 0.0056621005 0.8378378 0.0112481858
## 4 0.7:0.3 0.9154472 0.0235616438 0.6753927 0.0455347688
## 5 0.6:0.4 0.9161664 0.0578995434 0.6084453 0.1057371581
## 6 0.5:0.5 0.9161507 0.1132420091 0.5496454 0.1877934272
## 7 0.4:0.6 0.9140869 0.1961643836 0.4953875 0.2810414759
## 8 0.3:0.7 0.9039556 0.3081278539 0.4173676 0.3545234843
## 9 0.2:0.8 0.8757661 0.4789041096 0.3398574 0.3975739196
## 10 0.1:0.9 0.7791119 0.7364383562 0.2411916 0.3633741889

Predict Heart Disease

Uploaded by

Copyright:

Available Formats

Predict Heart Disease

Uploaded by

Document Information

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Predict Heart Disease

Uploaded by

Copyright:

Available Formats

Data Analysis - Final Project-Sena 6/24/23, 10:00 PM

Data Analysis - Final Project-Sena

## HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth

#to checks number of null values

#a list of objects and their structure

## 'data.frame': 319795 obs. of 18 variables:

#summary of the dataset

## HeartDisease BMI Smoking AlcoholDrinking

#the number of diﬀerent values for each column

## The following objects are masked from 'package:stats':

## The following objects are masked from 'package:base':

heartd %>% summarise_all(n_distinct)

## HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth MentalHealth

#EDA (EXPLORATORY DATA ANALYSIS) #histogram of BMI(Body Mass Index)

p<-ggplot(data = heartd, aes(x = BMI)) + geom_histogram(binwidth = 3,fill='blue',colo

#histogram of Physical Health

p<-ggplot(data = heartd, aes(x = heartd$PhysicalHealth)) + geom_histogram(binwidth =

## Warning: Use of `heartd$PhysicalHealth` is discouraged.

#histogram of Mental Health

p<-ggplot(data = heartd, aes(x = MentalHealth )) + geom_histogram(binwidth = 3,fill='

#histogram of Sleep Time

p<-ggplot(data = heartd, aes(x = SleepTime)) + geom_histogram(binwidth = 1,fill='blue

#remove the scientific notations from the axis

ggplot(heartd, aes(x = HeartDisease, fill = Smoking)) +

ggtitle("Bar Plot of Heart Disease and Smoking")

ggplot(heartd, aes(x = HeartDisease, fill = AlcoholDrinking)) +

ggplot(heartd, aes(x = HeartDisease, fill = Sex)) +

ggplot(heartd, aes(x = HeartDisease, fill = Asthma)) +

ggplot(heartd, aes(x = HeartDisease, fill = KidneyDisease)) +

ggplot(heartd, aes(x = HeartDisease, fill = SkinCancer)) +

counts <- table(heartd$HeartDisease, heartd$Smoking)

barplot(counts, beside = TRUE, legend = TRUE,

counts <- table(heartd$HeartDisease, heartd$AgeCategory)

colors <- c("yellow", "green")

barplot(counts, beside = TRUE, legend = TRUE,

#Count the frequencies of each combination of the two variables

counts <- table(heartd$HeartDisease, heartd$Race)

colors <- c("yellow", "green")

barplot(counts, beside = TRUE, legend = TRUE,

#Count the frequencies of each combination of the two variables

counts <- table(heartd$HeartDisease, heartd$GenHealth)

colors <- c("yellow", "green")

barplot(counts, beside = TRUE, legend = TRUE,

Count the frequencies of each combination

colors <- c("yellow", "green")

barplot(counts, beside = TRUE, legend = TRUE,

Count the frequencies of each combination

colors <- c("yellow", "green")

barplot(counts, beside = TRUE, legend = TRUE,

Replace values in the data frame

Convert ‘Diabetic’ column to integer

heartd$Diabetic <- as.numeric(heartd$Diabetic)

Extract the first two characters from the

Print the first five row of modified “Age”

## [1] "55" "80" "65" "75" "40"

#convert all “0”s and “1”s to integer except c( “Race”)

num_columns <- setdiff(colnames(heartd), c( "Race"))

heartd_num <- as.data.frame(lapply(heartd[num_columns], as.numeric))

## 'data.frame': 319795 obs. of 17 variables:

## corrplot 0.92 loaded