Analysis of the Heart Disease Dataset

Load the data from here, and the description is here. The original dataset comes from here and corresponds to the processed cleveland data

Perform an EDA on the dataset

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(cowplot)
library(hrbrthemes) 
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(viridis)
## Loading required package: viridisLite
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
#Load the dataset.
heart_df <- read.csv("data/heart_disease_dataset.csv",sep = " ")
colnames(heart_df) <- c("Age", "Sex", "ChestPainType","RestingBloodPressure", "Cholestherol", "FastingBloodSugar", "Resting_EC", "MaxHeartRate", "ExerciseInducedAngina","Peak","Slope","MajorVessels","Thalassemia", "Diagnosis","ID")

#Clean the data.
heart_df$Sex <- ifelse(heart_df$Sex == 1, "Male", "Female")
heart_df <- heart_df %>% 
  mutate(ChestPainType = recode(ChestPainType, '1' = "TypicalAngina", '2' = "AtypicalAngina", '3' = "NonAnginal", '4' = "Asymptomatic")) %>% 
  mutate(FastingBloodSugar  = recode (FastingBloodSugar, '1' = '>120', '0' = '<120')) %>% 
  mutate(Resting_EC = recode (Resting_EC, '0' = 'Normal', '1'='ST-T', '2'= 'P.V.Hyperthrophy')) %>% 
  mutate(ExerciseInducedAngina = recode (ExerciseInducedAngina, '0' = 'No', '1'='Yes')) %>% 
  mutate(Slope = recode (Slope, '1' = 'UpSloping', '2'='Flat', '3'= 'DownSlopping')) %>% 
  mutate(Thalassemia = recode (Thalassemia, '7' = 'ReversableDefect', '6'='FixedDefect', '3'= 'Normal','?' = NA_character_)) %>% 
  mutate(Diagnosis = recode (Diagnosis, '0' = 'NoDisease', '1'='Disease', '2'='Disease', '3'='Disease','4'='Disease')) %>% 
  mutate(MajorVessels = if_else(MajorVessels == '?', NA_character_, MajorVessels))

# Create boxplots using ggplot2 (with outliers).
plot1 <- ggplot(heart_df, aes(x = "", y = Age)) +
  geom_boxplot() +
  labs(title = "Boxplot for Age", y = "Age Values") +
  theme_minimal()
plot2 <- ggplot(heart_df, aes(x = "", y = Cholestherol)) +
  geom_boxplot() +
  labs(title = "Boxplot for Cholestherol", y = "Cholestherol Values") +
  theme_minimal()
plot3 <- ggplot(heart_df, aes(x = "", y = MaxHeartRate)) +
  geom_boxplot() +
  labs(title = "Boxplot for MaxHeartRate", y = "MaxHeartRate Values") +
  theme_minimal()
plot4 <- ggplot(heart_df, aes(x = "", y = RestingBloodPressure)) +
  geom_boxplot() +
  labs(title = "Boxplot for RestingBloodPressure", y = "RestingBloodPressure Values") +
  theme_minimal()
plot5 <- ggplot(heart_df, aes(x = "", y = Peak)) +
  geom_boxplot() +
  labs(title = "Boxplot for Peak", y = "Peak Values") +
  theme_minimal()

plot_grid(plot1, plot2, plot3, plot4, plot5, ncol = 3)

#Remove outliers.
remove_outliers_column <- function(x) {
  iqr_val <- IQR(x, na.rm = TRUE)
  lower_bound <- quantile(x, 0.25, na.rm = TRUE) - 1.5 * iqr_val
  upper_bound <- quantile(x, 0.75, na.rm = TRUE) + 1.5 * iqr_val
  x[x < lower_bound] <- lower_bound
  x[x > upper_bound] <- upper_bound
  return(x)
}
heart_df_filtered <- heart_df
columns_to_filter <- c("Age", "Cholestherol", "MaxHeartRate", "RestingBloodPressure", "Peak")

for (col in columns_to_filter) {
  heart_df_filtered[[col]] <- remove_outliers_column(heart_df[[col]])
}

# Create boxplots using ggplot2 (without outliers).
plot1_filtered <- ggplot(heart_df_filtered, aes(x = "", y = Age, fill = "Age")) +
  geom_boxplot() +
  labs(title = "Boxplot for Age (Filtered)", y = "Age Values") +
  theme_minimal()
plot2_filtered <- ggplot(heart_df_filtered, aes(x = "", y = Cholestherol, fill = "Cholestherol")) +
  geom_boxplot() +
  labs(title = "Boxplot for Cholestherol (Filtered)", y = "Cholestherol Values") +
  theme_minimal()
plot3_filtered <- ggplot(heart_df_filtered, aes(x = "", y = MaxHeartRate, fill = "MaxHeartRate")) +
  geom_boxplot() +
  labs(title = "Boxplot for MaxHeartRate (Filtered)", y = "MaxHeartRate Values") +
  theme_minimal()
plot4_filtered <- ggplot(heart_df_filtered, aes(x = "", y = RestingBloodPressure, fill = "RestingBloodPressure")) +
  geom_boxplot() +
  labs(title = "Boxplot for RestingBloodPressure (Filtered)", y = "RestingBloodPressure Values") +
  theme_minimal()
plot5_filtered <- ggplot(heart_df_filtered, aes(x = "", y = Peak, fill = "Peak")) +
  geom_boxplot() +
  labs(title = "Boxplot for Peak (Filtered)", y = "Peak Values") +
  theme_minimal()
plot_grid(plot1_filtered, plot2_filtered, plot3_filtered, plot4_filtered, plot5_filtered, ncol = 3)

# Create densityplots.
plot_Age <- ggplot(data = heart_df_filtered, aes( x = Age)) + 
  geom_histogram(aes(y = ..density..), fill = 'deepskyblue1', color = 'black' ,binwidth = 5) +
  geom_density(fill = '#F0FFFF', alpha = 0.5) 
plot_Cholestherol <- ggplot(data = heart_df_filtered, aes( x = Cholestherol)) + 
  geom_histogram(aes(y = ..density..), fill = 'deepskyblue1', color = 'black' ,binwidth = 5) +
  geom_density(fill = '#F0FFFF', alpha = 0.5) 
plot_MaxHeartRate <- ggplot(data = heart_df_filtered, aes( x = MaxHeartRate)) + 
  geom_histogram(aes(y = ..density..), fill = 'deepskyblue1', color = 'black' , binwidth = 5) +
  geom_density(fill = '#F0FFFF', alpha = 0.5) 
plot_RestingBloodPressure <- ggplot(data = heart_df_filtered, aes( x = RestingBloodPressure)) + 
  geom_histogram(aes(y = ..density..), fill = 'deepskyblue1', color = 'black' , binwidth = 5) +
  geom_density(fill = '#F0FFFF', alpha = 0.5) 
plot_Peak <- ggplot(data = heart_df_filtered, aes( x = Peak)) + 
  geom_histogram(aes(y = ..density..), fill = 'deepskyblue1', color = 'black', binwidth = 0.1) +
  geom_density(fill = '#F0FFFF', alpha = 0.5) 

plot_grid(plot_Age,plot_Cholestherol,plot_MaxHeartRate,plot_RestingBloodPressure,plot_Peak, ncol = 3)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

qq_plot_Age <- ggplot(data = heart_df_filtered, aes(sample = Age)) +
  stat_qq(distribution = qnorm, color = 'deepskyblue1') +
  stat_qq_line(distribution = qnorm, color = 'red') +
  ggtitle("Q-Q Plot for Age")

# Create Q-Q plots.
qq_plot_Cholestherol <- ggplot(data = heart_df_filtered, aes(sample = Cholestherol)) +
  stat_qq(distribution = qnorm, color = 'deepskyblue1') +
  stat_qq_line(distribution = qnorm, color = 'red') +
  ggtitle("Q-Q Plot for Cholestherol")
qq_plot_MaxHeartRate <- ggplot(data = heart_df_filtered, aes(sample = MaxHeartRate)) +
  stat_qq(distribution = qnorm, color = 'deepskyblue1') +
  stat_qq_line(distribution = qnorm, color = 'red') +
  ggtitle("Q-Q Plot for MaxHeartRate")
qq_plot_RestingBloodPressure <- ggplot(data = heart_df_filtered, aes(sample = RestingBloodPressure)) +
  stat_qq(distribution = qnorm, color = 'deepskyblue1') +
  stat_qq_line(distribution = qnorm, color = 'red') +
  ggtitle("Q-Q Plot for RestingBloodPressure")
qq_plot_Peak <- ggplot(data = heart_df_filtered, aes(sample = Peak)) +
  stat_qq(distribution = qnorm, color = 'deepskyblue1') +
  stat_qq_line(distribution = qnorm, color = 'red') +
  ggtitle("Q-Q Plot for Peak")

grid.arrange(qq_plot_Age, qq_plot_Cholestherol, qq_plot_MaxHeartRate, 
             qq_plot_RestingBloodPressure, qq_plot_Peak, nrow = 2)

#Set alpha at. 0.005

#Ho - Data follows a normal distribution.
#Ha - Data does not follow a normal distribution.
# Normality test for Age
shapiro_test_Age <- shapiro.test(heart_df_filtered$Age) #Can't reject Ho
cat("Shapiro-Wilk test for Age:\n", "W =", shapiro_test_Age$statistic, ", p-value =", shapiro_test_Age$p.value, "\n\n") 
## Shapiro-Wilk test for Age:
##  W = 0.9864633 , p-value = 0.006068642
# Normality test for Cholestherol
shapiro_test_Cholestherol <- shapiro.test(heart_df_filtered$Cholestherol) #Can't reject Ho
cat("Shapiro-Wilk test for Cholestherol:\n", "W =", shapiro_test_Cholestherol$statistic, ", p-value =", shapiro_test_Cholestherol$p.value, "\n\n")
## Shapiro-Wilk test for Cholestherol:
##  W = 0.9888511 , p-value = 0.02006615
# Normality test for MaxHeartRate
shapiro_test_MaxHeartRate <- shapiro.test(heart_df_filtered$MaxHeartRate) #Reject Ho
cat("Shapiro-Wilk test for MaxHeartRate:\n", "W =", shapiro_test_MaxHeartRate$statistic, ", p-value =", shapiro_test_MaxHeartRate$p.value, "\n\n")
## Shapiro-Wilk test for MaxHeartRate:
##  W = 0.9765386 , p-value = 7.244153e-05
# Normality test for RestingBloodPressure
shapiro_test_RestingBloodPressure <- shapiro.test(heart_df_filtered$RestingBloodPressure) #Reject Ho
cat("Shapiro-Wilk test for RestingBloodPressure:\n", "W =", shapiro_test_RestingBloodPressure$statistic, ", p-value =", shapiro_test_RestingBloodPressure$p.value, "\n\n")
## Shapiro-Wilk test for RestingBloodPressure:
##  W = 0.9745627 , p-value = 3.309576e-05
# Normality test for Peak
shapiro_test_Peak <- shapiro.test(heart_df_filtered$Peak) # Reject Ho
cat("Shapiro-Wilk test for Peak:\n", "W =", shapiro_test_Peak$statistic, ", p-value =", shapiro_test_Peak$p.value, "\n\n")
## Shapiro-Wilk test for Peak:
##  W = 0.8517292 , p-value = 2.155173e-16
#Barplots for categorical variables. 
plot_sex <- ggplot(heart_df_filtered, aes(x=Sex, fill=as.factor(Sex))) + 
  geom_bar() +
  scale_fill_manual(values = c("#FFB6C1", "#87CEFA") ) +
  theme(legend.position="none")
plot_chest_pain <- ggplot(heart_df_filtered, aes(x = ChestPainType, fill = ChestPainType)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set2") +  
  theme_minimal() +  
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), 
    plot.title = element_text(hjust = 0.5)  
  ) +
    theme(legend.position="none")
plot_fasting_blood_sugar <- ggplot(heart_df_filtered, aes(x = FastingBloodSugar, fill = FastingBloodSugar)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set2") +  
  theme_minimal() +  
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), 
    plot.title = element_text(hjust = 0.5)  
  ) +
    theme(legend.position="none")
plot_exercise_Resting_EC <- plot_resting_ec <- ggplot(heart_df_filtered, aes(x = Resting_EC, fill = Resting_EC)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set2") +  
  theme_minimal() +  
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), 
    plot.title = element_text(hjust = 0.5)  
  ) +
    theme(legend.position="none")
plot_exercise_induced_angina <- ggplot(heart_df_filtered, aes(x = ExerciseInducedAngina, fill = ExerciseInducedAngina)) +
  geom_bar() +
  scale_fill_manual(values = c("#CD0000", "#00CD00") ) +  
  theme_minimal() +  
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), 
    plot.title = element_text(hjust = 0.5)  
  ) +
    theme(legend.position="none")
plot_slope <- ggplot(heart_df_filtered, aes(x = Slope, fill = Slope)) +
  geom_bar() +
  scale_fill_manual(values = c("#EE5C42", "#00E5EE", "#00EE76") ) +  
  theme_minimal() +  
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), 
    plot.title = element_text(hjust = 0.5)  
  ) +
    theme(legend.position="none")
plot_major_vessels <- ggplot(heart_df_filtered, aes(x = MajorVessels, fill = MajorVessels)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set7") +  
  theme_minimal() +  
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), 
    plot.title = element_text(hjust = 0.5)  
  ) +
    theme(legend.position="none")
## Warning: Unknown palette: "Set7"
plot_Thalassemia <- ggplot(heart_df_filtered, aes(x=Thalassemia, fill=as.factor(Thalassemia))) + 
  geom_bar() +
  scale_fill_brewer(palette = "Set3") +
  theme(legend.position="none") +
    theme(legend.position="none")
plot_diagnosis <- ggplot(heart_df_filtered, aes(x=Diagnosis, fill=as.factor(Diagnosis))) + 
  geom_bar() +
  scale_fill_manual(values = c("#CD0000", "#00CD00") ) +
  theme(legend.position="none")
plot_grid(plot_sex, plot_chest_pain, plot_fasting_blood_sugar, plot_exercise_Resting_EC, plot_exercise_induced_angina, plot_slope, plot_major_vessels, plot_Thalassemia, plot_diagnosis, ncol = 3)

Create visualizations in order to show which variables seem to be more associated with heart disease

# List of variables to plot
variables_to_plot <- c( "Sex", "ChestPainType", "FastingBloodSugar", "Resting_EC", "ExerciseInducedAngina","Slope","MajorVessels","Thalassemia")

# Function to create a bar plot
create_bar_plot <- function(variable) {
  ggplot(heart_df_filtered, aes_string(x = variable, fill = "Diagnosis")) +
    geom_bar(position = "dodge") +
    labs(title = paste(variable, "vs Diagnosis"),
         x = variable,
         y = "Count") +
    theme_minimal()
}

# Creating plots using a loop
barplots_2vars <- lapply(variables_to_plot, create_bar_plot)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Displaying all plots in a grid
plot_grid(plotlist = barplots_2vars, ncol = 2)

# List of continuous variables
continuous_vars <- c("Age","RestingBloodPressure", "Cholestherol", "MaxHeartRate","Peak")

# Function to create a density plot
create_density_plot <- function(variable) {
  ggplot(heart_df_filtered, aes_string(x = variable, fill = "Diagnosis")) +
    geom_density(alpha = 0.5) +
    labs(title = paste("Density of", variable, "vs Diagnosis"),
         x = variable,
         y = "Density") +
    theme_minimal()
}

# Creating density plots using a loop
density_plots <- lapply(continuous_vars, create_density_plot)

# Displaying all density plots in a grid
plot_grid(plotlist = density_plots, ncol = 2)

# Function to create a violin plot
create_violin_plot <- function(variable) {
  ggplot(heart_df_filtered, aes_string(y = variable, x = "Diagnosis", fill = "Diagnosis")) +
    geom_violin(width = 1.4) +
    geom_boxplot(width = 0.1, color = "grey", alpha = 0.2) +
    scale_fill_viridis(discrete = TRUE) +
    theme_ipsum() +
    theme(legend.position = "none", plot.title = element_text(size = 11)) +
    ggtitle(paste(variable, "vs Diagnosis"))
}

# Creating violin plots using a loop
violin_plots <- lapply(continuous_vars, create_violin_plot)

# Displaying all violin plots in a grid
plot_grid(plotlist = violin_plots, ncol = 2)
## Warning: `position_dodge()` requires non-overlapping x intervals
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning: `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals
## `position_dodge()` requires non-overlapping x intervals

#Since the number of males is higher of the females this data can lead to biased assumptions. Even thoguh we have found that Heart Disease is more prevalent in men than women in this dataset. Most of the disease group reported asymptomatic chestpain. Thalassemia patients with thalassemia defects are also more prone to have Heart Disease. Non anginal chest pain has less proportion of disease patients than typical anginal. Increased Number of major vessels led to higher proportion of disease, since it means that the major vessels diameter is narrower. The group of Patients that experienced exercise induced angina has a larger proportion of disease than those who didn’t.

We found that Age and Maximum Heart Rate are the variables that could be more related to a higher risk of hearth disease. Cholesterol is also distributed differently between Disease and no Disease groups. Furthermore Peak variable, which is the magnitude of the depression of the ST segment, is also found to be higher on Disease group.

2 Difference in mortality rates in hospitalized COVID-19 patients

Using the supplementary material from the Difference in mortality rates in hospitalized COVID-19 patients identified by cytokine profile clustering using a machine learning approach: An outcome prediction alternative, perform the following tasks

Reproduce Figure 1 from the publication

library(dplyr)
library(openxlsx)
library(ggplot2)
library(ggplotify)
library(gridExtra)
library(gridExtra)
library(grid)
#Open supplementary tables. Make the first Row the column names. 
table1 <- read.xlsx("data/Table_1.xlsx" , startRow = 2)
table2 <- read.xlsx("data/Table_2.xlsx" , startRow = 2)

#Clean Table 1:
# Remove rows with NA in the ID column and extract unique records
unique_table1 <- table1 %>%
  filter(!is.na(ID)) %>%
  distinct(ID, .keep_all = TRUE) %>%
  select(ID, Age, Use.of.NIV, Use.of.AMV, ARDS.Diagnosis, Death)
# Delete rows with numbers in certain columns and with characters in Age
unique_table1.2 <- unique_table1 %>%
  filter(!grepl("^[0-9]+$", Use.of.NIV), 
         !grepl("^[0-9]+$", Use.of.AMV), 
         !grepl("^[0-9]+$", ARDS.Diagnosis), 
         !grepl("^[0-9]+$", Death),
         !grepl("^[A-Za-z]+$", Age))


#Cleaning Table 2:
#Change the first column name of "Table 2" to "ID"
colnames(table2)[1] <- "ID"
# Remove rows with NA in the ID column and extract unique records.
table2 <- table2 %>%   
  filter(!is.na(ID)) %>%
  distinct(ID, .keep_all = TRUE) %>%
  select(ID,`IL-6`,CXCL10,`IL-38`,`IL-8`,`IFN-ɑ`,`IL-10`,`TNF-ɑ`,CCL2,CCL3,`IFN-γ`,`IL-1β`,`G-CSF`)

# Define the columns to check as strings
columns_to_check <- c("IL-6", "CXCL10", "IL-38", "IL-8", "IFN-ɑ", "IL-10", "TNF-ɑ", "CCL2", "CCL3", "IFN-γ", "IL-1β", "G-CSF")

# Delete rows with characters in specified columns
unique_table2 <- table2 %>%
  filter(!if_any(all_of(columns_to_check), ~grepl("^[A-Za-z]+$", .)))

# Delete the rows with "NI" in specified columns
unique_table2.2 <- unique_table2 %>%
  filter(!if_any(columns_to_check, ~grepl("NI", .)))
## Warning: There was 1 warning in `filter()`.
## ℹ In argument: `!if_any(columns_to_check, ~grepl("NI", .))`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(columns_to_check)
## 
##   # Now:
##   data %>% select(all_of(columns_to_check))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
# Identify common IDs in both tables
common_ids <- intersect(unique_table1.2$ID, unique_table2.2$ID)

# Filter tables to keep only rows with common IDs
filtered_table1 <- unique_table1.2 %>% filter(ID %in% common_ids)
filtered_table2 <- unique_table2.2 %>% filter(ID %in% common_ids)

# Make the classification column with groups "G1, G2, G3, G4" based on expert-based criteria.
table1.classification <- filtered_table1 %>%
  mutate(Classification = case_when(
    Use.of.NIV == "No" & Use.of.AMV == "No" & ARDS.Diagnosis == "No" ~ "G1",
    ARDS.Diagnosis == "No" & (Use.of.NIV == "Yes" | Use.of.AMV == "Yes") ~ "G2",
    Use.of.NIV == "Yes" & Use.of.AMV == "No" & ARDS.Diagnosis == "Yes" ~ "G3",
    ARDS.Diagnosis == "Yes" & Use.of.AMV == "Yes" & (Use.of.NIV == "Yes" | Use.of.NIV == "No") ~ "G4",
    TRUE ~ NA_character_
  )) %>%
# Remove rows with NA in the Classification column
filter(!is.na(Classification))

# Figure 1 A.
Fig1_a <- ggplot(table1.classification, aes(x = Age)) + 
  geom_histogram(binwidth = 10, fill = "lightblue", color = "black") +
  scale_y_continuous(limits = c(0, 50)) +
  scale_x_continuous(breaks = seq(20, 100, by = 10), limits = c(20, 100)) +
  labs(x = "Age (years)", y = "Frequency (n)", title = "") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Fig1_a
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

#Figure 1 B.
first_column <- c("G1", "G2", "G3","G4")
second_column <- c("-", "-/+", "+","-/+")
third_column <- c("-", "+/-", "-","+")
forth_column <- c("-", "-", "+","+")
table1_classification <- data.frame(first_column, second_column, third_column, forth_column)
names = c("Clinical\n Classification", "NIV", "AMV", "ARDS")
colnames(table1_classification) <- names
rownames(table1_classification) <- c(" ","  ","   ","    ")

Fig1_b <- grid.arrange(top="Definition of the Clincal Classification",tableGrob(table1_classification))

Fig1_b_grob <- as_grob(plot = Fig1_b)

#Figure 1 C.
classification_freq <- table(table1.classification$Classification)
classification_df <- as.data.frame(classification_freq)
names(classification_df) <- c("Classification", "Frequency")

Fig1_c <- ggplot(classification_df, aes(x = Classification, y = Frequency, fill = Classification)) +
  geom_bar(stat = "identity", width = 0.7, color = "black") +  
  geom_text(aes(label = Frequency), vjust = -0.3, color = "black") +  
  scale_fill_manual(values = c("aquamarine3", "khaki1", "mediumpurple1", "salmon")) +
  labs(title = "Clinical Classification", x = "Clinical Classification", y = "Frequency (n)") +
  ylim(0, 90) +
  theme_minimal() +
  theme(legend.position = "none") 

#Figure 1 D.
Death_freq <- table(table1.classification$Death)
Death_df <- as.data.frame(Death_freq)
names(Death_df) <- c("VitalStatus", "Frequency")

Fig1_d <- ggplot(Death_df, aes(x = VitalStatus, y = Frequency, fill = VitalStatus)) +
  geom_bar(stat = "identity", width = 0.7, color = "black") +  
  geom_text(aes(label = Frequency), vjust = -0.3, color = "black") +  
  scale_fill_manual(values = c("turquoise", "gold")) +
  labs(title = "Vital status", x = "Death", y = "Frequency (n)") +
  ylim(0, 150) +
  theme_minimal() +
  theme(legend.position = "none")

print(Fig1_d)

grid.arrange(Fig1_a, Fig1_b_grob, Fig1_c, Fig1_d, ncol = 2)
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

Reproduce Figure 2 from the publication

but instead of representing the clusters in the annotation, represent the groups (G1 to G4)

library(caret)
## Loading required package: lattice
library(RColorBrewer)
library(ggfortify)
library(cluster)
library(ggpubr)
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
## 
##     get_legend
library(survival)
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
# Select the needed columns and rename the labels to make them more easy to handle
unique_table2.2 <- unique_table2.2 %>% 
  select(ID,`IL-6`,CXCL10,`IL-38`,CCL3,`IFN-γ`,`IL-1β`,`IL-10`,`G-CSF`,`IFN-ɑ`
         ,`TNF-É‘`,`IL-8`,CCL2) %>% 
  rename_all(~ gsub("-", "", .)) %>%
  #Turn all non numeric values to NA
  mutate(across(-ID,as.numeric))

#Merge table 2 with classification from table 1 with ID.
table2_1_merged <- unique_table2.2 %>% 
  inner_join(table1.classification %>% select(ID, Classification), by = "ID")








#Define color pallette and assign a color to each group
color <- brewer.pal(4, "Set1")
classification_colors <- setNames(color, c("G1", "G2", "G3", "G4"))
classifications <- table2_1_merged$Classification
classification_colside <- classification_colors[classifications]










#Plot using Heatmap
heatmap_plot <- heatmap(
  as.matrix(t(table2_1_merged[, 2:13])),
  cexCol = 0.1,
  xlab = "Patients",
  labCol = FALSE,
  col = brewer.pal(9, "Oranges"),
  scale = "column",
  ColSideColors = classification_colside,
)

legend("topright",legend=c("G1", "G2", "G3", "G4"),fill=classification_colors)
legend(legend = c("0.00", "50.0%", "100.0%"), 
       fill = (brewer.pal(9,"Oranges")[c(1,5,9)]), title = "Relative\nExpression",
       x = 0, y = 0.3)

#When plotting without clusters, groups are not clustered since there is nothing in the data that relates the groups, for that reason we see the group label unsorted.

Improve figure 2 of the publication

Add a second annotation with information of deathm and a third one with information of gender

library(ComplexHeatmap)
## ========================================
## ComplexHeatmap version 2.16.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(RColorBrewer)

#Clean table.
unique_table1.3 <- table1 %>%
  filter(!is.na(ID)) %>%
  distinct(ID, .keep_all = TRUE)
  
#Merge by ID only columns of gender and death from Table 1.
table12_group_gender <- inner_join(
  table2_1_merged,
  unique_table1.3 %>% select(ID, Gender, Death),
  by = "ID"
)

#Remove numeric character and right strip space character. 
table12_group_gender <- table12_group_gender %>%
  filter(!grepl("\\d", Gender))

#Get numeric data for Heatmap.
numeric_data <- table12_group_gender[, -c(1, 14, 15, 16)]

#Annotations.
gender_info <- table12_group_gender$Gender
death_info <- table12_group_gender$Death
classification_info <- table12_group_gender$Classification
gender_info <- replace(gender_info, gender_info =="F ", "F")

ha <- HeatmapAnnotation(
    Deaths = death_info , 
    Gender = gender_info,
    Groups = classification_info, 
    col = list(Deaths = c("Yes" = "black", "No" = "gray"),
               Groups = c("G1" = "red", "G2" = "green", "G3" = "blue", "G4" = "purple"),
               Gender = c("M" = "royalblue", "F" = "pink")
    )
)

Heatmap(
  scale(as.matrix(t(numeric_data))),
  name = "Relative\nexpression",
  column_title = "Patients",
  column_title_side = "bottom",
  top_annotation = ha,
  col = brewer.pal(9, "Oranges"),
  
  heatmap_legend_param = list(
    title = "Relative\nexpression",
    at = c(-2,1,4),  
    labels = c("0", "50%", "100%")
  )

)
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IFNγ' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IFNγ' in 'mbcsToSbcs': dot substituted for <b3>
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IL1β' in 'mbcsToSbcs': dot substituted for <ce>
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IL1β' in 'mbcsToSbcs': dot substituted for <b2>
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IFNÉ‘' in 'mbcsToSbcs': dot substituted for <c9>
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'IFNÉ‘' in 'mbcsToSbcs': dot substituted for <91>
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'TNFÉ‘' in 'mbcsToSbcs': dot substituted for <c9>
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## conversion failure on 'TNFÉ‘' in 'mbcsToSbcs': dot substituted for <91>

session info

R version 4.3.1 (2023-06-16) Platform: aarch64-apple-darwin20 (64-bit) Running under: macOS Sonoma 14.2.1

Matrix products: default BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0

locale: [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/Madrid tzcode source: internal

attached base packages: [1] grid stats graphics grDevices utils datasets methods
[8] base

other attached packages: [1] ComplexHeatmap_2.16.0 survival_3.5-7 ggpubr_0.6.0
[4] cluster_2.1.6 ggfortify_0.4.16 RColorBrewer_1.1-3
[7] caret_6.0-94 lattice_0.22-5 ggplotify_0.1.2
[10] openxlsx_4.2.5.2 gridExtra_2.3 viridis_0.6.4
[13] viridisLite_0.4.2 hrbrthemes_0.8.0 cowplot_1.1.3
[16] ggplot2_3.4.4 dplyr_1.1.4

loaded via a namespace (and not attached): [1] shape_1.4.6 rstudioapi_0.15.0 jsonlite_1.8.8
[4] magrittr_2.0.3 farver_2.1.1 rmarkdown_2.25
[7] GlobalOptions_0.1.2 fs_1.6.3 vctrs_0.6.5
[10] memoise_2.0.1 rstatix_0.7.2 htmltools_0.5.7
[13] curl_5.2.0 broom_1.0.5 gridGraphics_0.5-1
[16] pROC_1.18.5 sass_0.4.8 parallelly_1.36.0
[19] bslib_0.6.1 plyr_1.8.9 lubridate_1.9.3
[22] cachem_1.0.8 mime_0.12 lifecycle_1.0.4
[25] iterators_1.0.14 pkgconfig_2.0.3 Matrix_1.6-5
[28] R6_2.5.1 fastmap_1.1.1 clue_0.3-65
[31] future_1.33.1 shiny_1.8.0 digest_0.6.34
[34] colorspace_2.1-0 S4Vectors_0.38.2 labeling_0.4.3
[37] fansi_1.0.6 timechange_0.3.0 abind_1.4-5
[40] compiler_4.3.1 fontquiver_0.2.1 withr_3.0.0
[43] doParallel_1.0.17 backports_1.4.1 carData_3.0-5
[46] highr_0.10 Rttf2pt1_1.3.12 ggsignif_0.6.4
[49] MASS_7.3-60.0.1 lava_1.7.3 rjson_0.2.21
[52] gfonts_0.2.0 ModelMetrics_1.2.2.2 tools_4.3.1
[55] zip_2.3.0 httpuv_1.6.13 extrafontdb_1.0
[58] future.apply_1.11.1 nnet_7.3-19 glue_1.7.0
[61] nlme_3.1-164 promises_1.2.1 reshape2_1.4.4
[64] generics_0.1.3 recipes_1.0.9 gtable_0.3.4
[67] class_7.3-22 tidyr_1.3.0 data.table_1.14.10
[70] car_3.1-2 utf8_1.2.4 BiocGenerics_0.46.0
[73] foreach_1.5.2 pillar_1.9.0 stringr_1.5.1
[76] yulab.utils_0.1.3 later_1.3.2 circlize_0.4.15
[79] splines_4.3.1 tidyselect_1.2.0 fontLiberation_0.1.0
[82] knitr_1.45 fontBitstreamVera_0.1.1 IRanges_2.34.1
[85] crul_1.4.0 stats4_4.3.1 xfun_0.41
[88] hardhat_1.3.0 timeDate_4032.109 matrixStats_1.2.0
[91] stringi_1.8.3 yaml_2.3.8 evaluate_0.23
[94] codetools_0.2-19 httpcode_0.3.0 extrafont_0.19
[97] gdtools_0.3.5 tibble_3.2.1 cli_3.6.2
[100] rpart_4.1.23 xtable_1.8-4 systemfonts_1.0.5
[103] munsell_0.5.0 jquerylib_0.1.4 Rcpp_1.0.12
[106] globals_0.16.2 png_0.1-8 parallel_4.3.1
[109] ellipsis_0.3.2 gower_1.0.1 listenv_0.9.0
[112] ipred_0.9-14 scales_1.3.0 prodlim_2023.08.28
[115] purrr_1.0.2 crayon_1.5.2 GetoptLong_1.0.5
[118] rlang_1.1.3