knitr::opts_chunk$set(echo = TRUE)
library(tidyverse) library(cluster) library(plotly) library(fpc) library(dendextend) library(factoextra) library(FactoMineR) library(NbClust) library(caret) library(DMwR) library(EnsCat)
data_cluster <- read_csv("/Users/gabrielburcea/rprojects/data/your.md/cleaned_data_22092020.csv") data_select <- data_cluster %>% dplyr::select(id, covid_tested, age, gender, number_morbidities, chills, cough, diarrhoea, fatigue, headache, loss_smell_taste, muscle_ache, nasal_congestion, nausea_vomiting, shortness_breath, sore_throat, sputum, temperature, loss_appetite, chest_pain, itchy_eyes, joint_pain, asthma, diabetes_type_one, diabetes_type_two, obesity, hypertension, heart_disease, lung_condition, liver_disease, kidney_disease) %>% dplyr::filter(covid_tested == "positive") %>% dplyr::filter(number_morbidities <= 1) %>% drop_na() # data_select <- dplyr::mutate(data_select, # n_cmdt = ifelse(number_morbidities %in% 0:1, "1 morbidity", # ifelse(number_morbidities %in% 2 , "2 comorbidities", # ifelse(number_morbidities %in% 3:99, ">3 comorbidites", NA)))) %>% # dplyr::select(-number_morbidities) # covid_tested_levels <- c("positive" = "showing symptoms") # data_transf <- data_select %>% # dplyr::mutate(covid_tested = forcats::fct_recode(covid_tested, !!!covid_tested_levels)) # data_piv <- data_select %>% # pivot_longer(cols = 23:31, # names_to = "comorbidities", # values_to = "bolean") #%>% # # bolean_levels <- c("healthy" = "No") # data_piv <- data_piv %>% # dplyr::mutate(bolean = forcats::fct_recode(bolean, !!!bolean_levels)) %>% # mutate( # b = as.character(bolean), # unhealthy = b != "healthy", # comorbidities = replace(b, unhealthy, comorbidities[unhealthy]), # b = NULL, unhealthy = NULL # ) # data_piv <- data_piv %>% # distinct() #data_piv_id <- dplyr::mutate(data_piv, respondents_id = rownames(data_piv)) # data_select <- data_piv %>% # dplyr::select(id, comorbidities, chills, cough, diarrhoea, fatigue, headache, loss_smell_taste, muscle_ache, # nasal_congestion, nausea_vomiting, shortness_breath, sore_throat, sputum, temperature, loss_appetite, chest_pain, itchy_eyes, joint_pain) %>% # drop_na() # # data_select <- data_select %>% # dplyr::mutate(age_band = case_when(age == 0 | age <= 19 ~ '0-19', # age == 20 | age <= 20 ~ '20-29', # age == 30 | age <= 39 ~ '30-39', # age == 40| age <= 49 ~ '40-49', # age == 50| age <= 59 ~ '50-59', # age == 60 | age <= 69 ~ '60-69', # age == 70 | age <= 79 ~ '70-79', # age == 80 | age <= 89 ~ '80-89', # age >= 90 ~ '90+')) %>% # dplyr::select(-age) #data_rec <- ifelse(data_piv_id[,3:19] == "Yes", 1,0) #data_select <- cbind(data_piv_id[,1:2], data_rec) data_select$gender <- as.factor(data_select$gender) #data_select$age_band <- as.factor(data_select$age_band) # data_select$comorbidities <- as.factor(data_select$comorbidities) # data_select$n_cmdt <- as.factor(data_select$n_cmdt) data_select$chills <- as.factor(data_select$chills) data_select$cough <- as.factor(data_select$cough) data_select$diarrhoea <- as.factor(data_select$diarrhoea) data_select$fatigue <- as.factor(data_select$fatigue) data_select$headache <- as.factor(data_select$headache) data_select$loss_smell_taste <- as.factor(data_select$loss_smell_taste) data_select$muscle_ache <- as.factor(data_select$muscle_ache) data_select$ nasal_congestion <- as.factor(data_select$ nasal_congestion) data_select$nausea_vomiting <- as.factor(data_select$nausea_vomiting) data_select$shortness_breath <- as.factor(data_select$shortness_breath) data_select$sore_throat <- as.factor(data_select$sore_throat) data_select$sputum <- as.factor(data_select$sputum) data_select$temperature <- as.factor(data_select$temperature) data_select$loss_appetite <- as.factor(data_select$loss_appetite) data_select$chest_pain <- as.factor(data_select$chest_pain) data_select$itchy_eyes <- as.factor(data_select$itchy_eyes) data_select$joint_pain <- as.factor(data_select$joint_pain) data_select <- as.data.frame(data_select) #data_test <- head(data_select, 5000) #rownames(data_test) <- data_test$respondents_id rownames(data_select) <- data_select$respondents_id my.seed <- set.seed(22) gower_distance <- cluster::daisy(data_select[,6:22], metric = "gower")
Note that the echo = FALSE
parameter was added to the code chunk to prevent printing of the R code that generated the plot.
#Hierarchicla clustering using Complete Linkage hc_complete_gower <- hclust(gower_distance, method = "complete") #Plot the obtained dendogram plot(hc_complete_gower, cex = 0.6, hang = -31)
# Compute with agnes hc_agnes <- agnes(data_select[,6:22], method = "ward") # Agglomerative coeffiecient hc_agnes$ac # Agglomerative coeffiecient #hc_agnes$ac #hc_agnes_2 <- agnes(data_test[,3:22], method = "ward") pltree(hc_agnes, cex = 0.6, hang = -1, main = "Dendrogram of agnes")
# compute divisive hierarchical clustering hc_diana <- diana(data_select[,6:22]) # Divisive coefficient; amount of clustering structure found hc_diana$dc #plot dendogram pltree(hc_diana, cex = 0.6, hang = -1, main = "Dendogram of diana")
# Ward's method hc_ward_method <- hclust(gower_distance, method = "ward.D2") # Cut tree into 4 groups sub_grp <- cutree(hc_agnes, k = 4) #Number of countries in each cluster table(sub_grp) plot(hc_agnes) rect.hclust(hc_ward_method , k = 4, border = 2:5)
cstats.table <- function(dist, tree, k) { clust.assess <- c( "cluster.number", "n", "within.cluster.ss", "average.within", "average.between", "wb.ratio", "dunn2", "avg.silwidth" ) clust.size <- c("cluster.size") stats.names <- c() row.clust <- c() output.stats <- matrix(ncol = k, nrow = length(clust.assess)) cluster.sizes <- matrix(ncol = k, nrow = k) for (i in c(1:k)) { row.clust[i] <- paste("Cluster-", i, " size") } for (i in c(2:k)) { stats.names[i] <- paste("Test", i - 1) for (j in seq_along(clust.assess)) { output.stats[j, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.assess])[j] } for (d in 1:k) { cluster.sizes[d, i] <- unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.size])[d] dim(cluster.sizes[d, i]) <- c(length(cluster.sizes[i]), 1) cluster.sizes[d, i] } } output.stats.df <- data.frame(output.stats) cluster.sizes <- data.frame(cluster.sizes) cluster.sizes[is.na(cluster.sizes)] <- 0 rows.all <- c(clust.assess, row.clust) # rownames(output.stats.df) <- clust.assess output <- rbind(output.stats.df, cluster.sizes)[, -1] colnames(output) <- stats.names[2:k] rownames(output) <- rows.all is.num <- sapply(output, is.numeric) output[is.num] <- lapply(output[is.num], round, 2) output }
stats_divisive <- cstats.table(gower_distance, hc_diana, 7) stats_divisive knitr::kable(stats_divisive)
stats_agglomerative <- cstats.table(gower_distance, hc_agnes, 7) knitr::kable(stats_agglomerative)
elb_div <- ggplot(data = data.frame(t( cstats.table(gower_distance, hc_diana, 15) )), aes(x = cluster.number, y = within.cluster.ss)) + geom_point() + geom_line() + ggtitle("Divisive clustering") + labs(x = "Number of clusters", y = "Within clusters sum of squares(SS)") + theme(plot.title = element_text(hjust = 0.5)) + theme_minimal() elb_div
sil_div <- ggplot(data = data.frame(t( cstats.table(gower_distance, hc_diana , 15) )), aes(x = cluster.number, y = avg.silwidth)) + geom_point() + geom_line() + ggtitle("Divisive clustering") + labs(x = "Number of clusters", y = "Average silhouette width") + theme(plot.title = element_text(hjust = 0.5)) + theme_minimal() sil_div
# fviz_dend(hc_agnes, k = 3, # cex = 0.5, # k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"), # color_labels_by_k = TRUE, # rect = TRUE)
hc_diana_tree <- as.data.frame(cutree(hc_diana, k=4)) data_clustered_1<- cbind(hc_diana_tree, data_select) write.csv(data_clustered_1, file = "/Users/gabrielburcea/rprojects/data/addded_clusters_1.csv")
table <- data_clustered_1 %>% pivot_longer(cols = 24:32, names_to = "comorbidities", values_to = "bolean_yes_no") %>% mutate(comorbidities_2 = case_when(bolean_yes_no == "No" ~ "healthy", TRUE ~ comorbidities)) table_2 <- table %>% group_by(id) %>% mutate(morbidity_healthy = case_when(bolean_yes_no == 'Yes' ~ comorbidities, (!any(bolean_yes_no == 'Yes')) & row_number()==1 ~ 'healthy')) %>% select(-comorbidities, -comorbidities_2, bolean_yes_no) %>% drop_na() final_results_symptoms_all <- table_2 %>% dplyr::rename(cluster = "cutree(hc_diana, k = 4)") %>% dplyr::group_by(cluster, morbidity_healthy) %>% tally() %>% dplyr::mutate(Percentage = n/sum(n) * 100) write.csv(table_2, file = "/Users/gabrielburcea/rprojects/data/table_clusters_added.csv", row.names = FALSE) write.csv(final_results_symptoms_all, file = "/Users/gabrielburcea/rprojects/data/final_results_symptoms_all.csv", row.names = FALSE) final_results_symptoms <- table %>% dplyr::rename(cluster = "cutree(hc_agnes, k = 3)") %>% dplyr::group_by(cluster, symptoms) %>% tally() %>% dplyr::mutate(Percentage = n/sum(n) *100) %>% dplyr::select(cluster, symptoms, Percentage) %>% pivot_wider(names_from = cluster, values_from = "Percentage") write.csv(final_results_symptoms, file = "/Users/gabrielburcea/rprojects/data/final_results_symptoms.csv", row.names = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.