library(tidyverse) library(cluster) library(plotly) library(fpc) library(dendextend) library(factoextra) library(FactoMineR) library(NbClust) library(caret) library(DMwR)
data_cluster_2 <- read_csv("/Users/gabrielburcea/rprojects/stats_data_whole/data_categ_nosev_comorbidity_one.csv") level_key_temperature <- c("Yes" = "37.5-38", "Yes" = "38.1-39", "Yes" = "38.2-39", "Yes" = "39.1-41") data_select <- data_cluster %>% dplyr::select(id, covid_tested, chills, cough, diarrhoea, fatigue, headache, loss_smell_taste, muscle_ache, nasal_congestion, nausea_vomiting, shortness_breath, sore_throat, sputum, temperature, asthma, diabetes_type_one, diabetes_type_two, obesity, hypertension, heart_disease, lung_condition, liver_disease, kidney_disease) %>% dplyr::filter(covid_tested != "none") covid_tested_levels <- c("positive" = "showing symptoms") data_transf <- data_select %>% dplyr::mutate(covid_tested = forcats::fct_recode(covid_tested, !!!covid_tested_levels), temperature = forcats::fct_recode(temperature, !!!level_key_temperature)) %>% dplyr::filter(covid_tested != "negative") data_piv <- data_transf %>% pivot_longer(cols = 16:24, names_to = "Comorbidities", values_to = "Bolean") %>% dplyr::filter(Bolean == "Yes") %>% pivot_longer(cols = 3:15, names_to = "Symptoms", values_to = "Yes_No") %>% dplyr::group_by(Comorbidities, Symptoms, Yes_No) %>% dplyr::summarise(Count = n()) %>% dplyr::mutate(Freq = Count / sum(Count)*100) %>% dplyr::filter(Yes_No == "Yes") data_piv$Bolean <- NULL data_piv$Yes_No <- NULL data_piv$Count <- NULL data_piv <- data_piv %>% pivot_wider(names_from = Symptoms, values_from = Freq) rownames(data_piv) <- data_piv$Comorbidities # scale data
# Disimilarity matrix d <- dist(data_piv, method = "euclidean") #Hierarchicla clustering using Complete Linkage hc_complete <- hclust(d, method = "complete") #Plot the obtained dendogram plot(hc_complete, cex = 0.6, hang = -31)
# Compute with agnes hc_agnes <- agnes(data_piv, method = "complete") # Agglomerative coeffiecient hc_agnes$ac
hc_agnes_2 <- agnes(data_piv, method = "ward") pltree(hc_agnes_2, cex = 0.6, hang = -1, main = "Dendrogram of agnes")
Divisive Hierarchicla Clustering
# compute divisive hierarchical clustering hc_diana<- diana(data_piv) # Divisive coefficient; amount of clustering structure found hc_diana$dc #plot dendogram pltree(hc_diana, cex = 0.6, hang = -1, main = "Dendogram of diana")
Working with Dendrograms
# Ward's method hc_ward_method <- hclust(d, method = "ward.D2") # Cut tree into 4 groups sub_grp <- cutree(hc_ward_method, k = 4) #Number of countries in each cluster table(sub_grp)
plot(hc_ward_method, cex =0.6) rect.hclust(hc_ward_method , k = 3, border = 2:5)
res_hc <- data_piv %>% dist(method = "euclidean") %>% hclust(method = "ward.D2") fviz_dend(res_hc, k = 4, cex = 0.5, k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"), color_labels_by_k = TRUE, rect = TRUE)
#data_piv$chills <- as.numeric(data_piv$chills ) # data_piv$cough <- as.integer(data_piv$cough ) # data_piv$diarrhoea <- as.integer(data_piv$diarrhoea) # data_piv$fatigue <- as.integer(data_piv$fatigue) # data_piv$headache <- as.integer(data_piv$headache) # data_piv$loss_smell_taste <- as.integer(data_piv$loss_smell_taste) # data_piv$muscle_ache <- as.integer(data_piv$muscle_ache) # data_piv$nasal_congestion <- as.integer(data_piv$nasal_congestion) # data_piv$nausea_vomiting <- as.integer(data_piv$nausea_vomiting) # data_piv$shortness_breath <- as.integer(data_piv$shortness_breath) # data_piv$sore_throat <- as.integer(data_piv$sore_throat) # data_piv$sputum <- as.integer(data_piv$sputum) # data_piv$temperature <- as.integer(data_piv$temperature) level_key_comorbidities <- c("kidney disease" = "kidney_disease", "lung condition" = "lung_condition", "diabetes type one" = "diabetes_type_one", "diabetes type two" = "diabetes_type_two", "liver disease" = "liver_disease", "heart disease" = "heart_disease") data_pi<- data_piv %>% tidyr::replace_na(list(nausea_vomiting = 0)) %>% dplyr::mutate(Comorbidities = forcats::fct_recode(Comorbidities, !!!level_key_comorbidities)) %>% dplyr::rename('loss of smell and taste' = loss_smell_taste, 'muscle ache' = muscle_ache, 'nasal congestion'= nasal_congestion, 'nausea and vomiting' = nausea_vomiting, 'shortness of breath' = shortness_breath, 'sore throat' = sore_throat) data_pi <- as.data.frame(data_pi) data_pi$Comorbidities <- as.character(data_pi$Comorbidities) rownames(data_pi) <- data_pi$Comorbidities #data_scaled <- as.data.frame(scale(data_piv[2:14]))
km_res <- kmeans(data_pi[,2:14], centers = 3, nstart = 25) print(km_res)
fviz_cluster(km_res, data = data_pi[2:14])
# Cut agnes() tree into 3 groups hc_a <- agnes(data_pi, method = "ward") cutree(as.hclust(hc_a), k = 3) # Cut diana() tree into 3 groups hc_d <- diana(data_pi) cutree(as.hclust(hc_d), k = 3)
Comparing two deprograms. Comparing hierarchical clustering with complete linkage versus Ward's method.
The output displays "unique" nodes, with a combination of labels/items not present in the other tree highlighted with dashed line. The quality of the alignment of the two trees can be measured using the function entanglement. Entanglement is a measure between 1 (full entanglement) 0 (no entanglement). A lower entanglement coefficient corresponds to a good alignment.
# Compute distance matrix res_dist <- dist(data_pi, method = "euclidean") # Compute 2 hierarchical clusterings hc_complete <- hclust(res_dist, method = "complete") hc_ward <- hclust(res_dist, method = "ward.D2") # Create two dendrograms dend_complete <- as.dendrogram (hc_complete) dend_ward <- as.dendrogram (hc_ward) tanglegram(dend_complete, dend_ward, margin_inner = 10)
dend_list <- dendlist(dend_complete, dend_ward) tanglegram(dend_complete, dend_ward, margin_inner = 10, highlight_distinct_edges = FALSE, #Turn-off dashed line common_subtrees_color_lines = FALSE, # Turn-off line colors common_subtrees_color_branches = TRUE, # Color common branches main = paste("entanglement =", round(entanglement(dend_list),2)))
Partitioning clustering
I. K-means clustering: 1. each cluster represented by the center (mean of the data points) 2. sensitive to outliers
II. K-medoids clustering /PAM (Partitioning around Medoids) 1. each cluster represented by one of the objects in the cluster 2. less sensitive to outliers
III. CLARA algorithm (Clustering Large Applications) = PAM for large datasets ??
Optimal number of clusters
fviz_nbclust(data_pi[2:14], kmeans, method = "wss",k.max = 8 )
Average Silhouette Method - this method does suggest the same number of clusters just as elbow has indicated.
fviz_nbclust(data_pi[2:4], kmeans, method = "silhouette", k.max = 8)
Gap Statistic Method - the same
gap_stat <- clusGap(data_pi[2:14], kmeans, nstart = 25, K.max = 8, B = 50) fviz_gap_stat(gap_stat)
# data_piv %>% # mutate(Cluster = km_res$cluster) %>% # group_by(Cluster) %>% # summarise_all("mean")
library(pheatmap) pheatmap(t(data_pi[-1]), cutree_cols = 3)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.