library(tidyverse)
library(cluster)
library(plotly)
library(fpc)
library(dendextend)
library(factoextra)
library(FactoMineR)
library(NbClust)
library(caret)
library(DMwR)
data_cluster_2 <- read_csv("/Users/gabrielburcea/rprojects/stats_data_whole/data_categ_nosev_comorbidity_one.csv")

level_key_temperature <- c("Yes" = "37.5-38", 
                           "Yes" = "38.1-39", 
                           "Yes" =  "38.2-39",
                           "Yes" = "39.1-41")



data_select <- data_cluster %>%
  dplyr::select(id, covid_tested, chills, cough, diarrhoea, fatigue, headache, loss_smell_taste, muscle_ache, 
                nasal_congestion, nausea_vomiting, shortness_breath, sore_throat, sputum, temperature, asthma, diabetes_type_one,
                diabetes_type_two, obesity, hypertension, heart_disease, lung_condition, liver_disease, kidney_disease) %>%
  dplyr::filter(covid_tested != "none") 

covid_tested_levels <- c("positive" = "showing symptoms")

data_transf <- data_select %>% 
  dplyr::mutate(covid_tested = forcats::fct_recode(covid_tested, !!!covid_tested_levels), 
                temperature = forcats::fct_recode(temperature, !!!level_key_temperature)) %>%
  dplyr::filter(covid_tested != "negative")



data_piv <- data_transf %>%
  pivot_longer(cols = 16:24, 
               names_to = "Comorbidities",
               values_to = "Bolean") %>%
  dplyr::filter(Bolean == "Yes") %>%
  pivot_longer(cols = 3:15, 
               names_to = "Symptoms", 
               values_to = "Yes_No")  %>%
  dplyr::group_by(Comorbidities, Symptoms, Yes_No) %>%
  dplyr::summarise(Count = n()) %>%
  dplyr::mutate(Freq = Count / sum(Count)*100) %>% 
  dplyr::filter(Yes_No == "Yes")


data_piv$Bolean <- NULL
data_piv$Yes_No <- NULL
data_piv$Count <- NULL

data_piv <- data_piv %>%
  pivot_wider(names_from = Symptoms, values_from = Freq)

rownames(data_piv) <- data_piv$Comorbidities

# scale data
# Disimilarity matrix 
d <- dist(data_piv, method  = "euclidean")

#Hierarchicla clustering using Complete Linkage
hc_complete <- hclust(d, method = "complete")

#Plot the obtained dendogram 
plot(hc_complete, cex = 0.6, hang = -31)
# Compute with agnes 
hc_agnes <- agnes(data_piv, method = "complete")

# Agglomerative coeffiecient 
hc_agnes$ac
hc_agnes_2 <- agnes(data_piv, method = "ward")

pltree(hc_agnes_2, cex = 0.6, hang = -1, main = "Dendrogram of agnes")

Divisive Hierarchicla Clustering

# compute divisive hierarchical clustering 
hc_diana<- diana(data_piv)

# Divisive coefficient; amount of clustering structure found 
hc_diana$dc


#plot dendogram 

pltree(hc_diana, cex = 0.6, hang = -1, main = "Dendogram of diana")

Working with Dendrograms

# Ward's method 

hc_ward_method <- hclust(d, method = "ward.D2")

# Cut tree into 4 groups 

sub_grp <- cutree(hc_ward_method, k = 4)

#Number of countries in each cluster
table(sub_grp)
plot(hc_ward_method,  cex =0.6)

rect.hclust(hc_ward_method , k = 3, border = 2:5)
res_hc <- data_piv %>%
  dist(method = "euclidean") %>%
  hclust(method = "ward.D2")

fviz_dend(res_hc, k = 4, 
          cex = 0.5, 
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"), 
          color_labels_by_k = TRUE, 
          rect = TRUE)
#data_piv$chills <- as.numeric(data_piv$chills )
# data_piv$cough <- as.integer(data_piv$cough )
# data_piv$diarrhoea <- as.integer(data_piv$diarrhoea)
# data_piv$fatigue <- as.integer(data_piv$fatigue)
# data_piv$headache <- as.integer(data_piv$headache)
# data_piv$loss_smell_taste <- as.integer(data_piv$loss_smell_taste)
# data_piv$muscle_ache <- as.integer(data_piv$muscle_ache)
# data_piv$nasal_congestion <- as.integer(data_piv$nasal_congestion)
# data_piv$nausea_vomiting <- as.integer(data_piv$nausea_vomiting)
# data_piv$shortness_breath <- as.integer(data_piv$shortness_breath)
# data_piv$sore_throat <- as.integer(data_piv$sore_throat)
# data_piv$sputum <- as.integer(data_piv$sputum)
# data_piv$temperature <- as.integer(data_piv$temperature)

level_key_comorbidities <-
  c("kidney disease" = "kidney_disease",
    "lung condition" = "lung_condition",
    "diabetes type one" = "diabetes_type_one",
    "diabetes type two" = "diabetes_type_two",
    "liver disease" = "liver_disease",
    "heart disease" = "heart_disease")


data_pi<- data_piv %>%
  tidyr::replace_na(list(nausea_vomiting = 0)) %>%
  dplyr::mutate(Comorbidities = forcats::fct_recode(Comorbidities, !!!level_key_comorbidities)) %>%
  dplyr::rename('loss of smell and taste' = loss_smell_taste, 'muscle ache' = muscle_ache, 'nasal congestion'= nasal_congestion,
                'nausea and vomiting' = nausea_vomiting, 'shortness of breath' = shortness_breath, 'sore throat' = sore_throat)


data_pi <- as.data.frame(data_pi)
data_pi$Comorbidities <- as.character(data_pi$Comorbidities)
rownames(data_pi) <- data_pi$Comorbidities
#data_scaled <- as.data.frame(scale(data_piv[2:14]))
km_res <- kmeans(data_pi[,2:14], centers = 3, nstart = 25)

print(km_res)
fviz_cluster(km_res, data = data_pi[2:14])
# Cut agnes() tree into 3 groups
hc_a <- agnes(data_pi, method = "ward")
cutree(as.hclust(hc_a), k = 3)

# Cut diana() tree into 3 groups
hc_d <- diana(data_pi)
cutree(as.hclust(hc_d), k = 3)

Comparing two deprograms. Comparing hierarchical clustering with complete linkage versus Ward's method.

The output displays "unique" nodes, with a combination of labels/items not present in the other tree highlighted with dashed line. The quality of the alignment of the two trees can be measured using the function entanglement. Entanglement is a measure between 1 (full entanglement) 0 (no entanglement). A lower entanglement coefficient corresponds to a good alignment.

# Compute distance matrix
res_dist <- dist(data_pi, method = "euclidean")

# Compute 2 hierarchical clusterings
hc_complete <- hclust(res_dist, method = "complete")
hc_ward <- hclust(res_dist, method = "ward.D2")

# Create two dendrograms
dend_complete <- as.dendrogram (hc_complete)
dend_ward <- as.dendrogram (hc_ward)

tanglegram(dend_complete, dend_ward, 
           margin_inner = 10)
dend_list <- dendlist(dend_complete, dend_ward)

tanglegram(dend_complete, dend_ward, 
           margin_inner = 10,
           highlight_distinct_edges = FALSE, #Turn-off dashed line
           common_subtrees_color_lines = FALSE, # Turn-off line colors
           common_subtrees_color_branches = TRUE, # Color common branches
           main = paste("entanglement =", round(entanglement(dend_list),2)))

Partitioning clustering

I. K-means clustering: 1. each cluster represented by the center (mean of the data points) 2. sensitive to outliers

II. K-medoids clustering /PAM (Partitioning around Medoids) 1. each cluster represented by one of the objects in the cluster 2. less sensitive to outliers

III. CLARA algorithm (Clustering Large Applications) = PAM for large datasets ??

Optimal number of clusters

  1. Elbow method: location of bend in WSS(within-cluster sum of square for eac K) plot - minimize intra-cluster variation
  2. Average silhouette method: maximum of average silhouette curve (average silhouette of observations for each K)
  3. Gap statistic method: compares the total within intra-cluster variation for different values of K with their expected values under null reference distribution of the data
fviz_nbclust(data_pi[2:14],  kmeans, method = "wss",k.max = 8 )

Average Silhouette Method - this method does suggest the same number of clusters just as elbow has indicated.

fviz_nbclust(data_pi[2:4], kmeans, method = "silhouette", k.max = 8)

Gap Statistic Method - the same

gap_stat <- clusGap(data_pi[2:14], kmeans, nstart = 25, K.max = 8, B = 50)

fviz_gap_stat(gap_stat)
# data_piv %>%
#   mutate(Cluster = km_res$cluster) %>%
#   group_by(Cluster) %>%
#   summarise_all("mean")
library(pheatmap)
pheatmap(t(data_pi[-1]), cutree_cols = 3)


gabrielburcea/cvindia documentation built on Feb. 17, 2021, 3:31 a.m.