library(tidyverse) library(cluster) library(plotly) library(fpc) library(dendextend) library(factoextra) library(FactoMineR) library(NbClust) library(caret) library(DMwR)
# Load and prepare the survey data ----
# Reads the raw export, keeps the respondent id, test status, country and the
# symptom columns, and drops rows whose `covid_tested` value is "none"
# (rows with a missing `covid_tested` are dropped by the same filter).
data_cluster <- read_csv("/Users/gabrielburcea/rprojects/data/data_no_sev.csv")

# Symptom columns, listed once so that selection, factor conversion and
# reshaping all refer to the same set by name instead of by position.
symptom_cols <- c(
  "chills", "cough", "diarrhoea", "fatigue", "headache",
  "loss_smell_taste", "muscle_ache", "nasal_congestion", "nausea_vomiting",
  "shortness_breath", "sore_throat", "sputum", "temperature"
)

data_select <- data_cluster %>%
  dplyr::select(id, covid_tested, country, dplyr::all_of(symptom_cols)) %>%
  dplyr::filter(covid_tested != "none")

# Recode keys for forcats::fct_recode(), written as new level = old level.
covid_tested_levels <- c("positive" = "showing symptoms")
level_key_temperature <- c(
  "Yes" = "37.5-38",
  "Yes" = "38.1-39",
  "Yes" = "38.2-39",
  "Yes" = "39.1-41"
)
levels_country <- c(
  "USA" = "United States of America",
  "United Kingdom" = "Great Britain"
)

# Apply the recodes, then make sure every analysis column is a factor.
data_transf <- data_select %>%
  dplyr::mutate(
    covid_tested = forcats::fct_recode(covid_tested, !!!covid_tested_levels),
    temperature = forcats::fct_recode(temperature, !!!level_key_temperature),
    country = forcats::fct_recode(country, !!!levels_country)
  ) %>%
  dplyr::mutate(
    dplyr::across(
      dplyr::all_of(c("covid_tested", "country", symptom_cols)),
      as.factor
    )
  )

# Long format: one row per respondent and symptom. Rows whose value is "No"
# (or missing) are removed, then the remaining rows are counted per symptom
# and country. The result is ungrouped so later steps do not inherit a
# leftover grouping.
gather_divided <- data_transf %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(symptom_cols),
    names_to = "Symptom",
    values_to = "Severity"
  ) %>%
  dplyr::filter(Severity != "No") %>%
  dplyr::group_by(Symptom, country) %>%
  dplyr::summarise(Count = dplyr::n()) %>%
  dplyr::ungroup()
# Wide table: one row per country, one column of counts per symptom.
# Every column except `country` is converted to numeric, and
# country/symptom combinations with no rows (NA after the reshape) are set
# to 0. Columns are addressed by name, so the step does not depend on how
# many symptom columns are present or on their order.
test_data <- gather_divided %>%
  dplyr::ungroup() %>%
  tidyr::pivot_wider(names_from = Symptom, values_from = Count) %>%
  dplyr::mutate(
    dplyr::across(-country, function(x) tidyr::replace_na(as.numeric(x), 0))
  )

# Centre and scale each symptom column; label the matrix rows by country so
# the cluster plots can identify observations.
# NOTE(review): a column with zero variance would come out of scale() as NaN
# -- confirm that every symptom varies across countries.
df_scaled <- scale(dplyr::select(test_data, -country))
rownames(df_scaled) <- test_data$country
Partitioning clustering
I. K-means clustering: 1. each cluster represented by the center (mean of the data points) 2. sensitive to outliers
II. K-medoids clustering /PAM (Partitioning around Medoids) 1. each cluster represented by one of the objects in the cluster 2. less sensitive to outliers
III. CLARA algorithm (Clustering Large Applications): an extension of PAM for large datasets, which runs PAM on samples of the data rather than on the full dataset
Optimal number of clusters
# Diagnostics for choosing the number of clusters.
# kmeans() starts from random centres, so the seed is reset before each call
# to make every plot reproducible on its own. `nstart = 25` is forwarded to
# kmeans() in all three calls so the elbow and silhouette curves use the
# same number of random starts as the gap statistic.

# Elbow method: total within-cluster sum of squares against k.
set.seed(22)
fviz_nbclust(df_scaled, kmeans, method = "wss", nstart = 25) +
  labs(subtitle = "Elbow method")

# Average silhouette width against k.
set.seed(22)
fviz_nbclust(df_scaled, kmeans, method = "silhouette", nstart = 25) +
  labs(subtitle = "Silhouette method")

# Gap statistic with 500 bootstrap samples.
set.seed(22)
fviz_nbclust(
  df_scaled,
  kmeans,
  nstart = 25,
  method = "gap_stat",
  nboot = 500
) +
  labs(subtitle = "Gap statistic method")
K-means clustering
# k-means with 7 clusters and 25 random starts.
# Rows containing missing values are removed once, and the same matrix is
# used for both fitting and plotting. Plotting against the unfiltered matrix
# would misalign the cluster assignments whenever a row had been dropped.
df_scaled_complete <- na.omit(df_scaled)

km_res <- kmeans(df_scaled_complete, centers = 7, iter.max = 10, nstart = 25)

# Cluster plot with convex hulls and segments from each point to its cluster
# centre; point labels are hidden (labelsize = 0).
fviz_cluster(
  km_res,
  data = df_scaled_complete,
  ellipse.type = "convex",
  star.plot = TRUE,
  labelsize = 0,
  repel = FALSE,
  ggtheme = theme_minimal()
)
K-medoids clustering /PAM (Partitioning around Medoids)
# Partitioning Around Medoids on the scaled counts: 7 clusters, Euclidean
# distance, no extra standardisation inside pam().
pam_res <- cluster::pam(
  x = df_scaled,
  k = 7,
  metric = "euclidean",
  stand = FALSE
)

# Print the medoids (the observations chosen to represent each cluster).
pam_res[["medoids"]]

# Plot the clusters on the first two dimensions with convex hulls; cluster
# centres, star segments and point labels are all switched off.
factoextra::fviz_cluster(
  pam_res,
  axes = c(1, 2),
  ellipse.type = "convex",
  show.clust.cent = FALSE,
  star.plot = FALSE,
  labelsize = 0,
  repel = FALSE,
  ggtheme = ggplot2::theme_classic()
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.