library(tidyverse)
library(cluster)
library(plotly)
library(fpc)
library(dendextend)
library(factoextra)
library(FactoMineR)
library(NbClust)
library(caret)
library(DMwR)
# Read the input CSV into a tibble.
data_cluster <- readr::read_csv(
  "/Users/gabrielburcea/rprojects/data/data_no_sev.csv"
)

# Keep `id`, the test status, the country and the 13 symptom columns, then
# discard rows whose test status is "none". filter() only keeps rows where
# the condition is TRUE, so rows with a missing `covid_tested` are dropped too.
data_select <- dplyr::filter(
  dplyr::select(
    data_cluster,
    id, covid_tested, country,
    chills, cough, diarrhoea, fatigue, headache, loss_smell_taste,
    muscle_ache, nasal_congestion, nausea_vomiting, shortness_breath,
    sore_throat, sputum, temperature
  ),
  covid_tested != "none"
)


# Recode maps in the form expected by forcats::fct_recode():
# the element name is the new level, the element value is the old level.

# "showing symptoms" is relabelled as "positive".
covid_tested_levels <- c(positive = "showing symptoms")

# Every listed temperature band is collapsed into the single level "Yes".
level_key_temperature <- setNames(
  c("37.5-38", "38.1-39", "38.2-39", "39.1-41"),
  rep("Yes", 4)
)

# Harmonise alternative country spellings.
levels_country <- c(
  USA = "United States of America",
  `United Kingdom` = "Great Britain"
)

# Apply the three recodings; each named vector is spliced into
# fct_recode() with `!!!`.
data_transf <- dplyr::mutate(
  data_select,
  covid_tested = forcats::fct_recode(covid_tested, !!!covid_tested_levels),
  temperature = forcats::fct_recode(temperature, !!!level_key_temperature),
  country = forcats::fct_recode(country, !!!levels_country)
)

# Convert the test status, the country and every symptom column to a factor
# in a single step (`id` is left untouched). Columns that are already factors
# are returned unchanged by as.factor().
data_transf <- data_transf %>%
  dplyr::mutate(
    dplyr::across(
      c(
        covid_tested, country,
        chills, cough, diarrhoea, fatigue, headache, loss_smell_taste,
        muscle_ache, nasal_congestion, nausea_vomiting, shortness_breath,
        sore_throat, sputum, temperature
      ),
      as.factor
    )
  )

# Reshape the symptom columns to long format (one row per id x symptom),
# drop the rows whose value is "No" (rows with a missing value are dropped by
# filter() as well) and count the remaining rows per symptom and country.
#
# The symptom columns are selected by excluding the three identifier columns
# rather than by position, so the step does not depend on column order.
# count() returns an ungrouped tibble, so no grouping leaks into later steps.
gather_divided <- data_transf %>%
  tidyr::pivot_longer(
    cols = -c(id, covid_tested, country),
    names_to = "Symptom",
    values_to = "Severity"
  ) %>%
  dplyr::filter(Severity != "No") %>%
  dplyr::count(Symptom, country, name = "Count")


# Spread the per-symptom counts into one column per symptom (one row per
# country), store every count column as double and replace the gaps left by
# symptom/country combinations that never occur with 0.
#
# ungroup() makes the step independent of any grouping left on
# `gather_divided`. across() replaces the deprecated funs() helper and the
# per-column as.numeric() assignments, so the step no longer fails when a
# symptom column is absent from the data.
test_data <- gather_divided %>%
  dplyr::ungroup() %>%
  tidyr::pivot_wider(names_from = Symptom, values_from = Count) %>%
  dplyr::mutate(
    dplyr::across(-country, ~ tidyr::replace_na(as.numeric(.x), 0))
  )


# Centre and scale columns 2 to 14 of `test_data` (the symptom count columns,
# assuming `country` is the first column and all 13 symptoms are present).
# scale() returns a numeric matrix.
df_scaled <- test_data %>%
  dplyr::select(2:14) %>%
  scale()

# Label each row of the matrix with its country so the cluster plots and
# medoid tables are readable.
rownames(df_scaled) <- test_data[["country"]]

Partitioning clustering

I. K-means clustering: 1. each cluster represented by the center (mean of the data points) 2. sensitive to outliers

II. K-medoids clustering /PAM (Partitioning around Medoids) 1. each cluster represented by one of the objects in the cluster 2. less sensitive to outliers

III. CLARA algorithm (Clustering Large Applications): an extension of PAM for large datasets, which applies PAM to subsamples of the data

Optimal number of clusters

  1. Elbow method: location of the bend in the WSS (within-cluster sum of squares for each K) plot - minimizes intra-cluster variation
  2. Average silhouette method: maximum of the average silhouette curve (average silhouette of observations for each K)
  3. Gap statistic method: compares the total intra-cluster variation for different values of K with their expected values under a null reference distribution of the data
# Diagnostics for choosing the number of clusters.
#
# kmeans() starts from randomly chosen centres, so the RNG is seeded before
# the first plot; otherwise the elbow and silhouette curves can change from
# run to run.
set.seed(22)

# Elbow method: total within-cluster sum of squares for each k.
fviz_nbclust(df_scaled, kmeans, method = "wss") +
  labs(subtitle = "Elbow method")

# Average silhouette width for each k.
fviz_nbclust(df_scaled, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

# Re-seed immediately before the gap statistic so its bootstrap draws do not
# depend on how many random numbers the plots above consumed.
set.seed(22)

# Gap statistic with 500 bootstrap reference sets and 25 random starts per fit.
fviz_nbclust(df_scaled, kmeans, nstart = 25, method = "gap_stat", nboot = 500) +
  labs(subtitle = "Gap statistic method")

K-means clustering

# K-means with 7 clusters, 25 random starts and at most 10 iterations per
# start. Rows containing missing values are removed before fitting.
km_res <- kmeans(na.omit(df_scaled), centers = 7, iter.max = 10, nstart = 25)

# Plot the clusters as convex hulls, with segments from each centre to its
# points and no point labels. The plot must receive the same NA-free matrix
# the model was fitted on, otherwise the cluster assignments would not line
# up with the rows of `data`.
fviz_cluster(
  km_res,
  data = na.omit(df_scaled),
  ellipse.type = "convex",
  star.plot = TRUE,
  labelsize = 0,
  repel = FALSE,
  ggtheme = theme_minimal()
)

K-medoids clustering /PAM (Partitioning around Medoids)

# Partition the scaled matrix around 7 medoids using Euclidean distance;
# the input is already standardised, so pam() is told not to standardise again.
pam_res <- cluster::pam(
  x = df_scaled,
  k = 7,
  metric = "euclidean",
  stand = FALSE
)

# Print the medoid observations (one row per cluster).
pam_res[["medoids"]]

# Plot the PAM clusters on the first two dimensions as convex hulls, without
# cluster centres, star segments or point labels.
factoextra::fviz_cluster(
  pam_res,
  axes = c(1, 2),
  ellipse.type = "convex",
  show.clust.cent = FALSE,
  star.plot = FALSE,
  repel = FALSE,
  labelsize = 0,
  ggtheme = ggplot2::theme_classic()
)




gabrielburcea/cvindia documentation built on Feb. 17, 2021, 3:31 a.m.