In gabrielburcea/cvindia: Analysis of symptoms

knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
library(cluster)
library(plotly)
library(fpc)
library(dendextend)
library(factoextra)
library(FactoMineR)
library(NbClust)
library(caret)
library(DMwR)
library(EnsCat)

data_cluster <-  read_csv("/Users/gabrielburcea/rprojects/data/your.md/cleaned_data_22092020.csv")

data_select <- data_cluster %>%
  dplyr::select(id, covid_tested, age, gender, number_morbidities, chills, cough, diarrhoea, fatigue, headache, loss_smell_taste, muscle_ache, 
                nasal_congestion, nausea_vomiting, shortness_breath, sore_throat, sputum, temperature, loss_appetite, chest_pain, itchy_eyes, joint_pain, 
                asthma, diabetes_type_one, diabetes_type_two, obesity, hypertension, heart_disease, lung_condition, liver_disease, kidney_disease) %>%
  dplyr::filter(covid_tested == "positive") %>%
  dplyr::filter(number_morbidities <= 1) %>%
  drop_na()

# data_select <- dplyr::mutate(data_select,
#                             n_cmdt =  ifelse(number_morbidities %in% 0:1, "1 morbidity",
#                                       ifelse(number_morbidities %in% 2 , "2 comorbidities",
#                                       ifelse(number_morbidities %in% 3:99, ">3 comorbidites", NA)))) %>%
#   dplyr::select(-number_morbidities)
# covid_tested_levels <- c("positive" = "showing symptoms")

# data_transf <- data_select %>% 
#   dplyr::mutate(covid_tested = forcats::fct_recode(covid_tested, !!!covid_tested_levels))

# data_piv <- data_select %>%
#   pivot_longer(cols = 23:31, 
#                names_to = "comorbidities",
#                values_to = "bolean") #%>%
# 
# bolean_levels <- c("healthy" = "No")

# data_piv <- data_piv %>%
#   dplyr::mutate(bolean = forcats::fct_recode(bolean, !!!bolean_levels)) %>%
#   mutate(
#     b = as.character(bolean), 
#     unhealthy = b != "healthy", 
#     comorbidities = replace(b, unhealthy, comorbidities[unhealthy]), 
#     b = NULL, unhealthy = NULL
#   )

# data_piv <- data_piv %>%
#   distinct()
#data_piv_id <- dplyr::mutate(data_piv, respondents_id = rownames(data_piv))


# data_select <- data_piv %>%
#   dplyr::select(id, comorbidities, chills, cough, diarrhoea, fatigue, headache, loss_smell_taste, muscle_ache, 
#                 nasal_congestion, nausea_vomiting, shortness_breath, sore_throat, sputum, temperature, loss_appetite, chest_pain, itchy_eyes, joint_pain) %>%
#   drop_na()



#   
# data_select <- data_select %>%
#   dplyr::mutate(age_band = case_when(age == 0 | age <= 19 ~ '0-19',
#                                      age == 20 | age <= 20 ~ '20-29',
#                                      age == 30 | age <= 39 ~ '30-39',
#                                      age == 40|  age <= 49 ~ '40-49',
#                                      age == 50|  age <= 59 ~ '50-59',
#                                      age == 60 | age <= 69 ~ '60-69',
#                                      age == 70 | age <= 79 ~ '70-79',
#                                      age == 80 | age <= 89 ~ '80-89',
#                                      age >= 90  ~ '90+')) %>%
#   dplyr::select(-age)


#data_rec <- ifelse(data_piv_id[,3:19] == "Yes", 1,0)
#data_select <- cbind(data_piv_id[,1:2], data_rec)
data_select$gender <- as.factor(data_select$gender)
#data_select$age_band <- as.factor(data_select$age_band)
# data_select$comorbidities <- as.factor(data_select$comorbidities)
# data_select$n_cmdt <- as.factor(data_select$n_cmdt)
data_select$chills <- as.factor(data_select$chills)
data_select$cough <- as.factor(data_select$cough)
data_select$diarrhoea <- as.factor(data_select$diarrhoea)
data_select$fatigue <- as.factor(data_select$fatigue)
data_select$headache <- as.factor(data_select$headache)
data_select$loss_smell_taste <- as.factor(data_select$loss_smell_taste)
data_select$muscle_ache <- as.factor(data_select$muscle_ache)
data_select$ nasal_congestion <- as.factor(data_select$ nasal_congestion)
data_select$nausea_vomiting <- as.factor(data_select$nausea_vomiting)
data_select$shortness_breath <- as.factor(data_select$shortness_breath)
data_select$sore_throat <- as.factor(data_select$sore_throat)
data_select$sputum <- as.factor(data_select$sputum)
data_select$temperature <- as.factor(data_select$temperature)
data_select$loss_appetite <- as.factor(data_select$loss_appetite)
data_select$chest_pain <- as.factor(data_select$chest_pain)
data_select$itchy_eyes <- as.factor(data_select$itchy_eyes)
data_select$joint_pain <- as.factor(data_select$joint_pain)


data_select <- as.data.frame(data_select)


#data_test <- head(data_select, 5000)


#rownames(data_test) <- data_test$respondents_id

rownames(data_select) <- data_select$respondents_id

my.seed <- set.seed(22)

gower_distance <- cluster::daisy(data_select[,6:22], metric = "gower")

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

#Hierarchicla clustering using Complete Linkage
hc_complete_gower <- hclust(gower_distance, method = "complete")

#Plot the obtained dendogram 
plot(hc_complete_gower, cex = 0.6, hang = -31)

# Compute with agnes 
hc_agnes <- agnes(data_select[,6:22], method = "ward")

# Agglomerative coeffiecient 
hc_agnes$ac

# Agglomerative coeffiecient 
#hc_agnes$ac

#hc_agnes_2 <- agnes(data_test[,3:22], method = "ward")

pltree(hc_agnes, cex = 0.6, hang = -1, main = "Dendrogram of agnes")

# compute divisive hierarchical clustering 
hc_diana <- diana(data_select[,6:22])

# Divisive coefficient; amount of clustering structure found 
hc_diana$dc


#plot dendogram 

pltree(hc_diana, cex = 0.6, hang = -1, main = "Dendogram of diana")

# Ward's method 

hc_ward_method <- hclust(gower_distance, method = "ward.D2")

# Cut tree into 4 groups 

sub_grp <- cutree(hc_agnes, k = 4)

#Number of countries in each cluster
table(sub_grp)


plot(hc_agnes)

rect.hclust(hc_ward_method , k = 4, border = 2:5)

cstats.table <- function(dist, tree, k) {
  clust.assess <-
    c(
      "cluster.number",
      "n",
      "within.cluster.ss",
      "average.within",
      "average.between",
      "wb.ratio",
      "dunn2",
      "avg.silwidth"
    )

  clust.size <- c("cluster.size")

  stats.names <- c()

  row.clust <- c()

  output.stats <- matrix(ncol = k, nrow = length(clust.assess))

  cluster.sizes <- matrix(ncol = k, nrow = k)
  for (i in c(1:k)) {
    row.clust[i] <- paste("Cluster-", i, " size")
  }
  for (i in c(2:k)) {
    stats.names[i] <- paste("Test", i - 1)

    for (j in seq_along(clust.assess)) {
      output.stats[j, i] <-
        unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.assess])[j]

    }

    for (d in 1:k) {
      cluster.sizes[d, i] <-
        unlist(cluster.stats(d = dist, clustering = cutree(tree, k = i))[clust.size])[d]
      dim(cluster.sizes[d, i]) <- c(length(cluster.sizes[i]), 1)
      cluster.sizes[d, i]

    }
  }

  output.stats.df <- data.frame(output.stats)
  cluster.sizes <- data.frame(cluster.sizes)
  cluster.sizes[is.na(cluster.sizes)] <- 0
  rows.all <- c(clust.assess, row.clust)
  # rownames(output.stats.df) <- clust.assess
  output <- rbind(output.stats.df, cluster.sizes)[, -1]
  colnames(output) <- stats.names[2:k]
  rownames(output) <- rows.all
  is.num <- sapply(output, is.numeric)
  output[is.num] <- lapply(output[is.num], round, 2)
  output
}

stats_divisive <- cstats.table(gower_distance, hc_diana, 7)

stats_divisive

knitr::kable(stats_divisive)

stats_agglomerative <- cstats.table(gower_distance,  hc_agnes, 7)

knitr::kable(stats_agglomerative)

elb_div <-
  ggplot(data = data.frame(t(
    cstats.table(gower_distance, hc_diana, 15)
  )),
  aes(x = cluster.number, y = within.cluster.ss)) +
  geom_point() +
  geom_line() +
  ggtitle("Divisive clustering") +
  labs(x = "Number of clusters", y = "Within clusters sum of squares(SS)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme_minimal()

elb_div

sil_div <-
  ggplot(data = data.frame(t(
    cstats.table(gower_distance, hc_diana , 15)
  )),
  aes(x = cluster.number, y = avg.silwidth)) +
  geom_point() +
  geom_line() +
  ggtitle("Divisive clustering") +
  labs(x = "Number of clusters", y = "Average silhouette width") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme_minimal()

sil_div

# fviz_dend(hc_agnes, k = 3, 
#           cex = 0.5, 
#           k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"), 
#           color_labels_by_k = TRUE, 
#           rect = TRUE)

hc_diana_tree <- as.data.frame(cutree(hc_diana, k=4)) 

data_clustered_1<- cbind(hc_diana_tree, data_select)

write.csv(data_clustered_1, file = "/Users/gabrielburcea/rprojects/data/addded_clusters_1.csv")

table <- data_clustered_1 %>%
  pivot_longer(cols = 24:32, 
               names_to = "comorbidities", 
               values_to = "bolean_yes_no") %>%
  mutate(comorbidities_2 = case_when(bolean_yes_no == "No" ~ "healthy", 
                              TRUE ~ comorbidities))



table_2 <- table %>%
  group_by(id) %>% 
  mutate(morbidity_healthy = case_when(bolean_yes_no == 'Yes' ~ comorbidities, (!any(bolean_yes_no == 'Yes')) & row_number()==1 ~ 'healthy')) %>%
  select(-comorbidities, -comorbidities_2, bolean_yes_no) %>%
  drop_na()


final_results_symptoms_all <- table_2 %>%
  dplyr::rename(cluster = "cutree(hc_diana, k = 4)") %>%
  dplyr::group_by(cluster, morbidity_healthy) %>%
  tally() %>%
  dplyr::mutate(Percentage = n/sum(n) * 100)

write.csv(table_2, file = "/Users/gabrielburcea/rprojects/data/table_clusters_added.csv", row.names = FALSE)
write.csv(final_results_symptoms_all, file = "/Users/gabrielburcea/rprojects/data/final_results_symptoms_all.csv", row.names = FALSE)



final_results_symptoms <- table %>%
  dplyr::rename(cluster = "cutree(hc_agnes, k = 3)") %>%
  dplyr::group_by(cluster, symptoms) %>%
  tally() %>%
  dplyr::mutate(Percentage = n/sum(n) *100) %>% 
  dplyr::select(cluster, symptoms, Percentage) %>%
  pivot_wider(names_from = cluster, values_from = "Percentage")

write.csv(final_results_symptoms, file = "/Users/gabrielburcea/rprojects/data/final_results_symptoms.csv", row.names = FALSE)