In gabrielburcea/cvindia: Analysis of symptoms

library(tidyverse)
library(cluster)
library(plotly)
library(fpc)
library(dendextend)
library(factorextra)

# Load the symptom extract from a hard-coded local path.
data_cluster <- read_csv("/Users/gabrielburcea/rprojects/data/data_no_sev.csv")

# Keep the identifier, test status, country and the thirteen symptom columns,
# then drop every row whose covid_tested value is "none".
data_select <- dplyr::filter(
  dplyr::select(
    data_cluster,
    id, covid_tested, country,
    chills, cough, diarrhoea, fatigue, headache, loss_smell_taste,
    muscle_ache, nasal_congestion, nausea_vomiting, shortness_breath,
    sore_throat, sputum, temperature
  ),
  covid_tested != "none"
)


# Recode keys in forcats::fct_recode() form: each name is the new level and
# each value is the existing level it replaces.
covid_tested_levels <- c(positive = "showing symptoms")

# Every listed temperature band is collapsed into the single level "Yes".
level_key_temperature <- setNames(
  c("37.5-38", "38.1-39", "38.2-39", "39.1-41"),
  rep("Yes", 4)
)

levels_country <- c(
  USA = "United States of America",
  `United Kingdom` = "Great Britain"
)

# Apply the three recodings to the selected data.
data_transf <- dplyr::mutate(
  data_select,
  covid_tested = forcats::fct_recode(covid_tested, !!!covid_tested_levels),
  temperature = forcats::fct_recode(temperature, !!!level_key_temperature),
  country = forcats::fct_recode(country, !!!levels_country)
)

# Store the test status, the country and every symptom column as a factor.
data_transf <- dplyr::mutate(
  data_transf,
  dplyr::across(
    c(
      covid_tested, country, chills, cough, diarrhoea, fatigue, headache,
      loss_smell_taste, muscle_ache, nasal_congestion, nausea_vomiting,
      shortness_breath, sore_throat, sputum, temperature
    ),
    as.factor
  )
)

# Count, per symptom and country, the respondents whose answer for that
# symptom is anything other than "No".
#
# The symptom columns are selected by name (chills through temperature) rather
# than by position, so the reshape does not silently pick up the wrong columns
# if the selection upstream changes. dplyr::count() returns an ungrouped
# tibble, so no grouping by Symptom leaks into later steps.
gather_divided <- data_transf %>%
  tidyr::pivot_longer(
    cols = chills:temperature,
    names_to = "Symptom",
    values_to = "Severity"
  ) %>%
  dplyr::filter(Severity != "No") %>%
  dplyr::count(Symptom, country, name = "Count")


# Spread the counts to one row per country and one column per symptom.
test_data <- tidyr::pivot_wider(
  gather_divided,
  names_from = Symptom,
  values_from = Count
)

#test_data <- test_data %>% mutate_if(is.numeric, funs(replace_na(., 0)))

# Drop every country that is missing a count for at least one symptom.
test_data <- na.omit(test_data)

# Columns 2 to 14 hold the symptom counts (column 1 is the country); store
# them as doubles before scaling.
test_data[2:14] <- lapply(test_data[2:14], as.numeric)

# Centre and scale the symptom counts and label the rows by country.
df_scaled <- scale(test_data[2:14])
rownames(df_scaled) <- test_data$country

Compute the Gower distance to obtain the dissimilarity matrix.

# Fix the RNG seed. set.seed() returns NULL invisibly, so the seed value is
# stored first; my.seed then holds 22 and can be reused with set.seed(my.seed).
my.seed <- 22
set.seed(my.seed)

# Pairwise Gower dissimilarities between the rows (countries) of df_scaled.
gower_distance <- cluster::daisy(df_scaled, metric = "gower")

gower_distance

class(gower_distance)

Agglomerative clustering vs. divisive clustering: agglomerative clustering is better at discovering small clusters.

# The main input for the code below is the dissimilarity (distance) matrix.
# Once the dissimilarity matrix has been calculated, the remaining steps are
# the same for all data types.
# I prefer to look at the dendrogram and find the most appealing one first -
# in this case, I was looking for a more balanced one - before continuing
# with the assessment.

divisive_clustering <- diana(as.matrix(gower_distance), diss = TRUE, keep.diss = TRUE)

# plot() on a diana object draws with base graphics and returns NULL, so there
# is no ggplot object for plotly::ggplotly() to convert; draw the plot directly.
plot(divisive_clustering, main = "Divisive")

Assessing

# Agglomerative tree built from the Gower dissimilarities with complete linkage.
agglomerative_clustering <- hclust(
  d = gower_distance,
  method = "complete"
)

# Dendrogram of the agglomerative tree.
plot(
  agglomerative_clustering,
  main = "Agglomerative, complete linkeges"
)

Assessing clusters

When working with categorical variables, I might end up with nonsensical clusters because the combinations of their values are limited - they are discrete, and so is the number of their combinations. However, I do not want to have a very small number of clusters either - they are likely to be too general.
I am interested in distinctive groups of data points, such that the distance between points within a cluster (compactness) is minimal, while the distance between groups (separation) is as large as possible.
The distance between points is a measure of their dissimilarity, derived from the dissimilarity matrix.
The assessment of clustering is built around the evaluation of compactness and separation.

I will use two approaches:

elbow method: start with it when the compactness of clusters, or similarities within group are most important for my analysis
silhouette method: as a measure of data consistency, the silhouette plot displays a measure of how close each point in one cluster is to points in the neighboring clusters

Different number of clusters will correspond to the most compact / most distinctively separated clusters

# Tabulate cluster-validity statistics for every cut of a tree into
# 2, 3, ..., k clusters.
#
# Arguments:
#   dist  dissimilarity object, passed to cluster.stats() as `d`.
#   tree  hierarchical clustering object accepted by cutree().
#   k     largest number of clusters to evaluate; a single number >= 2.
#
# Returns a data frame with one column per cut: "Test 1" is the 2-cluster cut,
# "Test 2" the 3-cluster cut, and so on up to the k-cluster cut. The first
# rows are the statistics named in `clust.assess`; the remaining k rows
# ("Cluster- i  size") hold the size of cluster i, with 0 where a cut has
# fewer than i clusters. Numeric values are rounded to two decimals.
#
# cluster.stats() is not defined in this function; the script attaches fpc,
# which is presumably where it comes from.
cstats.table <- function(dist, tree, k) {
  # The table starts at the 2-cluster cut, so smaller k has nothing to report.
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 2)

  clust.assess <-
    c(
      "cluster.number",
      "n",
      "within.cluster.ss",
      "average.within",
      "average.between",
      "wb.ratio",
      "dunn2",
      "avg.silwidth"
    )
  n.stats <- length(clust.assess)

  row.clust <- paste("Cluster-", seq_len(k), " size")
  stats.names <- paste("Test", seq_len(k - 1))

  # Column i holds the results for the i-cluster cut; column 1 is never filled
  # and is dropped below.
  output.stats <- matrix(NA_real_, nrow = n.stats, ncol = k)
  cluster.sizes <- matrix(NA_real_, nrow = k, ncol = k)

  for (i in 2:k) {
    # cluster.stats() is the expensive step, so run it once per cut and reuse
    # the result for both the statistics and the cluster sizes.
    stats.i <- cluster.stats(d = dist, clustering = cutree(tree, k = i))
    output.stats[, i] <- unlist(stats.i[clust.assess])[seq_len(n.stats)]
    # A cut into i clusters has only i sizes; the remaining rows stay NA here
    # and become 0 below.
    cluster.sizes[, i] <- unlist(stats.i["cluster.size"])[seq_len(k)]
  }

  output.stats.df <- data.frame(output.stats)
  cluster.sizes <- data.frame(cluster.sizes)
  cluster.sizes[is.na(cluster.sizes)] <- 0

  # drop = FALSE keeps a data frame even when k = 2 leaves a single column.
  output <- rbind(output.stats.df, cluster.sizes)[, -1, drop = FALSE]
  colnames(output) <- stats.names
  rownames(output) <- c(clust.assess, row.clust)

  is.num <- vapply(output, is.numeric, logical(1))
  output[is.num] <- lapply(output[is.num], round, 2)
  output
}

# Validity statistics for the divisive tree, cut into 2 to 7 clusters.
stats_divisive <- cstats.table(
  dist = gower_distance,
  tree = divisive_clustering,
  k = 7
)

stats_divisive

average.within - the average distance among observations within clusters - is shrinking, and so is the within-cluster SS. The average silhouette width is also decreasing.

# Validity statistics for the agglomerative tree, cut into 2 to 7 clusters.
stats_agglomerative <- cstats.table(
  dist = gower_distance,
  tree = agglomerative_clustering,
  k = 7
)

stats_agglomerative

I am using the elbow and silhouette methods to identify the best number of clusters and to better picture the trend.

Divisive clustering. I have produced the elbow graph. It shows how the within-cluster sum of squares - a measure of the closeness of observations: the lower it is, the closer the observations within the clusters are - changes for different numbers of clusters. Look for a distinctive bend in the elbow, where splitting clusters further gives only a minor decrease in the SS. Probably 11 clusters?

# Elbow plot for the divisive tree: within-cluster sum of squares against the
# number of clusters, for cuts into 2 to 17 clusters.
elb_div <-
  ggplot(
    data = data.frame(t(
      cstats.table(gower_distance, divisive_clustering, 17)
    )),
    aes(x = cluster.number, y = within.cluster.ss)
  ) +
  geom_point() +
  geom_line() +
  ggtitle("Divisive clustering") +
  labs(x = "Number of clusters", y = "Within clusters sum of squares(SS)") +
  # theme_minimal() is a complete theme and replaces earlier theme() settings,
  # so the title centering has to be added after it to take effect.
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

elb_div

Agglomerative "elbow" - this tells me I should have 5 clusters. It isn't as smooth as the previous one.

# Elbow plot for the agglomerative tree: within-cluster sum of squares against
# the number of clusters, for cuts into 2 to 17 clusters.
elb_agglomerative <-
  ggplot(
    data = data.frame(t(
      cstats.table(gower_distance, agglomerative_clustering, 17)
    )),
    aes(x = cluster.number, y = within.cluster.ss)
  ) +
  geom_point() +
  geom_line() +
  ggtitle("Aglomerative clustering") +
  labs(x = "Number of clusters", y = "Within clusters sum of squares(SS)") +
  # theme_minimal() is a complete theme and replaces earlier theme() settings,
  # so the title centering has to be added after it to take effect.
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

elb_agglomerative

Silhouette - the rule is to choose the number of clusters that maximizes the silhouette coefficient, because clusters should be distinctive (far apart) enough to be considered separate. The silhouette coefficient ranges between -1 and 1, with 1 indicating good consistency within clusters and -1 indicating poor consistency. This plot is a bit confusing - I will go for 4, but I am not sure.

# Silhouette plot for the divisive tree: average silhouette width against the
# number of clusters, for cuts into 2 to 17 clusters.
sil_div <-
  ggplot(
    data = data.frame(t(
      cstats.table(gower_distance, divisive_clustering, 17)
    )),
    aes(x = cluster.number, y = avg.silwidth)
  ) +
  geom_point() +
  geom_line() +
  ggtitle("Divisive clustering") +
  labs(x = "Number of clusters", y = "Average silhouette width") +
  # theme_minimal() is a complete theme and replaces earlier theme() settings,
  # so the title centering has to be added after it to take effect.
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

sil_div

# Silhouette plot for the agglomerative tree: average silhouette width against
# the number of clusters, for cuts into 2 to 17 clusters.
sil_agglomerative <-
  ggplot(
    data = data.frame(t(
      cstats.table(gower_distance, agglomerative_clustering, 17)
    )),
    aes(x = cluster.number, y = avg.silwidth)
  ) +
  geom_point() +
  geom_line() +
  ggtitle("Aglomerative clustering") +
  labs(x = "Number of clusters", y = "Average silhouette width") +
  # theme_minimal() is a complete theme and replaces earlier theme() settings,
  # so the title centering has to be added after it to take effect.
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

sil_agglomerative