In gabrielburcea/cvindia: Analysis of symptoms

library(tidyverse)
library(cluster)
library(plotly)
library(fpc)
library(dendextend)
library(factoextra)
library(FactoMineR)
library(NbClust)
library(caret)
library(DMwR)
library(ggraph)

data_cluster <- read_csv("/Users/gabrielburcea/rprojects/stats_data_whole/data_categ_no_sev.csv")

level_key_temperature <- c("Yes" = "37.5-38", 
                           "Yes" = "38.1-39", 
                           "Yes" =  "38.2-39",
                           "Yes" = "39.1-41")



data_select <- data_cluster %>%
  dplyr::select(ID, covid_tested, chills, cough, diarrhoea, fatigue, headache, loss_smell_taste, muscle_ache, 
                nasal_congestion, nausea_vomiting, shortness_breath, sore_throat, sputum, temperature, asthma, diabetes_type_one,
                diabetes_type_two, obesity, hypertension, heart_disease, lung_condition, liver_disease, kidney_disease) %>%
  dplyr::filter(covid_tested != "none") 

covid_tested_levels <- c("positive" = "showing symptoms")

data_transf <- data_select %>% 
  dplyr::mutate(covid_tested = forcats::fct_recode(covid_tested, !!!covid_tested_levels), 
                temperature = forcats::fct_recode(temperature, !!!level_key_temperature)) %>%
  dplyr::filter(covid_tested != "none")

data_transf$temperature <- as.character(data_transf$temperature)

data_transf$ID <- NULL
data_transf$covid_tested <- NULL

level_key_comorbidities <-
  c("kidney disease" = "kidney_disease",
    "lung condition" = "lung_condition",
    "diabetes type one" = "diabetes_type_one",
    "diabetes type two" = "diabetes_type_two",
    "liver disease" = "liver_disease",
    "heart disease" = "heart_disease")


data_pi<- data_transf %>%
  tidyr::replace_na(list(nausea_vomiting = 0)) %>%
  dplyr::rename('loss of smell and taste' = loss_smell_taste, 'muscle ache' = muscle_ache, 'nasal congestion'= nasal_congestion,
                'nausea and vomiting' = nausea_vomiting, 'shortness of breath' = shortness_breath, 'sore throat' = sore_throat, 
                "kidney disease" = kidney_disease, "diabetes type one" = diabetes_type_one, "diabetes type two" = diabetes_type_two,
                "liver disease" = liver_disease, "heart disease" = heart_disease)


# data_piv <- data_transf %>%
#   pivot_longer(cols = 14:22,
#                names_to = "Comorbidities",
#                values_to = "Bolean") %>%
#   dplyr::filter(Bolean == "Yes") %>%
#   data_piv <- data_piv %>%
#   pivot_wider(names_from = Comorbidities, values_from = Freq) %>%
#   pivot_longer(cols = 3:15,
#              names_to = "Symptoms",
#              values_to = "Yes_No")  %>%
#   dplyr::group_by(Comorbidities, Symptoms, Yes_No) %>%
#   dplyr::summarise(Count = n()) %>%
#   dplyr::mutate(Freq = Count / sum(Count)*100) %>%
#   dplyr::filter(Yes_No == "Yes")


# data_piv$Bolean <- NULL

# 

# data_piv$Comorbidities <- as.character(data_piv$Comorbidities)



# scale data

This is unregularised partial correlation network Threshold argument - removes the edges that are not significant. If I pass any threshold a lot of edges dissapear

library(qgraph)

cor_mat <- cor_auto(data_pi)

view(round(cor_mat,2))
graph_pcor <- qgraph(cor_mat, graph = "pcor", layout = "spring", threshold = "bonferroni", sampleSize = nrow(data_pi), alpha = 0.01) # t


# threshold Inadditiontoanumericvaluetoomitedgesthisargumentcanalsobeassignedastringto omit insignficant edges. Note that this REMOVES edges from the network (which influences centrality measures and the spring layout). Can be "sig" to compute significance without correction for multiple testing, "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr" or "none" which are used directly in the adjust argument in corr.p of the psych package (Revelle, 2014). In addition, this argument can be assigned "locfdr" in which edges are set to zero if the local FDR is below FDRcutoff. fdrtool from the fdrtool package (Klaus and Strimmer, 2014) is used to compute these measures, which is used inside FDRnetwork.

Estimating a partial correlation network using LASSO regularization and EBIC model selection can be done by setting graph = "glasso". The tuning argument sets the EBIC hyperparameter. Set between 0 (more connections but also more spurious connections) and 0.5 (more parsimony, but also missing more connections):

graph_lasso <- qgraph(cor_mat, graph = "glasso", layout = "spring", tuning = 0.25, 
                      sampleSize = nrow(data_pi))

Centrality analysis

centRes <- centrality(graph_lasso)

# Node strenght (degree):

centRes$OutDegree

# Closeness:

centRes$Closeness

# Betweeenness: 
centRes$Betweenness

centralityPlot(graph_lasso)

compare different networks

centralityPlot(GGM = list(unregularized = graph_pcor, regularized = graph_lasso))

to make edges in graphs comparable in qgraph, the cut, minimum and maximum arguments need to be set to the same values. details = TRUE

qgraph(cor_mat, graph = "glasso", layout = "spring", tuning = 0.25, 
       sampleSize = nrow(data_pi), minimum = 0, cut = 0.01, maximum = 1, details = TRUE, 
       esize = 20) # the more I increase the cut the less edges are  present - I have tried with 0.15 , 0.10

Comparable layouts

Layout <- averageLayout(graph_pcor,graph_lasso)
layout(t(1:2))
qgraph(cor_mat, graph = "pcor", layout = Layout,threshold = "bonferroni",
                     sampleSize = nrow(data_pi), minimum = 0, 
                      cut = 0.15, maximum = 1, details = TRUE,
                      esize = 20, title = "Partial correlations") # 

qgraph(cor_mat, graph = "glasso", layout = Layout, tuning = 0.25,
                     sampleSize = nrow(data_pi), minimum = 0,
                      cut = 0.15, maximum = 1, details = TRUE,
                      esize = 20, title = "LASSO regularization")

qgraph(cor_mat, graph = "glasso", layout = "spring", tuning = 0.25,
                    sampleSize = nrow(data_pi), legend.cex = 0.2, vsize = 5,
                    esize = 15, pastel = TRUE, posCol = "#003399",
                    negCol = "#FF9933", borders = FALSE, vTrans = 200,
                    details = TRUE)