In sempwn/cr0eso: Estimation of R0 For Multiple Outbreaks

knitr::opts_chunk$set(dev="png", collapse = T, comment = "#>")
options(tibble.print_min = 4L, tibble.print_max = 4L)


if (!requireNamespace("ggstance", quietly = TRUE)) {
  stop("Package \"pkg\" needed for this vignette to build. Please install it.",
    call. = FALSE)
}
if (!requireNamespace("directlabels", quietly = TRUE)) {
  stop("Package \"pkg\" needed for this vignette to build. Please install it.",
    call. = FALSE)
}
if (!requireNamespace("R0", quietly = TRUE)) {
  stop("Package \"pkg\" needed for this vignette to build. Please install it.",
    call. = FALSE)
}
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  stop("Package \"pkg\" needed for this vignette to build. Please install it.",
    call. = FALSE)
}


# load libraries
library(tidyverse)
library(kableExtra)
library(here)
library(ggstance)
library(scales)
library(gridExtra)
library(directlabels)
library(R0)
library("cr0eso")

# Set folder to store outputs
outdir <- here()

set.seed(836361)

# To get the colour scheme used in the paper, uncomment the below:
#devtools::install_github('Mikata-Project/ggthemr')
#library(ggthemr)
#ggthemr("fresh")
#palette(swatch())
# As of the time of writing, there is an open issue for this library. To use this colour scheme currently,  add "+ scale_colour_ggthemr_d()" to all ggplot calls. See https://github.com/Mikata-Project/ggthemr/issues/44.

BC Long Term Health Care outbreak data

The data is formatted as a list with 100 imputations of the missing symptom onset times. For each imputation, the list contains data on the number of cases, time series of cases (by symptom onset), the facility capacity and the outbreak reported date for each facility outbreak. It also contains the time series of cases as a matrix, but we will not use that here.

# View the final imputation of missing data:
str(BC_LTHC_outbreaks_100Imputs[[100]])

Capacity by outbreak size plot

plot_data <- tibble(location=BC_LTHC_outbreaks_100Imputs[[100]]$Location,  capacity=BC_LTHC_outbreaks_100Imputs[[100]]$capacity, outbreak_size=BC_LTHC_outbreaks_100Imputs[[100]]$num_cases, reported_date = BC_LTHC_outbreaks_100Imputs[[100]]$reported_date)

lab_dates <- pretty(plot_data$reported_date)

p <- plot_data %>%
  ggplot(aes(x=capacity,y=outbreak_size,color=as.numeric(reported_date))) +
  geom_point(size=1.8) +
  geom_abline(slope=1,intercept=0,linetype="dashed",alpha=0.5) +
  coord_cartesian(expand=FALSE,xlim = c(0,305), ylim=c(0,95)) +
  scale_color_continuous(breaks = as.numeric(lab_dates), 
                       labels = lab_dates,
                       type = "viridis") +
  theme_classic() +
  labs(y="Total outbreak size",x="Maximum capacity",
       color="Reported date")
show(p)

Cumulative cases plot

# Relabel locations according to manuscript labelling
fac_names <- c("Q", "I", "J", "L", "O", "A", "D", "N", "B", "K", "R", "H", "M", "C", "G", "F", "P", "E")

df <- data.frame(BC_LTHC_outbreaks_100Imputs[[100]]$case_matrix)
colnames(df) <- fac_names
df <- cbind(Day=1:nrow(df),df)

r0_uc_slope <- (3-1)/5
r0_lc_slope <- (1.1-1)/5

p2 <- df %>% 
  pivot_longer(-1) %>% filter(value > 0) %>% group_by(name) %>% mutate(csum = cumsum(value), name=factor(name, levels=fac_names)) %>%
  ggplot(aes(x = Day, y = csum, group = name, colour = name)) + 
  geom_line(show.legend = FALSE) + theme_classic() + xlab("Days since first symptom onset") +  ylab("Cumulative cases") + scale_y_log10() +
  geom_abline(slope=r0_lc_slope,intercept=0,linetype="dashed", col="black") + 
  geom_abline(slope=r0_uc_slope,intercept=0,linetype="dashed", col="black") +
  geom_dl(aes(label = name), method = list(dl.combine("last.points"), cex = 0.8)) + 
  annotate(
    "text",
    x = 5,
    y = exp(5 * r0_uc_slope + 1),
    angle = atan(r0_uc_slope * 10) * 180/pi,
    label = "R0 = 3"
  ) +
  annotate(
    "text",
    x = 10,
    y = exp(10 * r0_lc_slope - 0.01),
    angle = atan(r0_lc_slope * 10) * 180/pi,
    label = "R0 = 1.1"
  ) +
  coord_cartesian(ylim=c(1,100), xlim=c(0, 70), expand=FALSE)

show(p2)

Estimate R0 in each outbreak, using EG and ML methods

For each of the 100 imputations, we estimate R0 independently in each facility using exponential growth (EG) and maximum likelihood (ML) methods, with the 'R0' library.

# This chunk may take several minutes to run. It will also output some warning messages - that's ok.  
nIts <- length(BC_LTHC_outbreaks_100Imputs)
res <- vector(mode = "list", length = nIts)

for (its in 1:nIts){
mGT<-generation.time("gamma", c(5.2, 1.73))
# as in Ganyani et al, 2020, Singapore mean/sd
estr0<-rep(list(1),length(BC_LTHC_outbreaks_100Imputs[[1]]$Location) )
for (i in 1:length(BC_LTHC_outbreaks_100Imputs[[1]]$Location) ){
      end <- as.numeric(length(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]]) + 1 - 
        which.max(rev(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]])))
      # If the first day is the single maximum incidence day, as occurs in a few imputations of a few of the smaller outbreaks in this dataset, we set 'end' to the end of the outbreak. There is no perfect choice here, judgement recommended. 
      if (end==1){end=as.numeric(length(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]]))}
      t <- try(estimate.R(epid = BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]],
                          GT = mGT, time.step = 1, 
                          pop.size = BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
                          begin = 1, end = end,
                          methods=c("EG", "ML")))
      if("try-error" %in% class(t)){estr0[[i]] <- estimate.R(epid = 
                              BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]], 
                              GT = mGT, time.step = 1, pop.size =
                              BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
                              begin = 1, end = end,methods=c("EG"))}else{
                              estr0[[i]] <- estimate.R(epid = 
                              BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]], 
                              GT = mGT, time.step = 1, begin = 1, end = end, pop.size =
                              BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
                              methods=c("EG", "ML"))}
}
names(estr0) <- paste("Facility", BC_LTHC_outbreaks_100Imputs[[its]]$Location, sep="_")
res[[its]]<-estr0
}

nIts <- length(BC_LTHC_outbreaks_100Imputs)
res <- vector(mode = "list", length = nIts)

for (its in 1:nIts){
mGT<-generation.time("gamma", c(5.2, 1.73))
estr0<-rep(list(1),length(BC_LTHC_outbreaks_100Imputs[[1]]$Location) )
for (i in 1:length(BC_LTHC_outbreaks_100Imputs[[1]]$Location) ){
      end <- as.numeric(length(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]]) + 1 - 
        which.max(rev(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]])))
      if (end==1){end=as.numeric(length(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]]))}
      t <- try(estimate.R(epid = BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]],
                          GT = mGT, time.step = 1, 
                          pop.size = BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
                          begin = 1, end = end,
                          methods=c("EG", "ML")))
      if("try-error" %in% class(t)){estr0[[i]] <- estimate.R(epid = 
                              BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]], 
                              GT = mGT, time.step = 1, pop.size =
                              BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
                              begin = 1, end = end,methods=c("EG"))}else{
                              estr0[[i]] <- estimate.R(epid = 
                              BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]], 
                              GT = mGT, time.step = 1, begin = 1, end = end, pop.size =
                              BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
                              methods=c("EG", "ML"))}
}
names(estr0) <- paste("Facility", BC_LTHC_outbreaks_100Imputs[[its]]$Location, sep="_")
res[[its]]<-estr0
}

Output files and summary

We collect summaries of the R0 estimates from a single imputation (the final one), and also calculate confidence intervals across all 100 imputations as a sensitivity analysis.

# First, save the complete nIts=100 sets of estimates to file (the sensitivity analysis) 
# We want the mean of the 100 means, and the CI of those 100 means, per outbreak
er_df <- data.frame(Location=fac_names,
                    EG_mean=c(rep(NA, length(names(estr0)))),
                    EG_mean_CI_lower=c(rep(NA, length(names(estr0)))),
                    EG_mean_CI_upper=c(rep(NA, length(names(estr0)))),
                    ML_mean=c(rep(NA, length(names(estr0)))),
                    ML_mean_CI_lower=c(rep(NA, length(names(estr0)))),
                    ML_mean_CI_upper=c(rep(NA, length(names(estr0)))) )


# To calculate the confidence intervals on the means
normConfInt <- function(x, alpha = 0.05){
  mean(x) + qt(1 - alpha / 2, length(x) - 1) * sd(x) * c(-1, 1)}

for (i in 1:length(estr0)){
  a<-NULL; b<-NULL;
  for (j in 1:nIts){
  a <- c(a, res[[j]][[i]]$estimates$EG[1])
  b <- c(b, res[[j]][[i]]$estimates$ML[1])
  }
  er_df[i, 2] <- mean(unlist(a))
  er_df[i, 3] <- normConfInt(unlist(a))[1]
  er_df[i, 4] <- normConfInt(unlist(a))[2]
  er_df[i, 5] <- mean(unlist(b))
  er_df[i, 6] <- normConfInt(unlist(b))[1]
  er_df[i, 7] <- normConfInt(unlist(b))[2]
}
er_df <- arrange(er_df, Location)
write.table(er_df, file = paste0(outdir,"/R0estimation_results_multiple.txt"))

# show table of results
er_df %>%
  kableExtra::kable() %>%
  kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))

# Secondly, the final imputation as the primary result
er_df <- data.frame(Location=fac_names,
                 EG=c(rep(NA, length(names(estr0)))),
                 EG_CI_lower=c(rep(NA, length(names(estr0)))),
                 EG_CI_upper=c(rep(NA, length(names(estr0)))),
                 ML=c(rep(NA, length(names(estr0)))),
                 ML_CI_lower=c(rep(NA, length(names(estr0)))),
                 ML_CI_upper=c(rep(NA, length(names(estr0)))) )

for (i in 1:length(estr0)){
  er_df[i, 2] <- estr0[[i]]$estimates$EG[1]
  er_df[i, 3] <- estr0[[i]]$estimates$EG$conf.int[1]
  er_df[i, 4] <- estr0[[i]]$estimates$EG$conf.int[2]
  er_df[i, 5] <- ifelse(is.null(estr0[[i]]$estimates$ML[1]),NA,estr0[[i]]$estimates$ML[1])
  er_df[i, 6] <- ifelse(is.null(estr0[[i]]$estimates$ML[1]),NA,estr0[[i]]$estimates$ML$conf.int[1])
  er_df[i, 7] <- ifelse(is.null(estr0[[i]]$estimates$ML[1]),NA,estr0[[i]]$estimates$ML$conf.int[2])
}
er_df <- arrange(er_df, Location)
write.table(er_df, file=paste0(outdir,"/R0estimation_results_single.txt") )

# show table of results
er_df %>%
  kableExtra::kable() %>%
  kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))

Quick results visualisation

Some simple plots of the results.

hist(er_df$EG, breaks=20, main="EG R0 estimates", xlab="R0", col="steelblue4")
hist(er_df$ML, breaks=20, main="ML R0 estimates", xlab="R0", col="steelblue4")

Attack rates

We calculate the attack rate in each outbreak as the number of cases divided by the facility capacity. We incorporate uncertainty in the attack rate by varying the denominator from 85% to 115% of the known capacity. R0 is also estimated from the attack rate, according to the relation -log(1-A_r)/A_r = R0.

AR_table <- data.frame("Facility" = fac_names,  
                  "No. cases" = BC_LTHC_outbreaks_100Imputs[[100]]$num_cases,
                  "Capacity" = BC_LTHC_outbreaks_100Imputs[[100]]$capacity,
                  "A_r" = 
                    100*BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity,
                  "A_r (85% cap.)" =
                    100*BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/(BC_LTHC_outbreaks_100Imputs[[100]]$capacity*0.85),
                  "A_r (115% cap.)" =
                    100*BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/(BC_LTHC_outbreaks_100Imputs[[100]]$capacity*1.15),
                  "R0 (A_r)" =
                    -log(1-(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity))/(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity),
                  "R0 (A_r, 85% cap.)" =
                    -log(1-(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*0.85))/(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*0.85),
                  "R0 (A_r, 115% cap.)" =
                    -log(1-(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*1.15))/(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*1.15) )
AR_table <- arrange(AR_table, Facility)

# show table of results
AR_table %>%
  kableExtra::kable() %>%
  kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))

Compare attack rate to R0 estimates from different methods

Next, compare EG and ML estimates with results from the Bayesian hierarchical model, against the attack rate.

# The Bayesian hierarchical model results are manually set here, so remember to update these if you are adapting this script for another analysis. 
BHM <- c(0.56, 0.79, 0.86, 0.93, 1.2, 1.47, 1.52, 1.57, 2.1, 2.9, 3.2, 3.27, 3.32, 4.55, 5.73, 6.09, 8.17, 9.17)
# The following aren't used until the next chunk:
BHM_multi <- c(2.6, 2.97, 2.62, 3.28, 3.73, 3.68, 3.62, 5.39, 2.87, 2.71, 3.82, 2.37, 4.73, 6.65, 6.7, 6.91, 9.95, 9.13) # multi-level zeta point estimates
BHM_upCI <- c(1.17, 1.79, 1.54, 1.89, 2.21, 2.5, 2.33, 2.69, 3.23, 3.98, 4.35, 4.51, 4.58, 6.98, 7.76, 8.04, 11.38, 11.97) # BHM upper credible interval
BHM_multi_upCI <- c(4.96, 5.99, 4.87, 6.81, 7.46, 6.9, 6.54, 9.52, 5.14, 4.03, 5.55, 3.43, 7.23, 10.32, 8.9, 9.01, 13.21, 12.35) # multi-level zeta upper credible interval
BHM_lowCI <- c(0.16, 0.22, 0.35, 0.3, 0.46, 0.7, 0.91, 0.7, 1.29, 2.13, 2.41, 2.42, 2.43, 3.05, 4.37, 4.7, 5.29, 7.16) # BHM lower credible interval
BHM_multi_lowCI <- c(1.26, 1.38, 1.35, 1.59, 1.7, 1.8, 1.85, 2.45, 1.61, 1.83, 2.67, 1.6, 3.08, 4.1, 5.06, 5.31, 7.44, 7.06) # multi-level zeta lower credible interval

scatterdata <- data.frame(c(AR_table$Facility,AR_table$Facility,AR_table$Facility), c(AR_table$A_r,AR_table$A_r,AR_table$A_r), c(er_df$EG,er_df$ML,BHM), c(rep("Exponential growth",length(AR_table$A_r)), rep("Maximum likelihood",length(AR_table$A_r)), rep("Bayesian hierarchical",length(AR_table$A_r))) )
names(scatterdata) <- c("Facility", "ar", "r0", "Method")

p <- ggplot(scatterdata, aes(x=ar, y=r0, color=Method)) +
  geom_point(size=3, alpha=0.8) +
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)+
  #theme_sleek() + 
  xlab("Attack Rate (%)") + ylab(expression(R['0,k'])) +  theme(legend.position="right", legend.title = element_text(size=10,                                           face="bold"), legend.text=element_text(size=10)) +
  #scale_colour_ggthemr_d() +
  annotate(geom="label", x=75, y=9.7, label=round(cor(AR_table$A_r, BHM),2), colour=palette()[2]) +
  annotate(geom="label", x=75, y=1.7, label=round(cor(AR_table$A_r, er_df$EG),2), colour=palette()[3])+
  annotate(geom="label", x=75, y=3.5, label=round(cor(AR_table$A_r[!is.na(er_df$ML)], er_df$ML[!is.na(er_df$ML)]),2), colour=palette()[4]) 
# The geom_smooth/geom_point warning messages can be safely ignored - these are caused by the missing ML values
p

Compare R0 estimates from three main methods (BHM, EG, ML)

This figure compares point estimates and confidence intervals/credible intervals from the BHM (single- and multi- level zeta), EG and ML methods.

``` {r, fig.width = 7, fig.height = 5}

First combine the results into a data frame

df <- data.frame(c(rep(er_df$Location, 4)), (c(er_df$EG, er_df$ML , BHM, BHM_multi)), (c(er_df$EG_CI_upper, er_df$ML_CI_upper, BHM_upCI, BHM_multi_upCI)), (c(er_df$EG_CI_lower, er_df$ML_CI_lower, BHM_lowCI, BHM_multi_lowCI)), c(rep("Exponential growth", length(er_df$Location)), rep("Maximum likelihood", length(er_df$Location)), rep("Bayesian hierarchical", length(er_df$Location)), rep("Bayesian hierarchical, multi-level", length(er_df$Location)))) names(df) <- c("Location", "R0", "lowerCI", "upperCI", "Method")

p<-ggplot(df, aes(y=Location, x=R0, group=Method, color=Method)) + xlab(expression(R['0,k'])) + ylab("Location") + geom_errorbarh(mapping=aes(xmin=lowerCI, xmax=upperCI), size = 0.5, alpha=0.99, position=position_dodgev(height=0.8), height=0) + geom_point(position=position_dodgev(height=0.8)) + #theme_sleek() + theme(legend.position="bottom") + guides(colour = guide_legend(nrow = 2)) + scale_x_continuous(trans="log10", limits=c(0.000001,1000), labels=c("0.001","0.01","0.1", "1.0","10","100", "1000"), breaks=c(0.001,0.01, 0.1, 1, 10, 100, 1000)) p

## Correlations between R0 estimates and additional LTHC facility covariate data

In this section we investigate any relationships between the facility R0 estimates and additional data obtained on the LTHC facilities (age of facility, room type etc.). The additional data is primarily contained in *italic*BC_OSABC_facilitydata.rda*italic*, and some factors we have loaded already (e.g. outbreak reported date).

First, we take a look at a few of these individually:

```r

# Take a look at the additional factor data:
head(BC_OSABC_facilitydata)
# Re-order to alphabetical labelling
BC_OSABC_facilitydata$Facility = fac_names
BC_OSABC_facilitydata <- arrange(BC_OSABC_facilitydata, Facility)

# Was the initial (by symptom onset) case staff?
# (2 outbreaks with multiple earliest symptom onsets, both resident & staff: 9 and 17.
# These are coded as unknown.)
staff_cat<-as.factor(BC_OSABC_facilitydata$`Identity of initial COVID-19 case`)

# Remove the 'unknown' categorisations to make this factor dichotomous, and calculate the correlation with R0
staff_cat_di <- staff_cat[staff_cat!="Unknown"]
staff_cat_di <- unclass(staff_cat_di)

# Initial case
print(paste0("BHM and identity of initial case correlation = ", round(cor(staff_cat_di, BHM[staff_cat!="Unknown"]),3)))
print(paste0("BHM multi-level and identity of initial case correlation = ", round(cor(staff_cat_di, BHM_multi[staff_cat!="Unknown"]),3)))
print(paste0("BHM and identity of initial case correlation = ", round(cor(staff_cat_di, er_df$EG[staff_cat!="Unknown"]),3)))
print(paste0("BHM and identity of initial case correlation = ", round(cor(unclass(staff_cat[staff_cat!="Unknown" & !is.na(er_df$ML)]), er_df$ML[staff_cat!="Unknown" & !is.na(er_df$ML)]),3))) #only non-NA ML ests


# Facility capacity
fac_cap <- BC_LTHC_outbreaks_100Imputs[[100]]$capacity[order(fac_names)]
print(paste0("BHM and facility capacity correlation = ", round(cor(BHM, fac_cap),3)))
print(paste0("BHM multi-level and facility capacity correlation = ", round(cor(BHM_multi, fac_cap),3)))
print(paste0("EG and facility capacity correlation = ", round(cor(er_df$EG, fac_cap),3)))
print(paste0("ML and facility capacity correlation = ", round(cor(er_df$ML[!is.na(er_df$ML)], fac_cap[!is.na(er_df$ML)]),3)))


# outbreak reported date
rep_date <- BC_LTHC_outbreaks_100Imputs[[100]]$reported_date[order(fac_names)]
rep_date <- as.numeric(as.POSIXct(rep_date, format="%Y-%m-%d %H:%M:%S", tz="GMT"))
print(paste0("BHM and facility capacity correlation = ", round(cor(BHM, rep_date),3)))
print(paste0("BHM multi-level and facility capacity correlation = ", round(cor(BHM_multi, rep_date),3)))
print(paste0("EG and facility capacity correlation = ", round(cor(er_df$EG, rep_date),3)))
print(paste0("ML and facility capacity correlation = ", round(cor(er_df$ML[!is.na(er_df$ML)], rep_date[!is.na(er_df$ML)]),3)))

Finally, we create figures and tables for all factors available.

 # Make a tibble with the loaded factor data, but also outbreak reported date and facility capacity.
cor_data <- as_tibble(BC_OSABC_facilitydata)
cor_data <- cbind(cor_data, fac_cap, rep_date)
names(cor_data)[(length(cor_data)-1):length(cor_data)] <- c("Facility capacity", "COVID-19 outbreak reported date")

# Create correlation table, for numeric covariates
rho_table <- cor_data %>%
  dplyr::select("Number of disease outbreaks 2018/19","Number of lodged complaints 2018/19",
         "Residents dependent for daily activities (%)","Average resident stay (days)",
         "Average resident age (years)","Direct care hours /resident/day",
         "Facility capacity", "COVID-19 outbreak reported date", "Year facility opened")
rho_table <-
  purrr::map2_df(rho_table,colnames(rho_table),function(x,var_name){
    cor.test(x,BHM) %>%
      broom::tidy() %>%
      mutate(Property = var_name)
  }) %>%
  dplyr::select(Property,everything())
# BHM correlations
rho_table %>%
  kableExtra::kable() %>%
  kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))


# And correlations for dichotomous categorical factors (point-biserial)
print(paste0("BHM and identity of initial case (is staff) correlation = ", cor(unclass(as.factor(cor_data$"Identity of initial COVID-19 case"[cor_data$"Identity of initial COVID-19 case"!="Unknown"])), BHM[cor_data$"Identity of initial COVID-19 case"!="Unknown"])))
# Make this one negative since 'not accredited' is associated with a higher number: 
print(paste0("BHM and (positive) accreditation status correlation = ", -cor(unclass(as.factor(cor_data$"Accreditation status"[!is.na(cor_data$"Accreditation status")])), BHM[!is.na(cor_data$"Accreditation status")])))



## Plot correlations with R0

# separate numeric and character factors into 2 plots
cols_to_plot_num <- c(3, 6, 7, 8, 9, 10, 11, 14, 15)
cols_to_plot_cat <- c(2, 4, 5, 12)
cor_data <- cbind(cor_data, BHM)

cordatalong_num <- pivot_longer(cor_data, cols = all_of(cols_to_plot_num), names_to = "Factor", values_to = "Value")
cordatalong_cat <- pivot_longer(cor_data, cols = all_of(cols_to_plot_cat), names_to = "Factor", values_to = "Value")

# (modified a few aesthetics of figure manually for paper - removed second R_0,k label, rearranged some titles that were cut off)
p1 <- ggplot(data=cordatalong_num, aes(x=Value, y = BHM)) +
  geom_point(size=2, colour = "#E84646") + facet_wrap(~Factor,  scales = "free", ncol=4) + xlab("") + ylab(expression(R['0,k'])) +
  theme(text = element_text(size=12), axis.title.y = element_text(size = 15))

p2 <- ggplot(data=cordatalong_cat, aes(x=Value, y = BHM)) +
  geom_point(size=2, colour = "#E84646") + facet_wrap(~Factor,  scales = "free", ncol=4) + xlab("") + ylab(expression(R['0,k'])) +
  theme(text = element_text(size=12), axis.title.y = element_text(size = 15)) +
  theme(axis.text.x = element_text(angle = 45, hjust=1))
grid.arrange(grobs = list(p2, p1), nrow = 2, heights = c(1.1,3))

# If using paper colour scheme, reset the colour theme:
#ggthemr_reset()