o2geosocial.R
In o2geosocial: Reconstruction of Transmission Chains from Surveillance Data

## ----import_library, warning = FALSE, message = FALSE-------------------------
library(o2geosocial)
## We used the data.table syntax throughout this example
library(data.table)
data("toy_outbreak_short")
print(toy_outbreak_short$cases, nrows = 10)
# Extract dataset
dt_cases <- toy_outbreak_short[["cases"]]
dt_cases <- dt_cases[order(Date), ]

## ----plot_ref, warning = FALSE, fig.width = 6.5, fig.height = 4, fig.cap="Figure 2: Cluster size distribution of the simulated dataset"----
# Reference cluster size distribution
hist(table(dt_cases$cluster), breaks = 0:max(table(dt_cases$cluster)), 
     ylab = "Number of clusters", xlab = "Cluster size", main = "", las=1)

## ----distrib------------------------------------------------------------------
# Distribution of the latent period
f_dens <- dgamma(x = 1:100, scale = 0.43, shape = 26.83)
# Distribution of the generation time
w_dens <- dnorm(x = 1:100, mean = 11.7, sd = 2.0)

## ----age, message = FALSE, warning = FALSE------------------------------------
# From the list toy_outbreak_short  
a_dens <- toy_outbreak_short$age_contact
# Alternatively, from POLYMOD:
polymod_matrix <-
  t(socialmixr::contact_matrix(socialmixr::polymod, 
                               countries="United Kingdom",
                               age.limits=seq(0, 70, by=5),
                               missing.contact.age="remove",
                               estimated.contact.age="mean", 
                               missing.participant.age="remove")$matrix)
polymod_matrix <-data.table::as.data.table(polymod_matrix)
# Compute the proportion of connection to each age group
a_dens <- t(t(polymod_matrix)/colSums(polymod_matrix))

## ----distance_pop, warning = FALSE--------------------------------------------
# Extract all regions in the territory
dt_regions <- toy_outbreak_short[["dt_regions"]]
# Extract the population vector
pop_vect <- dt_regions$population
# Create the matrices of coordinates for each region (one "from"; one "to")
mat_dist_from <- matrix(c(rep(dt_regions$long, nrow(dt_regions)),
                          rep(dt_regions$lat, nrow(dt_regions))), ncol = 2)
mat_dist_to <- matrix(c(rep(dt_regions$long, each = nrow(dt_regions)), 
                        rep(dt_regions$lat, each = nrow(dt_regions))),
                      ncol = 2)
# Compute all the distances between the two matrices
all_dist <- geosphere::distGeo(mat_dist_from, mat_dist_to)
# Compile into a distance matrix
dist_mat <- matrix(all_dist/1000, nrow = nrow(dt_regions))
# Rename the matrix columns and rows, and the population vector
names(pop_vect) <- rownames(dist_mat) <- colnames(dist_mat) <- 
  dt_regions$region

## ----first_model, message = FALSE, warning = FALSE----------------------------
# Default moves, likelihoods and priors
moves <- custom_moves()
likelihoods <- custom_likelihoods()
priors <- custom_priors()
# Data and config, model 1
data1 <- outbreaker_data(dates = dt_cases$Date, #Onset dates
                         age_group = dt_cases$age_group, #Age group
                         region = dt_cases$Cens_tract, #Location
                         genotype = dt_cases$Genotype, #Genotype
                         w_dens = w_dens, #Serial interval
                         f_dens = f_dens, #Latent period
                         a_dens = a_dens, #Age stratified contact matrix
                         population = pop_vect, #Population 
                         distance = dist_mat #Distance matrix
)
config1 <- create_config(data = data1, 
                         n_iter = 5000, #Iteration number: main run
                         n_iter_import = 1500, #Iteration number: short run
                         burnin = 500, #burnin period: first run
                         outlier_relative = T, #Absolute / relative threshold 
                         outlier_threshold = 0.9 #Value of the threshold
)
# Run model 1
out1 <- outbreaker(data = data1, config = config1, moves = moves, 
                   priors = priors, likelihoods = likelihoods)
# Set data and config for model 2
data2 <- outbreaker_data(dates = dt_cases$Date, 
                         age_group = dt_cases$age_group,
                         region = dt_cases$Cens_tract,
                         genotype = dt_cases$Genotype, w_dens = w_dens, 
                         f_dens = f_dens, a_dens = a_dens,
                         population = pop_vect, distance = dist_mat,
                         import = dt_cases$import #Import status of the cases
)
config2 <- create_config(data = data2, 
                         find_import = FALSE, # No inference of import status
                         n_iter = 5000, 
                         sample_every = 50, # 1 in 50 iterations is kept
                         burnin = 500)
# Run model 2
out2 <- outbreaker(data = data2, config = config2, moves = moves, 
                   priors = priors, likelihoods = likelihoods)


## ----params, warning = FALSE--------------------------------------------------
burnin <- config1$burnin
# Summary parameters a and b
#Model 1
print(summary(out1, burnin = burnin)$a)
print(summary(out1, burnin = burnin)$b)
# Model 2
print(summary(out2, burnin = burnin)$a)
print(summary(out2, burnin = burnin)$b)

## ----plot_first, warning = FALSE, fig.width = 6.5, fig.height = 4, fig.cap= "Figure 3: Comparison of inferred cluster size distribution with the reference data"----
# We create groups of cluster size: initialise the breaks for each group
group_cluster <- c(1, 2, 3, 5, 10, 15, 40, 100) - 1
# Reference data: h$counts
h <- hist(table(dt_cases$cluster), breaks = group_cluster, plot = FALSE)

# Grouped cluster size distribution in each run
clust_infer1 <- summary(out1, burnin = burnin, 
                        group_cluster = group_cluster)$cluster
clust_infer2 <- summary(out2, group_cluster = group_cluster, 
                        burnin = burnin)$cluster
# Merge inferred and reference cluster size distributions into one matrix
clust_size_matrix <- rbind(clust_infer1["Median",], clust_infer2["Median",],
                           h$counts)
# Plot  
b <- barplot(clust_size_matrix, names.arg = colnames(clust_infer1), las=1,
             ylab = "Number of clusters", xlab = "Cluster size", main = "", 
             beside = T, ylim = c(0, max(c(clust_infer1, clust_infer2))))
# Add the 50% CI
arrows(b[1,], clust_infer1["1st Qu.",], b[1,], clust_infer1["3rd Qu.",], 
       angle = 90, code = 3, length = 0.1)
arrows(b[2,], clust_infer2["1st Qu.",], b[2,], clust_infer2["3rd Qu.",], 
       angle = 90, code = 3, length = 0.1)
# Add legend
legend("topright", fill = grey.colors(3), bty = "n",
       legend = c("Inferred import status", 
                  "Known import status", "Simulated dataset"))

## ----index_infer, warning = FALSE---------------------------------------------
#' Title: Compute the proportion of iterations in the outbreaker() output 
#` where the inferred index matches the actual index in dt_cases
#'
#' @param dt_cases: reference dataset
#' @param out: Matrix output of outbreaker()
#' @param burnin: Numeric, length of the burnin phase
#'
#' @return Numeric vector showing the proportion of iterations pointing to
#' the correct index case
index_infer <- function(dt_cases, out, burnin){
  out_index <- out[out$step > burnin, grep("alpha", colnames(out))]
  ID_index <- matrix(dt_cases[unlist(out_index), ID], ncol = nrow(dt_cases))
  match_infer_data <- t(ID_index) == dt_cases$infector_ID
  match_infer_data[is.na(t(ID_index)) & is.na(dt_cases$infector_ID)] <- TRUE
  prop_correct <- rowSums(match_infer_data, na.rm = T)/ncol(match_infer_data)
  
  return(prop_correct)
}
# Same as index_infer, except it returns the proportion of inferred indexes
# who are in the same reference cluster as the case
index_clust <- function(dt_cases, out, burnin){
  out_index <- out[out$step > burnin, grep("alpha", colnames(out))]
  clust_index <- matrix(dt_cases[unlist(out_index), cluster], 
                        ncol = nrow(dt_cases))
  match_infer_data <- t(clust_index) == dt_cases$cluster
  match_infer_data <- match_infer_data[!is.na(dt_cases$infector_ID),]
  
  
  prop_correct <- rowSums(match_infer_data, na.rm = T)/ncol(match_infer_data)
  
  return(prop_correct)
}
# Run index_infer for each model
index_infer1 <- index_infer(dt_cases = dt_cases, out = out1, burnin = burnin)
index_infer2 <- index_infer(dt_cases = dt_cases, out = out2, burnin = burnin)
# Run index_clust for each model
index_clust1 <- index_clust(dt_cases = dt_cases, out = out1, burnin = burnin)
index_clust2 <- index_clust(dt_cases = dt_cases, out = out2, burnin = burnin)

## ----plot index_infer, warning = FALSE, fig.width = 7, fig.height = 3, fig.cap = "Figure 4: Panel A: Proportion of iterations with the correct index for each case; Panel B: Proportion of iterations where the index is from the correct cluster"----
# Plot the sorted proportion in each model
oldpar <- par(no.readonly =TRUE)
par(bty = "n", mfrow = c(1, 2), mar = c(5,4,2,0), oma = c(0, 0, 0, 0))
# Panel A
plot(sort(index_infer1), type = "l", ylab = "Proportion", xlab = "Case", 
     main =  "A", las=1, col = grey.colors(3)[1], lwd = 3)
lines(sort(index_infer2), col = grey.colors(3)[2], lwd = 3)

# Panel B
plot(sort(index_clust1), type = "l", xlab = "Case", ylab = "", 
     main =  "B", las=1, col = grey.colors(3)[1], lwd = 3)
lines(sort(index_clust2), col = grey.colors(3)[2], lwd = 3)
legend("bottomright", col = grey.colors(3)[1:2], lwd = 3, bty = "n",
       legend = c("Inferred import status", 
                  "Known import status"))

## ----import_sf_file, message = FALSE, warning = FALSE, fig.width = 6.5, fig.height = 2.5----
library(ggplot2)
# Read the shapefile and create one map for each model
map1 <- tigris::tracts(state = "Ohio", class = "sf", progress_bar = FALSE)
map1$INTPTLON <- as.numeric(map1$INTPTLON)
map1$INTPTLAT <- as.numeric(map1$INTPTLAT)
map2 <- map1
map1$model <- "Model 1"
map2$model <- "Model 2"


## ----n_import_reg_per_case, warning = FALSE-----------------------------------
# Add the proportion of iteration in model 1 where each case is an import
dt_cases[, prop_infer1 := summary(out1, burnin = burnin)$tree$import]
# Add the proportion of iteration in model 2 where each case is an import
dt_cases[, prop_infer2 := summary(out2, burnin = burnin)$tree$import]

## ----n_import_reg_per_reg, warning = FALSE------------------------------------
# Number of import per region in model 1
prop_reg1 <- dt_cases[, .(prop_per_reg = sum(prop_infer1)), 
                      by = Cens_tract]$prop_per_reg
# Number of import per region in model 2
prop_reg2 <- dt_cases[, .(prop_per_reg = sum(prop_infer2)), 
                      by = Cens_tract]$prop_per_reg
names(prop_reg1) <- names(prop_reg2) <- unique(dt_cases$Cens_tract)

# Add the number of imports in each region to the maps
map1$prop_reg <- prop_reg1[as.character(map1$GEOID)]
map2$prop_reg <- prop_reg2[as.character(map2$GEOID)]

## ----create_map1, warning = FALSE, fig.width = 6.5, fig.height = 2.5, fig.cap = "Figure 5: Average number of import cases per census tract"----
# Merge maps
maps <- rbind(map1, map2)

limits_long <- c(-84, -82)
limits_lat <- c(40, 41.5)
maps <- maps[maps$INTPTLON > limits_long[1] & maps$INTPTLON < limits_long[2],]
maps <- maps[maps$INTPTLAT > limits_lat[1] & maps$INTPTLAT < limits_lat[2],]

# Plot: number of imports per region, two panels
ggplot(maps) +  geom_sf(aes(fill = prop_reg)) + facet_grid(~model)  +     
  scale_fill_gradient2(na.value = "lightgrey", midpoint = 0.8, 
                       breaks = c(0, 0.5, 1, 1.5), name = "Nb imports",
                       low = "white", mid = "lightblue", high = "darkblue") + 
  coord_sf(xlim = c(-83.8, -82.2), ylim = c(40.2, 41.3)) +
  theme_classic(base_size = 9)

## ----n_sec_region, warning = FALSE--------------------------------------------
#' Title: Compute the number of secondary cases per case in each region
#'
#' @param dt_cases: reference dataset
#' @param out: Matrix output of outbreaker()
#' @param burnin: Numeric, length of the burnin phase
#'
#' @return A numeric matrix: the first column is the census tract ID, the
#' other columns show the number of secondary cases per case. Each row 
#' corresponds to a different iteration.
n_sec_per_reg <- function(dt_cases, out, burnin){
  ## Number of secondary cases per case
  n_sec <- apply(out[out$step > burnin, grep("alpha", colnames(out))], 1, 
                 function(X){
                   X <- factor(X, 1:length(X))
                   return(table(X))})
  ## Aggregate by region
  tot_n_sec_reg <- aggregate(n_sec, list(dt_cases$Cens_tract), sum)
  ## Divide by the number of cases in each region
  tot_n_sec_reg <- cbind(tot_n_sec_reg[, 1], 
                         tot_n_sec_reg[, -1] / table(dt_cases$Cens_tract))
  return(tot_n_sec_reg)
}
  
n_sec_tot1 <- n_sec_per_reg(dt_cases = dt_cases, out = out1, burnin = burnin)
n_sec_tot2 <- n_sec_per_reg(dt_cases = dt_cases, out = out2, burnin = burnin)
n_sec1 <- apply(n_sec_tot1[,-1], 1, median)
n_sec2 <- apply(n_sec_tot2[,-1], 1, median)
names(n_sec1) <- names(n_sec2) <- unique(dt_cases$Cens_tract)

## ----add_variables_sec, warning = FALSE---------------------------------------
map1$n_sec <- as.numeric(n_sec1[as.character(map1$GEOID)])
map2$n_sec <- as.numeric(n_sec2[as.character(map2$GEOID)])

## ----create_maps2, warning = FALSE, fig.width = 6.5, fig.height = 2.5, fig.cap = "Figure 6: Median number of secondary transmission per case in each census tract"----
# Merge maps
maps <- rbind(map1, map2)

limits_long <- c(-84, -82)
limits_lat <- c(40, 41.5)
maps <- maps[maps$INTPTLON > limits_long[1] & maps$INTPTLON < limits_long[2],]
maps <- maps[maps$INTPTLAT > limits_lat[1] & maps$INTPTLAT < limits_lat[2],]

# Plot the geographical distribution of the number of secondary cases
ggplot(maps) +  geom_sf(aes(fill = n_sec)) + facet_grid(~model)  +     
  scale_fill_gradient2(na.value = "lightgrey", mid = "lightblue",
                       low = "white", midpoint = 1, high = "darkblue",
                       breaks = seq(0, 5, 0.5),name = "Sec cases") +
  coord_sf(xlim = c(-83.8, -82.2), ylim = c(40.2, 41.3)) +
  theme_classic(base_size = 9) 

## ----create_stouff_matrix, message = FALSE, warning = FALSE-------------------
# For every column of the distance matrix, use the cumulative sum of the 
# population vector ordered by the distance. Remove the values where 
# the distance between the regions is above gamma
dist_mat_stouffer <- apply(dist_mat, 2, function(X){
  pop_X <- cumsum(pop_vect[order(X)])
  omega_X <- pop_X[names(X)]
  # omega_X is set to -1 if the distance between two regions is above gamma
  omega_X[X > config1$gamma] <- -1
  return(omega_X)
})
# The new value of gamma is equal to the maximum of dist_mat_stouffer + 1
gamma <- max(dist_mat_stouffer) + 1
# The values previously set to -1 are now set to the new value of gamma
dist_mat_stouffer[dist_mat_stouffer == -1] <- max(dist_mat_stouffer) * 2

## ----moves_priors, message = FALSE, warning = FALSE---------------------------
moves3 <- custom_moves(a = cpp_stouffer_move_a)

f_null <- function(param) {
  return(0.0)
}
priors3 <- custom_priors(b = f_null)

## ----out_stouffer, message = FALSE, warning = FALSE---------------------------
# Set data and config lists
data3 <- outbreaker_data(dates = dt_cases$Date, #Onset dates
                         age_group = dt_cases$age_group, #Age group
                         region = dt_cases$Cens_tract, #Location
                         genotype = dt_cases$Genotype, #Genotype
                         w_dens = w_dens, #Serial interval
                         f_dens = f_dens, #Latent period
                         a_dens = a_dens, #Age stratified contact matrix
                         population = pop_vect, #Population 
                         distance = dist_mat_stouffer #Distance matrix
)
config3 <- create_config(data = data3, 
                         gamma = gamma,
                         move_b = FALSE, # b is not estimated
                         init_b = 0, 
                         spatial_method = "power-law",
                         n_iter = 5000, #Iteration number: main run
                         n_iter_import = 1500, #Iteration number: short run
                         burnin = 500, #burnin period: first run
                         outlier_relative = T, #Absolute / relative threshold
                         outlier_threshold = 0.9 #Value of the threshold
)
# Run the model using the Stouffer's rank method
out_stouffer <- outbreaker(data = data3, config = config3, moves = moves3, 
                           priors = priors3, likelihoods = likelihoods)

## ----stouffer_cluster, message = FALSE, warning = FALSE, fig.width = 6.5, fig.height = 4, fig.cap = "Figure 7: Comparison of inferred cluster size distribution with the reference data"----
# Grouped cluster size distribution in the Stouffer's rank model
clust_infer_stouf <- summary(out_stouffer, burnin = burnin, 
                             group_cluster = group_cluster)$cluster
# Merge inferred and reference cluster size distributions
clust_size_matrix <- rbind(clust_infer_stouf["Median",], h$counts) 
# Plot the two distributions
b <- barplot(clust_size_matrix, names.arg = colnames(clust_infer_stouf), 
             beside = T)
# Add CIs
arrows(b[1,], clust_infer_stouf["1st Qu.",], b[1,], 
       clust_infer_stouf["3rd Qu.",], angle = 90, code = 3, length = 0.1)
legend("topright", fill = grey.colors(2), bty = "n",
       legend = c("Inferred import status, Stouffer's rank method", 
                  "Simulated dataset"))

## ----match_stouffer, message = FALSE, warning = FALSE, fig.width = 7, fig.height = 3, fig.cap = "Figure 8: Panel A: Proportion of iterations with the correct index for each case; Panel B: Proportion of iterations where the index is from the correct cluster"----
index_infer_stouf <- index_infer(dt_cases = dt_cases, out = out_stouffer, 
                                 burnin = config1$burnin)
index_clust_stouf <- index_clust(dt_cases = dt_cases, out = out_stouffer, 
                                 burnin = config1$burnin)
# Plot the sorted proportion in each model
par(bty = "n", mfrow = c(1, 2), mar = c(5,4,2,0), oma = c(0, 0, 0, 0))
# Panel A
plot(sort(index_infer_stouf), type = "l", xlab = "Case", ylab = "", 
     main =  "A", las=1, col = grey.colors(2)[1], lwd = 3)
# Panel B
plot(sort(index_clust_stouf), type = "l", ylab = "Proportion", xlab = "Case", 
     main =  "B", las=1, col = grey.colors(2)[1], lwd = 3)
par(oldpar)
Any scripts or data that you put into this service are public.
o2geosocial documentation built on June 22, 2024, 10:18 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
o2geosocial
Reconstruction of Transmission Chains from Surveillance Data

inst/doc/o2geosocial.R
In o2geosocial: Reconstruction of Transmission Chains from Surveillance Data

Try the o2geosocial package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

o2geosocial Reconstruction of Transmission Chains from Surveillance Data

inst/doc/o2geosocial.R In o2geosocial: Reconstruction of Transmission Chains from Surveillance Data

Try the o2geosocial package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

o2geosocial
Reconstruction of Transmission Chains from Surveillance Data

inst/doc/o2geosocial.R
In o2geosocial: Reconstruction of Transmission Chains from Surveillance Data