Nothing
#' Simulation of cluster randomized trial with spillover
#'
#' \code{simulateCRT} generates simulated data for a cluster randomized trial (CRT) with geographic spillover between arms.
#'
#' @param trial an object of class \code{"CRTsp"} or a data frame containing locations in (x,y) coordinates, cluster
#' assignments (factor \code{cluster}), and arm assignments (factor \code{arm}). Each location may also be
#' assigned a \code{propensity} (see details).
#' @param effect numeric. The simulated effect size (defaults to 0)
#' @param outcome0 numeric. The anticipated value of the outcome in the absence of intervention
#' @param generateBaseline logical. If \code{TRUE} then baseline data and the \code{propensity} will be simulated
#' @param matchedPair logical. If \code{TRUE} then the function tries to carry out randomization
#' using pair-matching on the baseline data (see details)
#' @param scale measurement scale of the outcome. Options are: 'proportion' (the default); 'count'; 'continuous'.
#' @param baselineNumerator optional name of numerator variable for pre-existing baseline data
#' @param baselineDenominator optional name of denominator variable for pre-existing baseline data
#' @param denominator optional name of denominator variable for the outcome
#' @param kernels number of kernels used to generate a de novo \code{propensity}
#' @param ICC_inp numeric. Target intra cluster correlation, provided as input when baseline data are to be simulated
#' @param sigma_m numeric. standard deviation of the normal kernel measuring spatial smoothing leading to spillover
#' @param spillover_interval numeric. input spillover interval
#' @param tol numeric. tolerance of output ICC
#' @returns A list of class \code{"CRTsp"} containing the following components:
#' \tabular{lll}{
#' \code{geom_full}\tab list: \tab summary statistics describing the site
#' cluster assignments, and randomization \cr
#' \code{design}\tab list: \tab values of input parameters to the design \cr
#' \code{trial} \tab data frame: \tab rows correspond to geolocated points, as follows:\cr
#' \tab \code{x} \tab numeric vector: x-coordinates of locations \cr
#' \tab \code{y} \tab numeric vector: y-coordinates of locations \cr
#' \tab\code{cluster} \tab factor: assignments to cluster of each location \cr
#' \tab\code{arm} \tab factor: assignments to \code{control} or \code{intervention} for each location \cr
#' \tab\code{nearestDiscord} \tab numeric vector: signed Euclidean distance to nearest discordant location (km) \cr
#' \tab\code{propensity} \tab numeric vector: propensity for each location \cr
#' \tab\code{base_denom} \tab numeric vector: denominator for baseline \cr
#' \tab\code{base_num} \tab numeric vector: numerator for baseline \cr
#' \tab\code{denom} \tab numeric vector: denominator for the outcome \cr
#' \tab\code{num} \tab numeric vector: numerator for the outcome \cr
#' \tab\code{...} \tab other objects included in the input \code{"CRTsp"} object
#' or \code{data.frame}\cr
#' }
#' @details Synthetic data are generated by sampling around the values of
#' variable \code{propensity}, which is a numerical vector
#' (taking positive values) of length equal to the number of locations.
#' There are three ways in which \code{propensity} can arise:
#' \enumerate{
#' \item \code{propensity} can be provided as part of the input \code{trial} object.
#' \item Baseline numerators and denominators (values of \code{baselineNumerator}
#' and \code{baselineDenominator} may be provided.
#' \code{propensity} is then generated as the numerator:denominator ratio
#' for each location in the input object
#' \item Otherwise \code{propensity} is generated using a 2D Normal
#' kernel density. The [\code{OOR::StoSOO}](https://rdrr.io/cran/OOR/man/StoSOO.html)
#' is used to achieve an intra-cluster correlation coefficient (ICC) that approximates
#' the value of \code{'ICC_inp'} by searching for an appropriate value of the kernel bandwidth.
#' }
#' \code{num[i]}, the synthetic outcome for location \code{i}
#' is simulated with expectation: \cr
#' \deqn{E(num[i]) = outcome0[i] * propensity[i] * denom[i] * (1 - effect*I[i])/mean(outcome0[] * propensity[])} \cr
#' The sampling distribution of \code{num[i]} depends on the value of \code{scale} as follows: \cr
#' - \code{scale}=’continuous’: Values of \code{num} are sampled from a
#' Normal distributions with means \code{E(num[i])}
#' and variance determined by the fitting to \code{ICC_inp}.\cr
#' - \code{scale}=’count’: Simulated events are allocated to locations via multivariate hypergeometric distributions
#' parameterised with \code{E(num[i])}.\cr
#' - \code{scale}=’proportion’: Simulated events are allocated to locations via multinomial distributions
#' parameterised with \code{E(num[i])}.\cr
#'
#' \code{denominator} may specify a vector of numeric (non-zero) values
#' in the input \code{"CRTsp"} or \code{data.frame} which is returned
#' as variable \code{denom}. It acts as a scale-factor for continuous outcomes, rate-multiplier
#' for counts, or denominator for proportions. For discrete data all values of \code{denom}
#' must be > 0.5 and are rounded to the nearest integer in calculations of \code{num}.\cr\cr
#' By default, \code{denom} is generated as a vector of ones, leading to simulation of
#' dichotomous outcomes if \code{scale}=’proportion’.\cr
#'
#' If baseline numerators and denominators are provided then the output vectors
#' \code{base_denom} and \code{base_num} are set to the input values. If baseline numerators and denominators
#' are not provided then the synthetic baseline data are generated by sampling around \code{propensity} in the same
#' way as the outcome data, but with the effect size set to zero.
#'
#' If \code{matchedPair} is \code{TRUE} then pair-matching on the baseline data will be used in randomization providing
#' there are an even number of clusters. If there are an odd number of clusters then matched pairs are not generated and
#' an unmatched randomization is output.
#'
#' Either \code{sigma_m} or \code{spillover_interval} must be provided. If both are provided then
#' the value of \code{sigma_m} is overwritten
#' by the standard deviation implicit in the value of \code{spillover_interval}.
#' Spillover is simulated as arising from a diffusion-like process.
#'
#' For further details see [Multerer (2021)](https://edoc.unibas.ch/85228/)
#' @export
#'
#' @examples
#' {smalltrial <- readdata('smalltrial.csv')
#' simulation <- simulateCRT(smalltrial,
#' effect = 0.25,
#' ICC_inp = 0.05,
#' outcome0 = 0.5,
#' matchedPair = FALSE,
#' scale = 'proportion',
#' sigma_m = 0.6,
#' tol = 0.05)
#' summary(simulation)
#' }
simulateCRT <- function(trial = NULL, effect = 0, outcome0 = NULL, generateBaseline = TRUE, matchedPair = TRUE,
scale = "proportion", baselineNumerator = "base_num", baselineDenominator = "base_denom",
denominator = NULL, ICC_inp = NULL, kernels = 200, sigma_m = NULL,
spillover_interval = NULL, tol = 5e-03) {
# Written by Tom Smith, July 2017. Adapted by Lea Multerer, September 2017
message("\n===================== SIMULATION OF CLUSTER RANDOMISED TRIAL =================\n")
bw <- NULL
initial_bandwidth <- 0.3
if (!is.null(trial)) CRT <- CRTsp(trial)
trial <- CRT$trial
if (is.null(trial$cluster)){
message("*** Clusters not yet assigned ***")
return()
}
trial$cluster <- as.factor(trial$cluster)
if (is.null(trial$arm)){
message("*** No randomization available ***")
return()
}
trial$arm <- as.factor(trial$arm)
# set the denominator variable to be 'denom'
if (is.null(denominator)) denominator <- "denom"
trial$denom <- trial[[denominator]]
if (denominator != "denom") trial[[denominator]] <- NULL
# If the denominator has no values set, set to one
if (is.null(trial$denom)) trial$denom <- 1
# use spillover interval if this is available
if (!is.null(spillover_interval)) {
sigma_m <- spillover_interval/(2 * qnorm(0.975))
}
if (is.null(sigma_m)) {
stop("spillover interval or s.d. of spillover must be provided")
}
# compute distances to nearest discordant locations if they do not exist
if (is.null(trial$nearestDiscord)) trial <- compute_distance(trial, distance = "nearestDiscord")$trial
# trial needs to be ordered for GEE analyses (estimation of ICC)
trial <- trial[order(trial$cluster), ]
# remove any pre-existing propensity if a new one is required
if (generateBaseline) trial$propensity <- NULL
centers <- assignkernels(trial = trial,
baselineNumerator = baselineNumerator,
baselineDenominator = baselineDenominator, kernels = kernels)
# For the smoothing step compute contributions to the relative effect size from other locations as a function of
# distance to the other locations
euclid <- distance_matrix(trial$x, trial$y)
# compute approximate diagonal of clusters
approx_diag <- sqrt((max(trial$x) - min(trial$x))^2 + (max(trial$y) -
min(trial$y))^2)/sqrt(length(unique(trial$cluster)))
if (is.null(ICC_inp)) {
stop("*** A target ICC must be specified ***")
}
message("Estimating the smoothing required to achieve the target ICC of ", ICC_inp)
# determine the required smoothing bandwidth by fitting to the pre-specified ICC
# random multiplier is used to prevent the random number stream being reset to the same value for any given bw value
random_multiplier <- sample(1e6, size = 1)
loss <- 999
nb_iter <- 20
if (identical(Sys.getenv("TESTTHAT"), "true")) nb_iter <- 5
# in testing, the number of iterations is reduced giving very approximate output
while (loss > tol) {
ICC.loss <- OOR::StoSOO(par = c(NA), fn = ICCdeviation, lower = -5, upper = 5, nb_iter = nb_iter,
trial = trial, ICC_inp = ICC_inp, centers = centers, approx_diag = approx_diag,
sigma_m = sigma_m, scale = scale, euclid = euclid, effect = effect, outcome0 = outcome0,
random_multiplier = random_multiplier)
loss <- ICC.loss$value
message("\rtol: ", tol, " loss = ", loss, " \r")
if(kernels > 500) {
loss <- tol
warning("*** Failure to converge on target ICC ***")
} else if(loss > tol) {
# if convergence was not achieved, re-assign the kernels, reducing the smoothness
kernels <- round(kernels * 2)
message("Increasing the number of kernels to ", kernels, "\r")
centers <- assignkernels(trial = trial,
baselineNumerator = baselineNumerator,
baselineDenominator = baselineDenominator,
kernels = kernels)
}
}
logbw <- ICC.loss$par
# recover the seed that generated the best fitting trial and use this to regenerate this trial
bw <- exp(logbw[1])
set.seed(round(bw * random_multiplier))
trial <- get_assignments(trial = trial, scale = scale, euclid = euclid, sigma_m = sigma_m, effect = effect,
outcome0 = outcome0, bw = bw, centers = centers, numerator = "num", denominator = "denom")
# create a baseline dataset using the optimized bandwidth
if (generateBaseline) trial <- get_assignments(trial = trial, scale = scale,
euclid = euclid, sigma_m = sigma_m, effect = 0, outcome0 = outcome0, bw = bw,
centers = centers, numerator = baselineNumerator,
denominator = baselineDenominator)
ICC <- get_ICC(trial = trial, scale = scale)
message("\rbandwidth: ", bw, " ICC = ", ICC, " loss = ", loss, " \r")
CRT$trial <- trial
CRT$design$nkernels <- kernels
return(CRTsp(CRT))
}
createPropensity <- function(trial, bandwidth, centers = centers){
xdist <- with(trial, outer(x, x[centers], "-"))
ydist <- with(trial, outer(y, y[centers], "-"))
# distance matrix between the locations and the kernel centers
euclid <- sqrt(xdist * xdist + ydist * ydist) #is not a square matrix
# 2d normal kernels
f <- (1/(2 * pi * bandwidth^2)) * exp(-(euclid^2)/(2 * (bandwidth^2)))
totalf <- (1/ncol(euclid)) * rowSums(f) #sums over rows of matrix f
# Scale propensity so that it is between zero and one
propensity <- totalf/max(totalf)
return(propensity)
}
# Assign expected outcome to each location assuming a fixed effect size.
get_assignments <- function(trial, scale, euclid, sigma_m, effect, outcome0,
bw, centers, numerator, denominator) {
expected_ratio <- num <- rowno <- sumnum <- NULL
# remove any superseded numerator variable
trial[[numerator]] <- NULL
# generate a pattern of propensity
trial$propensity <- f_1 <- createPropensity(trial, bandwidth = bw, centers = centers)
# Smooth the propensity. the s.d. in each dimension of the 2 d gaussian is bw
# f_2 is the value of propensity decremented by the effect of intervention and smoothed
# by applying a further kernel smoothing step (trap the case with no spillover)
if (sigma_m < 0.001) sigma_m <- 0.001
f_2 <- f_1 * (1 - effect * (trial$arm == "intervention"))
f_3 <- dispersal(bw = sigma_m, euclid = euclid) %*% f_2
if (identical(scale, "continuous")) {
# Note that the sd here is logically different from sigma_m, but how to choose a value?
trial$num <- rnorm(n = nrow(trial),
mean = f_3 * trial[[denominator]],
sd = sigma_m)
} else {
if (!(denominator %in% colnames(trial))) trial[[denominator]] <- 1
# the denominator must be an integer; this changes the value if a non-integral value is input
trial[[denominator]] <- round(trial[[denominator]], digits = 0)
# compute the total positives expected given the input effect size
npositives <- round(outcome0 * sum(trial[[denominator]]) * (1 - 0.5 * effect))
# scale to input value of initial prevalence by assigning required number of infections with probabilities proportional
# to smoothed multiplied by the denominator
expected_allocation <- f_3 * trial[[denominator]]/sum(f_3 * trial[[denominator]])
trial$expected_ratio <- expected_allocation/trial[[denominator]]
trial$rowno <- seq(1:nrow(trial))
# expand the vector of locations to allow for denominators > 1
triallong <- trial %>%
tidyr::uncount(trial[[denominator]])
# To generate count data, records in triallong can be sampled multiple times. To generate proportions each record can
# only be sampled once.
replacement <- identical(scale, "count")
# sample generates a multinomial sample and outputs the indices of the locations assigned
# trap the case when there are not enough positive locations
if (sum(triallong$expected_ratio > 0) < npositives) {
triallong$expected_ratio <- (triallong$expected_ratio + 1e-6)/sum(triallong$expected_ratio + 1e-6)
}
positives <- sample(x = nrow(triallong), size = npositives, replace = replacement, prob = triallong$expected_ratio)
triallong$num <- 0
# TODO: this needs to work also for count data (replace = TRUE above)
triallong$num[positives] <- 1
# summarise the numerator values into the original set of locations
numdf <- dplyr::group_by(triallong, rowno) %>%
dplyr::summarise(sumnum = sum(num))
numdf[[numerator]] <- numdf$sumnum
# use left_join to merge into the original data frame (records with zero denominator do not appear in numdf)
trial <- trial %>%
dplyr::left_join(numdf, by = "rowno")
# remove temporary variables and replace missing numerators with zero (because the multinomial sampling algorithm leaves
# NA values where no events are assigned)
trial <- subset(trial, select = -c(rowno, expected_ratio, sumnum))
if (sum(is.na(trial[[numerator]])) > 0) {
warning("*** Some records have zero denominator after rounding ***")
message("You may want to remove these records or rescale the denominators")
trial[[numerator]][is.na(trial[[numerator]])] <- 0
}
}
return(trial)
}
# deviation of ICC from target as a function of bandwidth
ICCdeviation <- function(logbw, trial, ICC_inp, centers, approx_diag, sigma_m, scale, euclid, effect, outcome0, random_multiplier) {
cluster <- NULL
# set the seed so that a reproducible result is obtained for a specific bandwidth
if (!is.null(logbw)) {
bw <- exp(logbw[1])
set.seed(round(bw * random_multiplier))
}
trial <- get_assignments(trial = trial, scale = scale, euclid = euclid, sigma_m = sigma_m, effect = effect,
outcome0 = outcome0, bw = bw, centers = centers, numerator = "num", denominator = "denom")
loss <- (get_ICC(trial = trial, scale = scale) - ICC_inp)^2
# message("\rbandwidth: ", bw, " ICC=", ICC, " loss = ", loss, " \r")
return(loss)
}
get_ICC <- function(trial, scale) {
link <- map_scale_to_link(scale)
trial$y1 <- trial$num
trial$y_off <- trial$denom
trial$y0 <- trial$denom - trial$num
description <- get_description(trial = trial, link = link, alpha = 0.05, baselineOnly = TRUE)
# Intracluster correlation
ICC <- description$ICC
return(ICC)
}
# compute a euclidian distance matrix
distance_matrix <- function(x, y) {
# generates square matrices of differences
xdist <- outer(x, x, "-")
ydist <- outer(y, y, "-")
euclid <- sqrt(xdist * xdist + ydist * ydist)
}
# add lognormal noise: not sure this function is needed X is the input vector comprising a sample from a smoothed
# distribution varXY is the required variance
add_noise <- function(X, varXY) {
muY <- 1
varY <- (varXY - var(X))/(1 + var(X))
mu <- log(muY/sqrt(1 + varY/muY))
var <- log(1 + varY/muY)
Y <- stats::rlnorm(length(XY), meanlog = mu, sdlog = sqrt(var))
XY <- X * Y
return(XY)
}
assignkernels <- function(trial, baselineNumerator, baselineDenominator, kernels){
if (is.null(trial$propensity) & baselineNumerator %in% colnames(trial)){
# If baseline numerator data exist and propensity doesn't, use the baseline numerator to select kernel centers
trial[[baselineDenominator]] <- ifelse(is.null(trial[[baselineDenominator]]), 1, trial[[baselineDenominator]])
# expand the vector of locations to allow for denominators > 1
possible_kernels <- dplyr::mutate(trial, kernelID = dplyr::row_number())
possible_kernels$propensity <- round(10 * possible_kernels[[baselineNumerator]]/possible_kernels[[baselineDenominator]])
possible_kernels <- tidyr::uncount(possible_kernels, possible_kernels$propensity)
centers <- possible_kernels$kernelID[sample(x = nrow(possible_kernels), size = kernels, replace = FALSE)]
} else {
centers <- sample(1:nrow(trial), size = kernels, replace = TRUE)
}
return(centers)
}
# contribution of l to i as a function of the Gaussian used in simulating spillover
dispersal <- function(bw, euclid) {
# bivariate normal kernel
f <- (1/(2 * pi * bw^2)) * exp(-(euclid^2)/(2 * (bw^2)))
neighbours <- ifelse(euclid < 2*qnorm(0.975)*bw,1,0)
f_neighbours <- f * neighbours
n_neighbours <- rowSums(neighbours) #sums over rows of matrix f considering only neighbours
dispersal <- f_neighbours/n_neighbours
return(dispersal)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.