Nothing
#' @title Fit a z-curve to clustered data
#'
#' @description \code{zcurve_clustered} is used to fit z-curve models to
#' clustered data. The function requires a data object created with the
#' [zcurve_data()] function as the input (where id denotes clusters).
#' Two different methods that account for clustering ar implemented via
#' the EM model: \code{"w"} for down weighting the likelihood of the test
#' statistics proportionately to the number of repetitions in the clusters,
#' and \code{"b"} for a nested bootstrap where only a single study from each
#' bootstrap is selected for model fitting.
#' @param data an object created with [zcurve_data()] function.
#' @param method the method to be used for fitting. Possible options are
#' down weighting \code{"w"} and nested bootstrap \code{"b"}.
#' Defaults to \code{"w"}.
#' @param bootstrap the number of bootstraps for estimating CI. To skip
#' bootstrap specify \code{FALSE}.
#' @param parallel whether the bootstrap should be performed in parallel.
#' Defaults to \code{FALSE}. The implementation is not completely stable
#' and might cause a connection error.
#' @param control additional options for the fitting algorithm more details in
#' \link[=control_EM]{control EM}.
#'
#'
#' @references
#' \insertAllCited{}
#'
#' @return The fitted z-curve object
#'
#' @seealso [zcurve()], [summary.zcurve()], [plot.zcurve()], [control_EM], [control_density]
#' @export
zcurve_clustered <- function(data, method = "b", bootstrap = 1000, parallel = FALSE, control = NULL){
warning("Please note that the clustering adjustment is an experimental feature.", immediate. = TRUE)
if(!method %in% c("w", "b"))
stop("Wrong method, select a supported option.")
if(method == "b" && is.logical(bootstrap) && !bootstrap)
stop("The nested boostrap method requires bootstrap.")
# set bootstrap
if(!is.numeric(bootstrap)){
bootstrap <- FALSE
}else if(bootstrap <= 0){
bootstrap <- FALSE
}
if(!inherits(data, "zcurve_data")){
stop("The 'data' input must be created by the `zcurve_data()` function. See `?zcurve_data()` for more information.")
}
# create results object
object <- NULL
object$call <- match.call()
object$method <- switch(method, "w" = "EM (weighted)", "b" = "EM (bootstrapped)")
object$input_type <- "zcurve-data"
# create control
control <- .zcurve_EM.control(control)
### prepare data
if(nrow(data$precise) != 0){
z <- .p_to_z(data$precise$p)
z_id <- data$precise$id
}else{
z <- numeric()
z_id <- numeric()
}
# get the total number of observations before removal for fitting purposes
N_obs <- length(data$precise$p)
N_sig <- length(data$precise$p[.p_to_z(data$precise$p) >= control$a])
if(nrow(data$censored) != 0){
lb <- .p_to_z(data$censored$p.ub)
ub <- .p_to_z(data$censored$p.lb)
b_id <- data$censored$id
# get the total number of observations before removal for fitting purposes (using the collected bounds before accounting for rounding)
N_obs <- N_obs + length(.p_to_z(data$censored$p.rep))
N_sig <- N_sig + length(.p_to_z(data$censored$p.rep)[.p_to_z(data$censored$p.rep) >= control$a])
# remove non-significant censored p-values
if(any(lb < control$a)){
warning(paste0(sum(lb < control$a), " censored p-values removed due to the upper bound being larger that the fitting range."), immediate. = TRUE, call. = FALSE)
b_id <- b_id[lb >= control$a]
ub <- ub[lb >= control$a]
lb <- lb[lb >= control$a]
}
# move too significant censored p-values among precise p-values
if(length(lb) > 0 && any(lb >= control$b)){
object$data <- c(object$data, lb[lb >= control$b])
b_id <- b_id[lb < control$b]
ub <- ub[lb < control$b]
lb <- lb[lb < control$b]
}
if(length(lb) > 0){
# restrict the upper censoring to the fitting range
ub <- ifelse(ub > control$b, control$b, ub)
# update control
control$type <- 3
}else{
lb <- numeric()
ub <- numeric()
b_id <- numeric()
}
}else{
lb <- numeric()
ub <- numeric()
b_id <- numeric()
}
object$data <- z
object$data_id <- z_id
object$data_censoring <- data.frame(lb = lb, ub = ub, id = b_id)
object$control <- control
object$N_obs <- N_obs
object$N_sig <- N_sig
# only run the algorithm with some significant results
if(sum(z > control$a & z < control$b) + length(lb) < 10)
stop("There must be at least 10 z-scores in the fitting range but a much larger number is recommended.")
# use appropriate algorithm
if(method == "b"){
fit_b <- .zcurve_EM_b(z = z, z_id = z_id, lb = lb, ub = ub, b_id = b_id, control = control, bootstrap = bootstrap, parallel = parallel)
fit <- fit_b$fit
}else if(method == "w"){
fit <- .zcurve_EM_w(z = z, z_id = z_id, lb = lb, ub = ub, b_id = b_id, control = control)
}
object$fit <- fit
# check convergence
object$converged <- ifelse(fit$iter < control$max_iter, TRUE, FALSE)
# do bootstrap
if(bootstrap != FALSE){
if(method == "b"){
fit_boot <- fit_b$fit_boot
}else if(method == "w"){
if(parallel){
fit_boot <- .zcurve_EM_w_boot.par(z = z, z_id = z_id, lb = lb, ub = ub, b_id = b_id, control = control, fit = fit, bootstrap = bootstrap)
}else{
fit_boot <- .zcurve_EM_w_boot(z = z, z_id = z_id, lb = lb, ub = ub, b_id = b_id, control = control, fit = fit, bootstrap = bootstrap)
}
}
object$boot <- fit_boot
}
# estimates
object$coefficients <- .get_estimates(mu = fit$mu, weights = fit$weights, prop_high = fit$prop_high, sig_level = control$sig_level, a = control$a)
# boot estimates
if(bootstrap != FALSE){
object$coefficients_boot <- data.frame(t(sapply(1:bootstrap, function(i){
.get_estimates(mu = fit_boot$mu[i,], weights = fit_boot$weights[i,], prop_high = fit_boot$prop_high[i], sig_level = control$sig_level, a = control$a)
})))
}
class(object) <- c("zcurve", "zcurve.clustered")
return(object)
}
.zcurve_EM_b <- function(z, z_id, lb, ub, b_id, control, bootstrap, parallel){
# get starting value z-curves
fit_start <- .zcurve_EMc_start_fast_RCpp(x = z,
lb = lb,
ub = ub,
K = control$K,
mu = control$mu,
sigma = control$sigma,
mu_alpha = control$mu_alpha,
mu_max = control$mu_max,
theta_alpha = control$theta_alpha,
a = control$a,
b = control$b,
sig_level = control$sig_level,
fit_reps = control$fit_reps,
max_iter = control$max_iter_start,
criterion = control$criterion_start)
# fit final z-curve
data_index <- rbind(
data.frame(
z = z,
lb = rep(NA, length(z)),
ub = rep(NA, length(z)),
id = z_id,
type = rep(1, length(z))
),
data.frame(
z = rep(NA, length(lb)),
lb = lb,
ub = ub,
id = b_id,
type = rep(2, length(lb))
)
)
if(parallel){
cores <- zcurve.get_option("max_cores")
core_load <- split(1:bootstrap, rep(1:cores, length.out = bootstrap))
core_load <- sapply(core_load, length)
initial_seed <- sample(.Machine$integer.max, 1)
cl <- parallel::makePSOCKcluster(cores)
parallel::clusterEvalQ(cl, {library("zcurve")})
parallel::clusterExport(cl, c("fit_start", "control", "data_index", "core_load", "initial_seed"), envir = environment())
fit <- parallel::parLapplyLB(cl, 1:cores, function(i){
set.seed(initial_seed + i)
fit <- list()
for(i in 1:core_load[i]){
boot_data <- .boot_id(data_index)
fit[[i]] <- .zcurve_EMc_fit_fast_RCpp(x = boot_data$z[boot_data$type == 1],
lb = boot_data$lb[boot_data$type == 2],
ub = boot_data$ub[boot_data$type == 2],
mu = fit_start$mu[which.max(fit_start$Q),],
sigma = control$sigma,
theta = fit_start$weights[which.max(fit_start$Q),],
a = control$a,
b = control$b,
sig_level = control$sig_level,
max_iter = control$max_iter,
criterion = control$criterion)
}
return(fit)
})
parallel::stopCluster(cl)
fit <- do.call(c, fit)
}else{
fit <- list()
for(i in 1:bootstrap){
boot_data <- .boot_id(data_index)
fit[[i]] <- .zcurve_EMc_fit_fast_RCpp(x = boot_data$z[boot_data$type == 1],
lb = boot_data$lb[boot_data$type == 2],
ub = boot_data$ub[boot_data$type == 2],
mu = fit_start$mu[which.max(fit_start$Q),],
sigma = control$sigma,
theta = fit_start$weights[which.max(fit_start$Q),],
a = control$a,
b = control$b,
sig_level = control$sig_level,
max_iter = control$max_iter,
criterion = control$criterion)
}
}
fit_boot = list(
"mu" = do.call(rbind, lapply(fit, function(f) f[["mu"]])),
"weights" = do.call(rbind, lapply(fit, function(f) f[["weights"]])),
"prop_high" = do.call(c, lapply(fit, function(f) f[["prop_high"]])),
"Q" = do.call(c, lapply(fit, function(f) f[["Q"]])),
"iter" = do.call(c, lapply(fit, function(f) f[["iter"]]))
)
return(
list(
fit = list(
"mu" = apply(fit_boot[["mu"]], 2, mean),
"weights" = apply(fit_boot[["weights"]], 2, mean),
"prop_high" = mean(fit_boot[["prop_high"]]),
"Q" = mean(fit_boot[["Q"]]),
"iter" = mean(fit_boot[["iter"]]),
"iter_start" = fit_start$iter[which.max(fit_start$Q)]
),
fit_boot = fit_boot
)
)
}
.zcurve_EM_w <- function(z, z_id, lb, ub, b_id, control){
# get starting value z-curves
fit_start <- .zcurve_EMc_start_fast_RCpp(x = z,
lb = lb,
ub = ub,
K = control$K,
mu = control$mu,
sigma = control$sigma,
mu_alpha = control$mu_alpha,
mu_max = control$mu_max,
theta_alpha = control$theta_alpha,
a = control$a,
b = control$b,
sig_level = control$sig_level,
fit_reps = control$fit_reps,
max_iter = control$max_iter_start,
criterion = control$criterion_start)
# compute weights
id_freq <- table(c(z_id, b_id))
id_weights <- data.frame(
id = names(id_freq),
w = 1/as.vector(id_freq)
)
# fit final z-curve
fit <- .zcurve_EMc_fit_fast_w_RCpp(x = z,
x_w = id_weights$w[match(z_id, id_weights$id)],
lb = lb,
ub = ub,
b_w = id_weights$w[match(b_id, id_weights$id)],
mu = fit_start$mu[which.max(fit_start$Q),],
sigma = control$sigma,
theta = fit_start$weights[which.max(fit_start$Q),],
a = control$a,
b = control$b,
sig_level = control$sig_level,
max_iter = control$max_iter,
criterion = control$criterion)
return(
list(
"mu" = fit$mu,
"weights" = fit$weights,
"prop_high" = fit$prop_high,
"Q" = fit$Q,
"iter" = fit$iter,
"iter_start" = fit_start$iter[which.max(fit_start$Q)]
)
)
}
.zcurve_EM_w_boot <- function(z, z_id, lb, ub, b_id, control, fit, bootstrap){
# compute weights
id_freq <- table(c(z_id, b_id))
id_weights <- data.frame(
id = names(id_freq),
w = 1/as.vector(id_freq)
)
x_w <- id_weights$w[match(z_id, id_weights$id)]
b_w <- id_weights$w[match(b_id, id_weights$id)]
indx <- c(
if(length(z) > 0) 1:length(z),
if(length(lb) > 0) (-length(lb)):-1
)
fit_boot <- .zcurve_EMc_boot_fast_w_RCpp(x = z,
x_w = x_w,
lb = lb,
ub = ub,
b_w = b_w,
indx = indx,
mu = fit$mu,
sigma = control$sigma,
theta = fit$weights,
a = control$a,
b = control$b,
sig_level = control$sig_level,
bootstrap = bootstrap,
criterion = control$criterion_boot,
max_iter = control$max_iter_boot)
return(
list(
"mu" = fit_boot$mu,
"weights" = fit_boot$weights,
"Q" = fit_boot$Q,
"prop_high" = fit_boot$prop_high,
"iter" = fit_boot$iter
)
)
}
.zcurve_EM_w_boot.par <- function(z, z_id, lb, ub, b_id, control, fit, bootstrap){
cores <- zcurve.get_option("max_cores")
core_load <- split(1:bootstrap, rep(1:cores, length.out = bootstrap))
core_load <- sapply(core_load, length)
initial_seed <- sample(.Machine$integer.max, 1)
cl <- parallel::makePSOCKcluster(cores)
parallel::clusterEvalQ(cl, {library("zcurve")})
parallel::clusterExport(cl, c("z", "z_id", "lb", "ub", "b_id", "control", "fit", "bootstrap", "core_load", "initial_seed"), envir = environment())
fit_boot <- parallel::parLapplyLB(cl, 1:cores, function(i){
set.seed(initial_seed + i)
return(.zcurve_EM_w_boot(z, z_id, lb, ub, b_id, control, fit, core_load[i]))
})
parallel::stopCluster(cl)
return(
list(
"mu" = do.call(rbind, lapply(fit_boot, function(x) x$mu)),
"weights" = do.call(rbind, lapply(fit_boot, function(x) x$weights)),
"Q" = do.call(c, lapply(fit_boot, function(x) x$Q)),
"prop_high" = do.call(c, lapply(fit_boot, function(x) x$prop_high)),
"iter" = do.call(c, lapply(fit_boot, function(x) x$iter))
)
)
}
.boot_id <- function(data){
unique_id <- unique(data$id)
if(length(unique_id) == nrow(data)){
return(data)
}
boot_out <- list()
for(id in unique_id){
temp_data <- data[data$id == id,]
if(nrow(temp_data) == 1){
boot_out[[id]] <- temp_data
}else{
boot_out[[id]] <- temp_data[sample(nrow(temp_data), 1),]
}
}
boot_out <- do.call(rbind, boot_out)
return(boot_out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.