#' Linear Regression of Y vs. Covariates with Y Measured in Pools and
#' (Potentially) Subject to Additive Normal Errors
#'
#' Assumes outcome given covariates is a normal-errors linear regression. Pooled
#' outcome measurements can be assumed precise or subject to additive normal
#' processing error and/or measurement error. Replicates are supported.
#'
#' The individual-level model of interest for Y|\strong{X} is:
#'
#' Y = beta_0 + \strong{beta_x}^T \strong{X} + e, e ~ N(0, sigsq)
#'
#' The implied model for summed Y*|\strong{X*} in a pool with g members is:
#'
#' Y* = g beta_0 + \strong{beta_x}^T \strong{X*} + e*, e* ~ N(0, g sigsq)
#'
#' The assay targets Ybar, the mean Y value for each pool, from which the sum Y*
#' can be calculated as Y* = g Ybar. But the Ybar's may be subject to processing
#' error and/or measurement error. Suppose Ybartilde is the imprecise version of
#' Ybar from the assay. If both errors are present, the assumed error structure
#' is:
#'
#' Ybartilde = Ybar + e_p I(g > 1) + e_m, e_p ~ N(0, sigsq_p),
#' e_m ~ N(0, sigsq_m)
#'
#' with the processing error e_p and measurement error e_m assumed independent
#' of each other. This motivates a maximum likelihood analysis for estimating
#' \strong{theta} = (beta_0, \strong{beta_x}^T)^T based on observed
#' (Ytilde*, \strong{X}*) values, where Ytilde* = g Ytildebar.
#'
#'
#' @param g Numeric vector with pool sizes, i.e. number of members in each pool.
#' @param ytilde Numeric vector (or list of numeric vectors, if some pools have
#' replicates) with poolwise sum \code{Ytilde} values.
#' @param x Numeric matrix with poolwise \strong{\code{X}} values (if any), with
#' one row for each pool. Can be a vector if there is only 1 covariate.
#' @param errors Character string specifying the errors that \code{Y} is subject
#' to. Choices are \code{"neither"}, \code{"processing"} for processing error
#' only, \code{"measurement"} for measurement error only, and \code{"both"}.
#' @param estimate_var Logical value for whether to return variance-covariance
#' matrix for parameter estimates.
#' @param start_nonvar_var Numeric vector of length 2 specifying starting value
#' for non-variance terms and variance terms, respectively.
#' @param lower_nonvar_var Numeric vector of length 2 specifying lower bound for
#' non-variance terms and variance terms, respectively.
#' @param upper_nonvar_var Numeric vector of length 2 specifying upper bound for
#' non-variance terms and variance terms, respectively.
#' @param nlminb_list List of arguments to pass to \code{\link[stats]{nlminb}}
#' for log-likelihood maximization.
#' @param hessian_list List of arguments to pass to
#' \code{\link[numDeriv]{hessian}}.
#'
#'
#' @return
#' List containing:
#' \enumerate{
#' \item Numeric vector of parameter estimates.
#' \item Variance-covariance matrix (if \code{estimate_var = TRUE}).
#' \item Returned \code{\link[stats]{nlminb}} object from maximizing the
#' log-likelihood function.
#' \item Akaike information criterion (AIC).
#' }
#'
#'
#' @references
#' Schisterman, E.F., Vexler, A., Mumford, S.L. and Perkins, N.J. (2010) "Hybrid
#' pooled-unpooled design for cost-efficient measurement of biomarkers."
#' \emph{Stat. Med.} \strong{29}(5): 597--613.
#'
#'
#' @examples
#' # Load dataset containing data frame with (g, X1*, X2*, Y*, Ytilde*) values
#' # for 500 pools each of size 1, 2, and 3, and list of Ytilde values where 20
#' # of the single-specimen pools have replicates. Ytilde values are affected by
#' # processing error and measurement error; true parameter values are
#' # beta_0 = 0.25, beta_x1 = 0.5, beta_x2 = 0.25, sigsq = 1.
#' data(dat_p_linreg_yerrors)
#' dat <- dat_p_linreg_yerrors$dat
#' reps <- dat_p_linreg_yerrors$reps
#'
#' # Fit Ytilde* vs. (X1*, X2*) ignoring errors in Ytilde (leads to loss of
#' # precision and overestimated sigsq, but no bias).
#' fit.naive <- p_linreg_yerrors(
#' g = dat$g,
#' y = dat$y,
#' x = dat[, c("x1", "x2")],
#' errors = "neither"
#' )
#' fit.naive$theta.hat
#'
#' # Account for errors in Ytilde*, without using replicates
#' fit.corrected.noreps <- p_linreg_yerrors(
#' g = dat$g,
#' y = dat$ytilde,
#' x = dat[, c("x1", "x2")],
#' errors = "both"
#' )
#' fit.corrected.noreps$theta.hat
#'
#' # Account for errors in Ytilde*, incorporating the 20 replicates
#' fit.corrected.reps <- p_linreg_yerrors(
#' g = dat$g,
#' y = reps,
#' x = dat[, c("x1", "x2")],
#' errors = "both"
#' )
#' fit.corrected.reps$theta.hat
#'
#' # In this trial, incorporating replicates resulted in much better estimates
#' # of sigsq (truly 1), sigsq_p (truly 0.4), and sigsq_m (truly = 0.2) but very
#' # similar regression coefficient estimates.
#' fit.corrected.noreps$theta.hat
#' fit.corrected.reps$theta.hat
#'
#'
#' @export
p_linreg_yerrors <- function(
g,
ytilde,
x = NULL,
errors = "processing",
estimate_var = TRUE,
start_nonvar_var = c(0.01, 1),
lower_nonvar_var = c(-Inf, 1e-4),
upper_nonvar_var = c(Inf, Inf),
nlminb_list = list(control = list(trace = 1, eval.max = 500, iter.max = 500)),
hessian_list = list(method.args = list(r = 4))
) {
# Check that inputs are valid
if (! errors %in% c("neither", "processing", "measurement", "both")) {
stop("The input 'errors' should be set to 'neither', 'processing',
'measurement', or 'both'.")
}
if (! is.logical(estimate_var)) {
stop("The input 'estimate_var' should be TRUE or FALSE.")
}
if (! (is.numeric(start_nonvar_var) & length(start_nonvar_var) == 2)) {
stop("The input 'start_nonvar_var' should be a numeric vector of length 2.")
}
if (! (is.numeric(lower_nonvar_var) & length(lower_nonvar_var) == 2)) {
stop("The input 'lower_nonvar_var' should be a numeric vector of length 2.")
}
if (! (is.numeric(upper_nonvar_var) & length(upper_nonvar_var) == 2)) {
stop("The input 'upper_nonvar_var' should be a numeric vector of length 2.")
}
# Sample size
n <- length(ytilde)
# Get number of X variables (and assign names)
if (is.null(x)) {
n.xvars <- 0
x.varnames <- NULL
} else {
x.varname <- deparse(substitute(x))
if (! is.matrix(x)) {
x <- as.matrix(x)
}
n.xvars <- ncol(x)
x.varnames <- colnames(x)
if (is.null(x.varnames)) {
if (n.xvars == 1) {
if (length(grep("$", x.varname, fixed = TRUE)) > 0) {
x.varname <- substr(x.varname,
start = which(unlist(strsplit(x.varname, "")) == "$") + 1,
stop = nchar(x.varname))
}
x.varnames <- x.varname
} else {
x.varnames <- paste("x", 1: n.xvars, sep = "")
}
}
}
# Get number of betas
n.betas <- 1 + n.xvars
# Create vector indicating which observations are pools
Ig <- ifelse(g > 1, 1, 0)
# Create matrix of (g, X) values
gx <- cbind(g, x)
# Separate into replicates and singles
class.ytilde <- class(ytilde)
if (class.ytilde == "list") {
k <- sapply(ytilde, length)
which.r <- which(k > 1)
n.r <- length(which.r)
some.r <- n.r > 0
if (some.r) {
# Replicates
g.r <- g[which.r]
Ig.r <- Ig[which.r]
k.r <- k[which.r]
ytilde.r <- ytilde[which.r]
gx.r <- gx[which.r, , drop = FALSE]
}
n <- n - n.r
some.s <- n > 0
if (some.s) {
# Singles
g <- g[-which.r]
Ig <- Ig[-which.r]
ytilde <- unlist(ytilde[-which.r])
gx <- gx[-which.r, , drop = FALSE]
}
} else {
some.r <- FALSE
some.s <- TRUE
}
# Get indices for parameters being estimated and create labels
loc.betas <- 1: n.betas
beta.labels <- paste("beta", c("0", x.varnames), sep = "_")
loc.sigsq <- n.betas + 1
if (errors == "neither") {
theta.labels <- c(beta.labels, "sigsq")
} else if (errors == "processing") {
theta.labels <- c(beta.labels, "sigsq", "sigsq_p")
} else if (errors == "measurement") {
theta.labels <- c(beta.labels, "sigsq", "sigsq_m")
} else if (errors == "both") {
theta.labels <- c(beta.labels, "sigsq", "sigsq_p", "sigsq_m")
}
# Log-likelihood function
llf <- function(f.theta) {
# Extract parameters
f.betas <- matrix(f.theta[loc.betas], ncol = 1)
f.sigsq <- f.theta[loc.sigsq]
if (errors == "neither") {
f.sigsq_p <- 0
f.sigsq_m <- 0
} else if (errors == "measurement") {
f.sigsq_p <- 0
f.sigsq_m <- f.theta[loc.sigsq + 1]
} else if (errors == "processing") {
f.sigsq_p <- f.theta[loc.sigsq + 1]
f.sigsq_m <- 0
} else if (errors == "both") {
f.sigsq_p <- f.theta[loc.sigsq + 1]
f.sigsq_m <- f.theta[loc.sigsq + 2]
}
# Likelihood:
# L_i = f(Ytilde|X)
if (some.r) {
ll.vals <- c()
for (ii in 1: length(ytilde.r)) {
# Values for ith subject
g_i <- g.r[ii]
Ig_i <- Ig.r[ii]
k_i <- k.r[ii]
ytilde_i <- ytilde.r[[ii]]
gx_i <- gx.r[ii, ]
# E(Ytilde|X) and V(Ytilde|X)
Mu_ytilde.x <- matrix(gx_i %*% f.betas, nrow = k_i)
Sigma_ytilde.x <- g_i * f.sigsq +
g_i^2 * f.sigsq_p * Ig_i +
diag(g_i^2 * f.sigsq_m, k_i)
# Log-likelihood
ll.vals[ii] <- dmvnorm(x = ytilde_i, log = TRUE,
mean = Mu_ytilde.x,
sigma = Sigma_ytilde.x)
}
ll.r <- sum(ll.vals)
} else {
ll.r <- 0
}
if (some.s) {
# E(Ytilde|X) and V(Ytilde|X)
mu_ytilde.x <- gx %*% f.betas
sigsq_ytilde.x <- g * f.sigsq + g^2 * f.sigsq_p * Ig + g^2 * f.sigsq_m
# Log-likelihood
ll.s <- sum(dnorm(x = ytilde, log = TRUE,
mean = mu_ytilde.x, sd = sqrt(sigsq_ytilde.x)))
} else {
ll.s <- 0
}
# Return negative log-likelihood
ll <- ll.r + ll.s
return(-ll)
}
# Starting values
if (is.null(nlminb_list$start)) {
if (errors == "neither") {
nlminb_list$start <- c(rep(start_nonvar_var[1], n.betas),
start_nonvar_var[2])
} else if (errors %in% c("measurement", "processing")) {
nlminb_list$start <- c(rep(start_nonvar_var[1], n.betas),
rep(start_nonvar_var[2], 2))
} else if (errors == "both") {
nlminb_list$start <- c(rep(start_nonvar_var[1], n.betas),
rep(start_nonvar_var[2], 3))
}
}
names(nlminb_list$start) <- theta.labels
# Lower bounds
if (is.null(nlminb_list$lower)) {
if (errors == "neither") {
nlminb_list$lower <- c(rep(lower_nonvar_var[1], n.betas),
lower_nonvar_var[2])
} else if (errors %in% c("measurement", "processing")) {
nlminb_list$lower <- c(rep(lower_nonvar_var[1], n.betas),
rep(lower_nonvar_var[2], 2))
} else if (errors == "both") {
nlminb_list$lower <- c(rep(lower_nonvar_var[1], n.betas),
rep(lower_nonvar_var[2], 3))
}
}
# Upper bounds
if (is.null(nlminb_list$upper)) {
if (errors == "neither") {
nlminb_list$upper <- c(rep(upper_nonvar_var[1], n.betas),
upper_nonvar_var[2])
} else if (errors %in% c("measurement", "processing")) {
nlminb_list$upper <- c(rep(upper_nonvar_var[1], n.betas),
rep(upper_nonvar_var[2], 2))
} else if (errors == "both") {
nlminb_list$upper <- c(rep(upper_nonvar_var[1], n.betas),
rep(upper_nonvar_var[2], 3))
}
}
# Obtain ML estimates
ml.max <- do.call(nlminb, c(list(objective = llf), nlminb_list))
# Print message if nlminb indicates non-convergence
if (ml.max$convergence == 1) {
message("Object returned by 'nlminb' function indicates non-convergence. You may want to try different starting values.")
}
# Create list to return
theta.hat <- ml.max$par
names(theta.hat) <- theta.labels
ret.list <- list(theta.hat = theta.hat)
# If requested, add variance-covariance matrix to ret.list
if (estimate_var) {
# Estimate Hessian
hessian.mat <- do.call(numDeriv::hessian,
c(list(func = llf, x = theta.hat),
hessian_list))
# Estimate variance-covariance matrix
theta.variance <- try(solve(hessian.mat), silent = TRUE)
if (class(theta.variance)[1] == "try-error" | sum(is.na(hessian.mat)) > 0) {
message("Estimated Hessian matrix is singular, so variance-covariance matrix cannot be obtained.")
ret.list$theta.var <- NULL
} else {
colnames(theta.variance) <- rownames(theta.variance) <- theta.labels
ret.list$theta.var <- theta.variance
}
}
# Add nlminb object and AIC to ret.list
ret.list$nlminb.object <- ml.max
ret.list$aic <- 2 * (length(theta.hat) + ml.max$objective)
# Return ret.list
return(ret.list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.