#' Gamma Discriminant Function Approach for Estimating Odds Ratio with Exposure
#' Measured in Pools and Potentially Subject to Multiplicative Lognormal Errors
#' (Non-constant Odds Ratio Version)
#'
#' See \code{\link{p_gdfa}}.
#'
#'
#' @param g Numeric vector with pool sizes, i.e. number of members in each pool.
#' @param y Numeric vector with poolwise Y values, coded 0 if all members are
#' controls and 1 if all members are cases.
#' @param xtilde Numeric vector (or list of numeric vectors, if some pools have
#' replicates) with Xtilde values.
#' @param c List where each element is a numeric matrix containing the
#' \strong{C} values for members of a particular pool (1 row for each member).
#' @param errors Character string specifying the errors that X is subject to.
#' Choices are \code{"neither"}, \code{"processing"} for processing error only,
#' \code{"measurement"} for measurement error only, and \code{"both"}.
#' @param estimate_var Logical value for whether to return variance-covariance
#' matrix for parameter estimates.
#' @param start_nonvar_var Numeric vector of length 2 specifying starting value
#' for non-variance terms and variance terms, respectively.
#' @param lower_nonvar_var Numeric vector of length 2 specifying lower bound for
#' non-variance terms and variance terms, respectively.
#' @param upper_nonvar_var Numeric vector of length 2 specifying upper bound for
#' non-variance terms and variance terms, respectively.
#' @param jitter_start Numeric value specifying standard deviation for mean-0
#' normal jitters to add to starting values for a second try at maximizing the
#' log-likelihood, should the initial call to \code{\link[stats]{nlminb}} result
#' in non-convergence. Set to \code{NULL} for no second try.
#' @param hcubature_list List of arguments to pass to
#' \code{\link[cubature]{hcubature}} for numerical integration.
#' @param nlminb_list List of arguments to pass to \code{\link[stats]{nlminb}}
#' for log-likelihood maximization.
#' @param hessian_list List of arguments to pass to
#' \code{\link[numDeriv]{hessian}} for approximating the Hessian matrix. Only
#' used if \code{estimate_var = TRUE}.
#' @param nlminb_object Object returned from \code{\link[stats]{nlminb}} in a
#' prior call. Useful for bypassing log-likelihood maximization if you just want
#' to re-estimate the Hessian matrix with different options.
#'
#'
#' @return List containing:
#' \enumerate{
#' \item Numeric vector of parameter estimates.
#' \item Variance-covariance matrix.
#' \item Returned \code{\link[stats]{nlminb}} object from maximizing the
#' log-likelihood function.
#' \item Akaike information criterion (AIC).
#' }
#'
#'
#' @references
#' Lyles, R.H., Van Domelen, D.R., Mitchell, E.M. and Schisterman, E.F. (2015)
#' "A discriminant function approach to adjust for processing and measurement
#' error when a biomarker is assayed in pooled samples."
#' \emph{Int. J. Environ. Res. Public Health} \strong{12}(11): 14723--14740.
#'
#' Mitchell, E.M., Lyles, R.H., and Schisterman, E.F. (2015) "Positing, fitting,
#' and selecting regression models for pooled biomarker data." \emph{Stat. Med.}
#' \strong{34}(17): 2544--2558.
#'
#' Schisterman, E.F., Vexler, A., Mumford, S.L. and Perkins, N.J. (2010) "Hybrid
#' pooled-unpooled design for cost-efficient measurement of biomarkers."
#' \emph{Stat. Med.} \strong{29}(5): 597--613.
#'
#' Whitcomb, B.W., Perkins, N.J., Zhang, Z., Ye, A., and Lyles, R. H. (2012)
#' "Assessment of skewed exposure in case-control studies with pooling."
#' \emph{Stat. Med.} \strong{31}: 2461--2472.
#'
#'
#' @export
# data(dat_p_gdfa)
# dat <- dat_p_gdfa$dat
# reps <- dat_p_gdfa$reps
# c.list <- dat_p_gdfa$c.list
#
# g <- dat$g
# y <- dat$y
# xtilde <- reps
# c <- c.list
# errors <- "both"
# estimate_var <- TRUE
p_gdfa_nonconstant <- function(
  g,
  y,
  xtilde,
  c = NULL,
  errors = "processing",
  estimate_var = TRUE,
  start_nonvar_var = c(0.01, 1),
  lower_nonvar_var = c(-Inf, 1e-4),
  upper_nonvar_var = c(Inf, Inf),
  jitter_start = 0.01,
  hcubature_list = list(tol = 1e-8),
  nlminb_list = list(control = list(trace = 1, eval.max = 500, iter.max = 500)),
  hessian_list = list(method.args = list(r = 4)),
  nlminb_object = NULL
) {

  # Check that inputs are valid
  if (! errors %in% c("neither", "processing", "measurement", "both")) {
    stop("The input 'errors' should be set to 'neither', 'processing', 'measurement', or 'both'.")
  }
  if (! is.logical(estimate_var)) {
    stop("The input 'estimate_var' should be TRUE or FALSE.")
  }
  if (! (is.numeric(start_nonvar_var) && length(start_nonvar_var) == 2)) {
    stop("The input 'start_nonvar_var' should be a numeric vector of length 2.")
  }
  if (! (is.numeric(lower_nonvar_var) && length(lower_nonvar_var) == 2)) {
    stop("The input 'lower_nonvar_var' should be a numeric vector of length 2.")
  }
  if (! (is.numeric(upper_nonvar_var) && length(upper_nonvar_var) == 2)) {
    stop("The input 'upper_nonvar_var' should be a numeric vector of length 2.")
  }
  # '&&' short-circuits when jitter_start is NULL; with '&', 'NULL <= 0'
  # evaluates to logical(0) and if() throws "argument is of length zero"
  if (! is.null(jitter_start) && jitter_start <= 0) {
    stop("The input 'jitter_start' should be a positive value, if specified.")
  }

  # Get name of y input (strip any leading 'dataframe$' part) for labeling
  # the gamma_y coefficient
  y.varname <- deparse(substitute(y))
  if (length(grep("$", y.varname, fixed = TRUE)) > 0) {
    y.varname <- substr(y.varname,
                        start = which(unlist(strsplit(y.varname, "")) == "$") + 1,
                        stop = nchar(y.varname))
  }

  # Get information about covariates C
  if (is.null(c)) {
    c.varnames <- NULL
    n.cvars <- 0
    some.cs <- FALSE
  } else {
    n.cvars <- ncol(c[[1]])
    some.cs <- TRUE
    c.varnames <- colnames(c[[1]])
    if (is.null(c.varnames)) {
      if (n.cvars == 1) {
        c.varnames <- deparse(substitute(c))
      } else {
        c.varnames <- paste0("c", seq_len(n.cvars))
      }
    }
  }

  # Sample size (number of pools)
  n <- length(y)

  # Get number of gammas (intercept + Y + covariates)
  n.gammas <- 2 + n.cvars

  # Figure out pool sizes from covariate matrices if not specified
  if (is.null(g)) {
    g <- sapply(c, nrow)
  }

  # Indicator for pools of size > 1 (subject to processing error)
  Ig <- as.numeric(g > 1)

  # Construct list of (1, Y, C) design matrices, one per pool, with 1 row
  # per pool member. SIMPLIFY = FALSE guarantees a list even when all pools
  # have the same size (mapply would otherwise collapse the result into an
  # array, breaking the list indexing/iteration below).
  if (some.cs) {
    oneyc <- mapply(
      FUN = function(y, c) {
        cbind(1, y, c)
      },
      c = c,
      y = y,
      SIMPLIFY = FALSE
    )
  } else {
    # No covariates: each pool's design matrix is g rows of (1, y). (A single
    # n x 2 matrix would break 'oneyc[which.p]' and the per-pool sapply's,
    # which expect a list of matrices.)
    oneyc <- mapply(
      FUN = function(y, g) {
        cbind(1, rep(y, g))
      },
      y = y,
      g = g,
      SIMPLIFY = FALSE
    )
  }

  # If no measurement error and xtilde is a list, just use first measurements.
  # Also, calculate number of replicates per pool.
  if (is.list(xtilde)) {
    if (errors %in% c("neither", "processing")) {
      xtilde <- sapply(xtilde, function(x) x[1])
      k <- rep(1, n)
    } else {
      k <- sapply(xtilde, length)
    }
  } else {
    k <- rep(1, n)
  }

  # Separate pools into those with precisely measured X ('p'), replicate
  # Xtilde's ('r'), and a single imprecise Xtilde ('i')
  if (errors == "neither") {
    which.p <- seq_len(n)
    which.r <- NULL
    which.i <- NULL
  } else if (errors == "processing") {
    which.p <- which(Ig == 0)
    which.r <- NULL
    which.i <- which(Ig == 1)
  } else if (errors == "measurement") {
    which.p <- NULL
    which.r <- which(k > 1)
    which.i <- which(k == 1)
  } else if (errors == "both") {
    which.p <- NULL
    which.r <- which(k > 1)
    which.i <- which(k == 1)
  }

  n.p <- length(which.p)
  some.p <- n.p > 0
  if (some.p) {
    g.p <- g[which.p]
    Ig.p <- Ig[which.p]
    y.p <- y[which.p]
    oneyc.p <- oneyc[which.p]
    x.p <- unlist(xtilde[which.p])
  }

  n.r <- length(which.r)
  some.r <- n.r > 0
  if (some.r) {
    k.r <- k[which.r]
    g.r <- g[which.r]
    Ig.r <- Ig[which.r]
    y.r <- y[which.r]
    oneyc.r <- oneyc[which.r]
    xtilde.r <- xtilde[which.r]
  }

  n.i <- length(which.i)
  some.i <- n.i > 0
  if (some.i) {
    g.i <- g[which.i]
    Ig.i <- Ig[which.i]
    y.i <- y[which.i]
    oneyc.i <- oneyc[which.i]
    xtilde.i <- unlist(xtilde[which.i])
  }

  # Get indices for parameters being estimated and create labels
  loc.gammas <- seq_len(n.gammas)
  gamma.labels <- paste("gamma", c("0", y.varname, c.varnames), sep = "_")
  loc.bs <- (n.gammas + 1): (n.gammas + 2)
  theta.labels <- c(gamma.labels, "b1", "b0")
  if (errors == "processing") {
    theta.labels <- c(theta.labels, "sigsq_p")
  } else if (errors == "measurement") {
    theta.labels <- c(theta.labels, "sigsq_m")
  } else if (errors == "both") {
    theta.labels <- c(theta.labels, "sigsq_p", "sigsq_m")
  }

  # Integrand for pools with a single Xtilde or with replicates. The X
  # integral over (0, Inf) is mapped onto (0, 1) via the substitution
  # s = x / (1 - x), so hcubature can use finite limits.
  if (some.i || some.r) {

    lf <- function(Ig,
                   k,
                   xtilde,
                   x,
                   shape,
                   scale,
                   sigsq_p,
                   sigsq_m) {

      # f(Xtilde, X | Y, C_1, ..., C_g), one column per quadrature point
      # (vectorInterface = TRUE passes x as a 1-row matrix of points)
      x <- matrix(x, nrow = 1)
      f_xtildex.yc <- apply(x, 2, function(z) {

        # Back-transform quadrature point from (0, 1) to (0, Inf)
        s <- z / (1 - z)

        if (k == 1) {

          # E[log(Xtilde)|X] and V[log(Xtilde)|X]: lognormal multiplicative
          # errors; processing error applies only to pools (Ig = 1)
          mu_logxtilde.x <- log(s) - 1/2 * (sigsq_p * Ig + sigsq_m)
          sigsq_logxtilde.x <- sigsq_p * Ig + sigsq_m

          # Density f(Xtilde|X) * f(X|Y,C) (1/xtilde is the lognormal Jacobian)
          1 / xtilde * dnorm(x = log(xtilde),
                             mean = mu_logxtilde.x,
                             sd = sqrt(sigsq_logxtilde.x)) *
            dgamma(x = s, shape = shape, scale = scale)

        } else {

          # Replicates: shared processing error induces covariance sigsq_p
          # between replicates; measurement error adds sigsq_m on the diagonal
          Mu_logxtilde.x <- rep(log(s) - 1/2 * (sigsq_p * Ig + sigsq_m), k)
          Sigma_logxtilde.x <- sigsq_p * Ig + diag(sigsq_m, k)

          # Density
          1 / prod(xtilde) * dmvnorm(x = log(xtilde),
                                     mean = Mu_logxtilde.x,
                                     sigma = Sigma_logxtilde.x) *
            dgamma(x = s, shape = shape, scale = scale)

        }

      })

      # Jacobian of the substitution: ds/dz = 1 / (1 - z)^2
      matrix(f_xtildex.yc / (1 - x)^2, ncol = ncol(x))

    }

  }

  # Negative log-likelihood as a function of the full parameter vector
  llf <- function(f.theta, estimating.hessian = FALSE) {

    # Extract parameters
    f.gammas <- matrix(f.theta[loc.gammas], ncol = 1)
    f.b1 <- f.theta[loc.bs[1]]
    f.b0 <- f.theta[loc.bs[2]]

    if (errors == "neither") {
      f.sigsq_p <- 0
      f.sigsq_m <- 0
    } else if (errors == "measurement") {
      f.sigsq_p <- 0
      f.sigsq_m <- f.theta[loc.bs[2] + 1]
    } else if (errors == "processing") {
      f.sigsq_p <- f.theta[loc.bs[2] + 1]
      f.sigsq_m <- 0
    } else if (errors == "both") {
      f.sigsq_p <- f.theta[loc.bs[2] + 1]
      f.sigsq_m <- f.theta[loc.bs[2] + 2]
    }

    if (some.p) {

      # Likelihood for pools with precisely measured X:
      # L = f(X|Y,C_1, ..., C_g)
      # Pool-level shape is the sum of member-level shapes exp(gamma'(1,y,c))
      shapes <- sapply(oneyc.p, function(x) sum(exp(x %*% f.gammas)))
      scales <- ifelse(y.p == 1, f.b1, f.b0)
      ll.p <- sum(dgamma(x = x.p, log = TRUE,
                         shape = shapes,
                         scale = scales))

    } else {
      ll.p <- 0
    }

    # Set to TRUE when an integral comes back 0/NA, to skip further
    # (pointless) likelihood calculations at this theta
    skip.rest <- FALSE

    if (some.r) {

      # Likelihood for subjects with replicates:
      # L = int_X f(Xtilde|X) f(X|Y,C) dX
      # Shape and scale parameters to feed integral
      shapes <- sapply(oneyc.r, function(x) sum(exp(x %*% f.gammas)))
      scales <- ifelse(y.r == 1, f.b1, f.b0)

      int.vals <- c()
      for (ii in seq_along(xtilde.r)) {

        int.ii <- do.call(hcubature,
                          c(list(f = lf,
                                 vectorInterface = TRUE,
                                 lowerLimit = 0,
                                 upperLimit = 1,
                                 Ig = Ig.r[ii],
                                 k = k.r[ii],
                                 xtilde = xtilde.r[[ii]],
                                 shape = shapes[ii],
                                 scale = scales[ii],
                                 sigsq_p = f.sigsq_p,
                                 sigsq_m = f.sigsq_m),
                            hcubature_list))

        # If integral 0, locate the region with non-negligible density on a
        # fine grid and re-integrate over just that region
        if (is.na(int.ii$integral) || int.ii$integral == 0) {
          limits <- seq(1e-5, 1 - 1e-5, 1e-5)
          fs <- lf(x = limits,
                   Ig = Ig.r[ii],
                   k = k.r[ii],
                   xtilde = xtilde.r[[ii]],
                   shape = shapes[ii],
                   scale = scales[ii],
                   sigsq_p = f.sigsq_p,
                   sigsq_m = f.sigsq_m)
          limits <- limits[fs > 0]
          if (length(limits) > 0) {
            limits <- c(max(0, min(limits) - 1e-5), min(1, max(limits) + 1e-5))
            int.ii <- do.call(hcubature,
                              c(list(f = lf,
                                     vectorInterface = TRUE,
                                     lowerLimit = limits[1],
                                     upperLimit = limits[2],
                                     Ig = Ig.r[ii],
                                     k = k.r[ii],
                                     xtilde = xtilde.r[[ii]],
                                     shape = shapes[ii],
                                     scale = scales[ii],
                                     sigsq_p = f.sigsq_p,
                                     sigsq_m = f.sigsq_m),
                                hcubature_list))
          }
        }

        int.vals[ii] <- int.ii$integral

        # If integral still 0, skip further LL calculations at this theta
        if (is.na(int.ii$integral) || int.ii$integral == 0) {
          print(paste0("Integral is ", int.ii$integral, " for ii = ", ii))
          print(f.theta)
          skip.rest <- TRUE
          break
        }

      }
      ll.r <- sum(log(int.vals))

    } else {
      ll.r <- 0
    }

    if (some.i && ! skip.rest) {

      # Likelihood for pools with single Xtilde:
      # L = int_X f(Xtilde|X) f(X|Y,C_1,...,C_g) dX
      # Shape and scale parameters to feed to integral
      shapes <- sapply(oneyc.i, function(x) sum(exp(x %*% f.gammas)))
      scales <- ifelse(y.i == 1, f.b1, f.b0)

      int.vals <- c()
      for (ii in seq_along(xtilde.i)) {

        int.ii <- do.call(hcubature,
                          c(list(f = lf,
                                 vectorInterface = TRUE,
                                 lowerLimit = 0,
                                 upperLimit = 1,
                                 Ig = Ig.i[ii],
                                 k = 1,
                                 xtilde = xtilde.i[ii],
                                 shape = shapes[ii],
                                 scale = scales[ii],
                                 sigsq_p = f.sigsq_p,
                                 sigsq_m = f.sigsq_m),
                            hcubature_list))

        # If integral 0, locate the region with non-negligible density.
        # Note: Ig must be passed here (lf has no default for it); it was
        # previously omitted, making this fallback error out.
        if (is.na(int.ii$integral) || int.ii$integral == 0) {
          limits <- seq(1e-5, 1 - 1e-5, 1e-5)
          fs <- lf(x = limits,
                   Ig = Ig.i[ii],
                   k = 1,
                   xtilde = xtilde.i[ii],
                   shape = shapes[ii],
                   scale = scales[ii],
                   sigsq_p = f.sigsq_p,
                   sigsq_m = f.sigsq_m)
          limits <- limits[fs > 0]
          if (length(limits) > 0) {
            limits <- c(max(0, min(limits) - 1e-5), min(1, max(limits) + 1e-5))
            int.ii <- do.call(hcubature,
                              c(list(f = lf,
                                     vectorInterface = TRUE,
                                     lowerLimit = limits[1],
                                     upperLimit = limits[2],
                                     Ig = Ig.i[ii],
                                     k = 1,
                                     xtilde = xtilde.i[ii],
                                     shape = shapes[ii],
                                     scale = scales[ii],
                                     sigsq_p = f.sigsq_p,
                                     sigsq_m = f.sigsq_m),
                                hcubature_list))
          }
        }

        int.vals[ii] <- int.ii$integral

        # If integral still 0, skip further LL calculations at this theta
        if (is.na(int.ii$integral) || int.ii$integral == 0) {
          print(paste0("Integral is ", int.ii$integral, " for ii = ", ii))
          print(f.theta)
          skip.rest <- TRUE
          break
        }

      }
      ll.i <- sum(log(int.vals))

    } else {
      ll.i <- 0
    }

    # Return negative log-likelihood
    ll <- ll.p + ll.r + ll.i
    return(-ll)

  }

  # Starting values (non-variance terms, then b's and any variance terms)
  if (is.null(nlminb_list$start)) {
    if (errors == "neither") {
      nlminb_list$start <- c(rep(start_nonvar_var[1], n.gammas),
                             rep(start_nonvar_var[2], 2))
    } else if (errors %in% c("measurement", "processing")) {
      nlminb_list$start <- c(rep(start_nonvar_var[1], n.gammas),
                             rep(start_nonvar_var[2], 3))
    } else if (errors == "both") {
      nlminb_list$start <- c(rep(start_nonvar_var[1], n.gammas),
                             rep(start_nonvar_var[2], 4))
    }
  }
  names(nlminb_list$start) <- theta.labels

  # Lower bounds
  if (is.null(nlminb_list$lower)) {
    if (errors == "neither") {
      nlminb_list$lower <- c(rep(lower_nonvar_var[1], n.gammas),
                             rep(lower_nonvar_var[2], 2))
    } else if (errors %in% c("measurement", "processing")) {
      nlminb_list$lower <- c(rep(lower_nonvar_var[1], n.gammas),
                             rep(lower_nonvar_var[2], 3))
    } else if (errors == "both") {
      nlminb_list$lower <- c(rep(lower_nonvar_var[1], n.gammas),
                             rep(lower_nonvar_var[2], 4))
    }
  }

  # Upper bounds
  if (is.null(nlminb_list$upper)) {
    if (errors == "neither") {
      nlminb_list$upper <- c(rep(upper_nonvar_var[1], n.gammas),
                             rep(upper_nonvar_var[2], 2))
    } else if (errors %in% c("measurement", "processing")) {
      nlminb_list$upper <- c(rep(upper_nonvar_var[1], n.gammas),
                             rep(upper_nonvar_var[2], 3))
    } else if (errors == "both") {
      nlminb_list$upper <- c(rep(upper_nonvar_var[1], n.gammas),
                             rep(upper_nonvar_var[2], 4))
    }
  }

  if (is.null(nlminb_object)) {

    # Obtain ML estimates
    ml.max <- do.call(nlminb, c(list(objective = llf), nlminb_list))

    # If non-convergence, try jittered starting values if requested, keeping
    # whichever fit achieved the smaller objective
    if (ml.max$convergence == 1) {
      if (! is.null(jitter_start)) {
        message("Trying jittered starting values...")
        nlminb_list$start <- nlminb_list$start +
          rnorm(n = length(nlminb_list$start), sd = jitter_start)
        ml.max2 <- do.call(nlminb, c(list(objective = llf), nlminb_list))
        if (ml.max2$objective < ml.max$objective) ml.max <- ml.max2
      }
      if (ml.max$convergence == 1) {
        message("Object returned by 'nlminb' function indicates non-convergence. You may want to try different starting values.")
      }
    }

  } else {
    ml.max <- nlminb_object
  }
  ml.estimates <- ml.max$par

  # Obtain variance estimates
  if (estimate_var) {

    # Estimate Hessian of the negative log-likelihood at the MLE
    hessian.mat <- do.call(numDeriv::hessian,
                           c(list(func = llf, x = ml.estimates),
                             hessian_list))

    # Estimate variance-covariance matrix as the inverse Hessian
    theta.variance <- try(solve(hessian.mat), silent = TRUE)
    if (inherits(theta.variance, "try-error") || anyNA(hessian.mat)) {

      print(hessian.mat)
      message("The estimated Hessian matrix (printed here) is singular, so variance-covariance matrix could not be obtained. You could try tweaking 'start_nonvar_var' or 'hessian_list' (e.g. increase 'r')")
      theta.variance <- NULL

    } else {

      colnames(theta.variance) <- rownames(theta.variance) <- theta.labels
      if (sum(diag(theta.variance) <= 0) > 0) {
        print(theta.variance)
        message("The estimated variance-covariance matrix (printed here) has some non-positive diagonal elements, so it may not be reliable. You could try tweaking 'start_nonvar_var' or 'hessian_list' (e.g. increase 'r')")
      }

    }

  } else {
    theta.variance <- NULL
  }

  # Create vector of estimates to return
  estimates <- ml.estimates
  names(estimates) <- theta.labels

  # Create list to return
  ret.list <- list(estimates = estimates,
                   theta.var = theta.variance,
                   nlminb.object = ml.max,
                   aic = 2 * (length(ml.estimates) + ml.max$objective))
  return(ret.list)

}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.