Nothing
#' Censored Likelihood Multiple Imputation
#'
#' This function performs censored likelihood multiple imputation for
#' single-pollutant models where the pollutant of interest is subject to
#' varying detection limits across batches (this function will also work if
#' there is only one distinct detection limit). The function
#' outputs a list containing the imputed datasets and details regarding the
#' imputation procedure (i.e., number of imputed dataset, covariates used to
#' impute the non-detects, etc).
#'
#' \code{clmi} is somewhat picky regarding the \code{formula} parameter. It
#' tries to infer what transformation you'd like to apply to the exposure you
#' are imputing, what the exposure is, and what the outcome is. It attempts to
#' check to make sure that everything is working correctly, but it can fail.
#' Roughly, the rules are:
#' \itemize{
#' \item The left hand side of formula should be the exposure you are trying
#' to impute.
#' \item The exposure may be optionally wrapped in a univariate transformation
#' function. If the transformation function is not univariate, you ought to
#' get an error about a "complicated" transformation.
#' \item The first variable on the right hand side of \code{formula} should be
#' your outcome of interest.
#'}
#' @param formula A formula in the form of \code{exposure ~ outcome + covariates}.
#' That is, the first variable on the right hand side of \code{formula} should
#' be the outcome of interest.
#' @param df A data.frame with \code{exposure}, \code{outcome} and
#' \code{covariates}.
#' @param lod Name of limit of detection variable in \code{df}.
#' @param n.imps Number of datasets to impute. Default is 5.
#' @param seed For reproducability.
#' @param verbose If \code{TRUE}, \code{clmi} prints out useful debugging
#' information while running. Default is \code{FALSE}.
#' @note
#' \itemize{
#' \item \code{clmi} only supports categorical variables that are numeric,
#' (i.e., not factors or characters). You can use the \code{model.matrix}
#' function to convert a data frame with factors to a numeric design matrix
#' and subsequently convert that matrix back into a data frame using
#' \code{as.data.frame}.
#' \item If you get the error message "L-BFGS-B needs finite values of 'fn'",
#' try normalising your data.
#' }
#' @examples
#' library(lodi)
#'
#' # Note that the outcome of interest is the first variable on the right hand
#' # side of the formula.
#' clmi.out <- clmi(poll ~ case_cntrl + smoking + gender, toy_data, lod, 1)
#'
#' # you can specify a transformation to the exposure in the formula
#' clmi.out <- clmi(log(poll) ~ case_cntrl + smoking + gender, toy_data, lod, 1)
#'
#' @references
#' Boss J, Mukherjee B, Ferguson KK, et al. Estimating outcome-exposure
#' associations when exposure biomarker detection limits vary across batches.
#' Epidemiology. 2019;30(5):746-755.
#' \href{https://doi.org/10.1097/EDE.0000000000001052}{10.1097/EDE.0000000000001052}
#' @export
clmi <- function(formula, df, lod, seed, n.imps = 5, verbose = FALSE)
{
if (!rlang::is_formula(formula))
stop("formula must be a formula")
if (!is.data.frame(df))
stop("df must be a data.frame.")
if (!is.logical(verbose))
stop("'verbose' must take on a logical value.")
if (verbose)
print(paste("Formula:", rlang::expr_text(formula)))
transform.init <- rlang::f_lhs(formula)
exposure <- all.vars(transform.init)
if (length(exposure) > 1)
stop("Complicated transformation on exposure. See help for fix.")
if (verbose)
print(sprintf("Exposure variable: %s", exposure))
vars <- all.vars(rlang::f_rhs(formula))
outcome <- vars[1]
if (verbose)
print(sprintf("Outcome variable: %s", outcome))
# Calculate the transformation function
assign(substitute(exposure), quote(x))
transform <- eval(rlang::expr(substitute(!!transform.init)))
trans <- function(x) x
rlang::fn_body(trans) <- transform
rlang::fn_env(trans) <- rlang::caller_env()
if (verbose)
print(sprintf("Transformation function: %s",
gsub("\n", "", rlang::expr_text(trans))
))
lod <- deparse(substitute(lod))
if (verbose)
print(sprintf("LOD variable: %s", lod))
if (is.null(df[[lod]]))
stop(sprintf("%s not in df", lod))
if (!is.numeric(df[[lod]]))
stop(sprintf("%s must be numeric."))
if (!is.numeric(seed) || length(seed) > 1)
stop("seed must be a number.")
set.seed(seed)
if (!is.numeric(n.imps) || length(n.imps) > 1)
stop("n.imps must be an integer.")
if (n.imps < 1)
stop("n.imps must be >= 1")
vars <- c(exposure, vars)
if (any(sapply(vars, function(x) !is.numeric(df[[x]]))))
stop("clmi only supports floating point / integer variables.")
if (any(sapply(vars[-1], function(x) any(is.na(df[[x]])))))
stop("The covariates on the rhs of formula cannot contain missing values.")
if (any(stats::na.omit(df[[exposure]] < df[[lod]])))
stop(paste(exposure, "contains values below", lod,
"that are not coded as NA."))
# columns to add back at the end
leftovers <- df[, setdiff(names(df), c(vars, lod))]
# will be used to ensure the imputed column ordering is the same
df.names <- names(df)
df.rownames <- rownames(df)
df <- df[, c(vars, lod)]
t.imp.exp <- paste0(exposure, "_transform", "_imputed")
df[[t.imp.exp]] <- df[[exposure]]
obs.above <- !is.na(df[[t.imp.exp]])
# Subjects with concentration above LOD
above <- df[obs.above,]
# Subjects with concentration below LOD for each batch
below <- df[!obs.above,]
above.matrix <- as.matrix(above[, vars[-(1:2)]])
below.matrix <- as.matrix(below[, vars[-(1:2)]])
# Perform Multiple Imputation
imp <- rep(list(below), n.imps)
for (j in 1:n.imps) {
#Bootstrap data
df.bs <- df[sample(nrow(df), nrow(df), T),]
above.bs <- df.bs[!is.na(df.bs[[t.imp.exp]]),]
above.bs[[t.imp.exp]] <- trans(above.bs[[t.imp.exp]])
below.bs <- df.bs[is.na(df.bs[[t.imp.exp]]),]
below.bs[[lod]] <- trans(below.bs[[lod]])
above.matrix.bs <- as.matrix(above.bs[, vars[-(1:2)]])
below.matrix.bs <- as.matrix(below.bs[, vars[-(1:2)]])
# calculates the means of (1, exposure, covariates) given beta
mu <- function(beta, outcome, covars)
{
beta[2] + beta[3] * outcome + covars %*% beta[-(1:3)]
}
# objective function for mle is smooth and convex. So, it has a unique
# global minimum. `sigma^2` is parameterized as beta[1] = log(sigma^2),
# to add a non negativity constraint.
objective <- function(beta)
{
mu.b <- mu(beta, below.bs[[outcome]], below.matrix.bs)
mu.a <- mu(beta, above.bs[[outcome]], above.matrix.bs)
-sum(log(stats::pnorm(below.bs[[lod]], mu.b, sqrt(exp(beta[1]))))) +
sum((above.bs[[t.imp.exp]] - mu.a)^2) / (2 * exp(beta[1])) +
nrow(above.bs) * 0.5 * log(2 * pi * exp(beta[1]))
}
# get MLE for Bootstrapped sample
beta <- c(0, 0, 0, rep(0, length(vars[-(1:2)])))
mle <- stats::optim(beta, objective, method = "BFGS")
# Impute missing values
mus <- mu(mle$par, below[[outcome]], below.matrix)
sigma <- sqrt(exp(mle$par[1]))
normalize <- function(x, mu, sd) (x - mu) / sd
inv.normalize <- function(x, mu, sd) x * sd + mu
probs <- stats::pnorm(normalize(trans(below[[lod]]), mus, sigma))
zs <- stats::qnorm(stats::runif(nrow(below)) * probs)
vals <- inv.normalize(zs, mus, sigma)
imp[[j]][[t.imp.exp]] <- as.vector(vals)
}
# Check MLE estimation for original dataset
mu <- function(beta, outcome, covars)
{
beta[2] + beta[3] * outcome + covars %*% beta[-(1:3)]
}
objective <- function(beta)
{
mu.b <- mu(beta, below[[outcome]], below.matrix)
mu.a <- mu(beta, above[[outcome]], above.matrix)
-sum(log(stats::pnorm(trans(below[[lod]]), mu.b, sqrt(exp(beta[1]))))) +
sum((trans(above[[t.imp.exp]]) - mu.a)^2) / (2 * exp(beta[1])) +
nrow(above) * 0.5 * log(2 * pi * exp(beta[1]))
}
mle <- stats::optim(beta, objective, method = "BFGS", hessian = T)
param <- mle$par
param[1] <- exp(param[1])
names(param) <- c("variance", "intercept", outcome, vars[-(1:2)])
# due to the reparameterization of sigma^2, we need to multiply the fisher
# information matrix by the jacobian.
jacobian <- diag(c(param[1], 1, 1, rep(1, length(vars[-(1:2)]))))
fisher.inf <- jacobian %*% solve(mle$hessian) %*% jacobian
prop.sigma <- sqrt(diag(fisher.inf))
#Aggregate imputed datasets with observed dataset
imputed.dfs <- imp
above[[t.imp.exp]] <- trans(above[[t.imp.exp]])
# add imputed values, then add back the rest of the columns to to imputed
# dataframe and make sure it is in the same order as the original
imputed.dfs <- lapply(imputed.dfs, function(df)
{
df <- rbind(df, above)
# reorder the rows so they match the original
df <- df[df.rownames, ]
df <- cbind(df, leftovers)
# reorder the columns so they match the original
df[, c(df.names, t.imp.exp)]
})
structure(
list(formula = formula, nimp = n.imps, imputed.dfs = imputed.dfs,
t.function = trans, par.mle = param, fisher.inf = fisher.inf),
class = "clmi.out")
}
#' Calculate pooled estimates from \code{clmi.out} objects using Rubin's rules
#' @param formula Formula to fit. Exposure variable should end in
#' \code{_transform_imputed}.
#' @param clmi.out An object generated by clmi.
#' @param type Type of regression to pool. Valid types are
#' logistic and linear.
#' @examples
#' # continue example from clmi
#' # fit model on imputed data and pool results
#' library(lodi)
#' data("toy_data")
#' clmi.out <- clmi(log(poll) ~ case_cntrl + smoking + gender, toy_data, lod, 1)
#' results <- pool.clmi(case_cntrl ~ poll_transform_imputed + smoking, clmi.out,
#' logistic)
#'
#' results$output
#' @export
pool.clmi <- function(formula, clmi.out, type)
{
if (class(clmi.out) != "clmi.out")
stop("clmi.out is not an clmi.out object.")
type <- deparse(substitute(type))
type <- match.arg(type, c("linear","logistic"))
# Get names used in clmi() function
imputed.dfs <- clmi.out$imputed.dfs
n.imps <- clmi.out$nimp
nobs <- nrow(imputed.dfs[[1]])
# Get number of coefficients to store
# Get estimates and standard errors for each imputed dataset
betas <- c()
varcov <- vector("list", n.imps)
regressions <- vector("list", n.imps)
if (type == "logistic") {
for(i in 1:n.imps) {
data.fit <- imputed.dfs[[i]]
logreg <- stats::glm(formula, family = "binomial", data = data.fit)
betas <- rbind(betas, stats::coefficients(logreg))
varcov[[i]] <- summary(logreg)$cov.unscaled
regressions[[i]] <- logreg
}
} else {
for (i in 1:n.imps) {
data.fit <- imputed.dfs[[i]]
linreg <- stats::lm(formula, data = data.fit)
betas <- rbind(betas, stats::coefficients(linreg))
varcov[[i]] <- stats::vcov(linreg)
regressions[[i]] <- linreg
}
}
# Pooled Inference for Multiply Imputed Datasets
qbar <- apply(betas, 2, mean)
ubar <- Reduce('+', varcov) / n.imps
# Each column of this matrix is Qi - Qbar
q_diff <- apply(betas, 1, function(x) x - qbar)
ui <- matrix(0, ncol(betas), ncol(betas))
for (m in 1:ncol(q_diff))
ui <- ui + as.matrix(q_diff[,m]) %*% t(as.matrix(q_diff[,m]))
b <- ui / (n.imps-1)
t <- ubar + (1 + (1 / n.imps)) * b
gamma <- ((1 + (1 / n.imps)) * diag(b)) / diag(t)
r <- ((1 + (1 / n.imps)) * diag(b)) / diag(ubar)
v <- (n.imps - 1) * (1 + (1 / r))^2
v0 <- nobs - ncol(betas)
denom_adj_df <- ((1 - gamma) * v0 * (v0 + 1)) / (v0 + 3)
vstar <- ((1 / v) + (1 / denom_adj_df))^(-1)
#Get Confidence Intervals
LCL <- qbar - stats::qt(0.975, vstar) * sqrt(diag(t))
UCL <- qbar + stats::qt(0.975, vstar) * sqrt(diag(t))
#Get p-values
p_vals <- 2 * stats::pt(abs(qbar / sqrt(diag(t))), df = vstar,
lower.tail = F)
#Summary of each regression models on each imputed datasets
regression.summaries <- lapply(regressions, summary)
list(output = data.frame(est = qbar, se = sqrt(diag(t)), df = vstar,
p.values = p_vals, LCL.95 = LCL, UCL.95 = UCL),
pooled.vcov = t, regressions = regressions,
regression.summaries = regression.summaries
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.