#' Nonparametrically estimate the survival function using B-spline sieves
#'
#' @param time Column name for follow-up time
#' @param event Column name for event indicators
#' @param covar (Optional) column name(s) for additional fully-observed covariates. Default is \code{covar=NULL}, which estimates unconditional survival.
#' @param bspline Column names for B-spline basis.
#' @param data Dataframe or matrix containing (at least) named columns \code{time}, \code{event}, \code{covar}, and \code{bspline}.
#' @param logTransform If \code{FALSE}, \code{time} is log transformed before density estimation. Default is \code{TRUE}.
#' @param tol Tolerance to define convergence. Default is \code{tol=1E-4}.
#' @param max_iter Maximum number of iterations allowed for the EM algorithm. Default is \code{max_iter = 1000}.
#' @param assume_last Assume last observed \code{time} is an event (for use when the integration of the survival function is desired). Default is \code{FALSE}.
#'
#' @return A list with the following five elements:
#' \item{surv_uncensor}{subset of uncensored rows of \code{data} with added column \code{surv} with survival estimate at their event times}
#' \item{coeff}{a matrix of the B-spline coefficients at convergence}
#' \item{od_loglik}{value of the observed-data log-likelihood for the B-spline coefficients at convergence}
#' \item{conv}{indicator of convergence for the EM algorithn}
#' \item{conv_msg}{description of nonconvergence if \code{conv = FALSE}}
#'
#' @export
#'
sieveSurv <- function(time, event, covar = NULL, bspline, data, logTransform = FALSE,
tol = 1E-4, max_iter = 1000, assume_last = FALSE) {
# Create censored variable
data$x <- data[, time]
## Make NA for censored subjects
data$x[data[, event] == 0] <- NA
# Assume largest or smallest observed t is an event regardless
## For stability of the integral estimate
if (assume_last) {
# Assume largest observed t is an event regardless
max_t <- max(data[, time])
max_t_at <- which.max(data[, time])
data[max_t_at, c("x", event)] <- c(max_t, 1)
}
# If logTransform = TRUE, replace x with log(x)
if (logTransform) {
if (any(data$x < 0, na.rm = TRUE)) {
return(warning("Supplied time variable is not nonnegative. Cannot allow option logTransform = TRUE."))
}
data$x <- log(data$x)
}
# Subset data to valid times
## Remove censored subjects with T > max(X)
data <- data[data[, time] <= max(data[data[, event] == 1, time]), ]
### Also remove uncensored subjects with X < min(T_cens)
#data <- data[data[, event] == 0 | data$x >= min(data[data[, event] == 0, time]), ]
# Order by x
## This leads to the uncensored subjects coming first
data <- data[order(data$x), ]
dist_x <- unique(data[data[, event] == 1, "x"])
m <- length(dist_x)
merge_k <- data.frame(k = 1:m, x = dist_x)
data <- merge(data, merge_k, all.x = TRUE)
# Create indicators of being uncensored
uncens <- data[, event] == 1
# Create ID variable for grouping/summing
n <- nrow(data)
data$id <- 1:n
# Save B-spline basis separately
B <- data.matrix(data[, bspline])
# Initialize B-spline coefficients, p_{kj}
p_val_num <- rowsum(x = B[uncens, ], group = data$k[uncens], reorder = TRUE)
if (any(colSums(p_val_num) == 0)) {
return(list(surv = NA,
od_loglik = NA,
coeff = NA,
conv = FALSE,
conv_msg = "empty B-spline"))
}
prev_p <- t(t(p_val_num) / colSums(p_val_num))
# Create complete dataset
## Censored subjects
cdat <- data[!uncens, c(time, event, bspline, "id")]
cdat <- cbind(cdat[rep(1:nrow(cdat), times = m), ], x = rep(dist_x, each = nrow(cdat)),
k = rep(1:m, each = nrow(cdat)), row.names = NULL)
## Uncensored subjects
cdat <- rbind(data[uncens, c(time, event, bspline, "id", "x", "k")], cdat, row.names = NULL)
### Subset to possible values of x (i.e., x > T)
if (logTransform) {
cdat <- cdat[cdat[, "x"] >= log(cdat[, time]), ]
} else {
cdat <- cdat[cdat[, "x"] >= cdat[, time], ]
}
# Prepare for EM algorithm
it <- 1
conv <- FALSE
# EM algorithm
while(!conv & it < max_iter) {
# E step
pXU_num <- data.matrix(prev_p[cdat$k, ] * B[cdat$id, ])
pXU_denom <- rowSums(rowsum(x = pXU_num, group = cdat$id, reorder = TRUE))
pXU_denom[pXU_denom == 0] <- 1
pXU <- pXU_num / pXU_denom[cdat$id]
# M step
add_to_p <- rowsum(x = pXU[cdat[, event] == 0, ], group = cdat$k[cdat[, event] == 0], reorder = TRUE)
## Check for values of xk < min(Ti among uncensored)
pad_rows <- setdiff(x = 1:m, y = unique(cdat$k[cdat[, event] == 0]))
## For these rows, add an empty tow to add_to_p
empty_rows <- matrix(data = 0, nrow = length(pad_rows), ncol = ncol(add_to_p))
## Update b-spline coefficients with contribution from censored subjects
new_p_num <- p_val_num + rbind(empty_rows, add_to_p)
new_p <- t(t(new_p_num) / colSums(new_p_num))
# Check for convergence
if (!any(abs(new_p - prev_p) > tol)) {
conv <- TRUE
} else {
it <- it + 1
prev_p <- new_p
}
}
if (conv) {
# Calculate observed-data log-likelihood of new_p
od_ll <- NA #loglik_p(bspline_coeff = new_p, time = time, event = event, bspline = bspline, data = data, try_event_times = NULL)
# Predict survival for all observed event times w/ corresponding covar
## Based on SMLE at convergence
bspline_coeff <- cbind(dist_x, new_p)
# \hat{p}_{kj}B_{j}^{q}(Z_i)
## consider all xk, all Zi
pBspline <- bspline_coeff[rep(x = 1:nrow(bspline_coeff), times = nrow(data)), -1] *
data[rep(x = 1:nrow(data), each = nrow(bspline_coeff)), bspline]
sum_pBspline <- rowSums(pBspline) # These estimate P(X=xk|Z)
allZ <- data[, covar]
allX <- bspline_coeff[, 1]
## If x was log-transformed, these estimates need to be back-transformed
### to the original scale of x
if (logTransform) {
#sum_pBspline <- 1 / exp(bspline_coeff[rep(x = 1:nrow(bspline_coeff), times = nrow(data)), 1]) * sum_pBspline
#allX <- exp(bspline_coeff[, 1])
df_Sx <- sieveSurv.predict(newdata = data.frame(t = log(data[uncens, time]), z = data[uncens, covar]),
time = "t", covar = "z", all_covar = allZ, all_event_times = allX,
sum_pBspline = sum_pBspline)
} else {
df_Sx <- sieveSurv.predict(newdata = data.frame(t = data[uncens, time], z = data[uncens, covar]),
time = "t", covar = "z", all_covar = allZ, all_event_times = allX,
sum_pBspline = sum_pBspline)
}
return(list(surv_uncensor = df_Sx,
coeff = bspline_coeff,
od_loglik = od_ll,
conv = conv,
conv_msg = ""))
} else {
df_Sx <- cbind(data[uncens, c(time, event, covar, bspline)], Sx = NA)
if (it >= max_iter) {
return(list(surv = df_Sx,
coeff = cbind(x = dist_x, p = NA),
od_loglik = NA,
conv = conv,
conv_msg = "max_iter reached"))
}
return(list(surv = df_Sx,
od_loglik = NA,
coeff = cbind(x = dist_x, p = NA),
conv = conv,
conv_msg = "unknown"))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.