sieveSurv: Nonparametric Survival Estimator with B-spline Sieves

Documented in sieveSurv

#' Nonparametrically estimate the survival function using B-spline sieves
#'
#' @param time Column name for follow-up time
#' @param event Column name for event indicators
#' @param covar (Optional) column name(s) for additional fully-observed covariates. Default is \code{covar=NULL}, which estimates unconditional survival.
#' @param bspline Column names for B-spline basis.
#' @param data Dataframe or matrix containing (at least) named columns \code{time}, \code{event}, \code{covar}, and \code{bspline}.
#' @param logTransform If \code{FALSE}, \code{time} is log transformed before density estimation. Default is \code{TRUE}.
#' @param tol Tolerance to define convergence. Default is \code{tol=1E-4}.
#' @param max_iter Maximum number of iterations allowed for the EM algorithm. Default is \code{max_iter = 1000}.
#' @param assume_last Assume last observed \code{time} is an event (for use when the integration of the survival function is desired). Default is \code{FALSE}.
#'
#' @return A list with the following five elements:
#' \item{surv_uncensor}{subset of uncensored rows of \code{data} with added column \code{surv} with survival estimate at their event times}
#' \item{coeff}{a matrix of the B-spline coefficients at convergence}
#' \item{od_loglik}{value of the observed-data log-likelihood for the B-spline coefficients at convergence}
#' \item{conv}{indicator of convergence for the EM algorithn}
#' \item{conv_msg}{description of nonconvergence if \code{conv = FALSE}}
#'
#' @export
#'
sieveSurv <- function(time, event, covar = NULL, bspline, data, logTransform = FALSE,
                      tol = 1E-4, max_iter = 1000, assume_last = FALSE) {
  # Create censored variable
  data$x <- data[, time]
  ## Make NA for censored subjects
  data$x[data[, event] == 0] <- NA

  # Assume largest or smallest observed t is an event regardless
  ## For stability of the integral estimate
  if (assume_last) {
    # Assume largest observed t is an event regardless
    max_t <- max(data[, time])
    max_t_at <- which.max(data[, time])
    data[max_t_at, c("x", event)] <- c(max_t, 1)
  }

  # If logTransform = TRUE, replace x with log(x)
  if (logTransform) {
    if (any(data$x < 0, na.rm = TRUE)) {
      return(warning("Supplied time variable is not nonnegative. Cannot allow option logTransform = TRUE."))
    }
    data$x <- log(data$x)
  }

  # Subset data to valid times
  ## Remove censored subjects with T > max(X)
  data <- data[data[, time] <= max(data[data[, event] == 1, time]), ]
  ### Also remove uncensored subjects with X < min(T_cens)
  #data <- data[data[, event] == 0 | data$x >= min(data[data[, event] == 0, time]), ]

  # Order by x
  ## This leads to the uncensored subjects coming first
  data <- data[order(data$x), ]
  dist_x <- unique(data[data[, event] == 1, "x"])
  m <- length(dist_x)
  merge_k <- data.frame(k = 1:m, x = dist_x)
  data <- merge(data, merge_k, all.x = TRUE)

  # Create indicators of being uncensored
  uncens <- data[, event] == 1

  # Create ID variable for grouping/summing
  n <- nrow(data)
  data$id <- 1:n

  # Save B-spline basis separately
  B <- data.matrix(data[, bspline])

  # Initialize B-spline coefficients, p_{kj}
  p_val_num <- rowsum(x = B[uncens, ], group = data$k[uncens], reorder = TRUE)
  if (any(colSums(p_val_num) == 0)) {
    return(list(surv = NA,
                od_loglik = NA,
                coeff = NA,
                conv = FALSE,
                conv_msg = "empty B-spline"))
  }
  prev_p <- t(t(p_val_num) / colSums(p_val_num))

  # Create complete dataset
  ## Censored subjects
  cdat <- data[!uncens, c(time, event, bspline, "id")]
  cdat <- cbind(cdat[rep(1:nrow(cdat), times = m), ], x = rep(dist_x, each = nrow(cdat)),
                k = rep(1:m, each = nrow(cdat)), row.names = NULL)
  ## Uncensored subjects
  cdat <- rbind(data[uncens, c(time, event, bspline, "id", "x", "k")], cdat, row.names = NULL)
  ### Subset to possible values of x (i.e., x > T)
  if (logTransform) {
    cdat <- cdat[cdat[, "x"] >= log(cdat[, time]), ]
  } else {
    cdat <- cdat[cdat[, "x"] >= cdat[, time], ]
  }

  # Prepare for EM algorithm
  it <- 1
  conv <- FALSE

  # EM algorithm
  while(!conv & it < max_iter) {
    # E step
    pXU_num <- data.matrix(prev_p[cdat$k, ] * B[cdat$id, ])
    pXU_denom <- rowSums(rowsum(x = pXU_num, group = cdat$id, reorder = TRUE))
    pXU_denom[pXU_denom == 0] <- 1
    pXU <- pXU_num / pXU_denom[cdat$id]

    # M step
    add_to_p <- rowsum(x = pXU[cdat[, event] == 0, ], group = cdat$k[cdat[, event] == 0], reorder = TRUE)
    ## Check for values of xk < min(Ti among uncensored)
    pad_rows <- setdiff(x = 1:m, y = unique(cdat$k[cdat[, event] == 0]))
    ## For these rows, add an empty tow to add_to_p
    empty_rows <- matrix(data = 0, nrow = length(pad_rows), ncol = ncol(add_to_p))
    ## Update b-spline coefficients with contribution from censored subjects
    new_p_num <- p_val_num + rbind(empty_rows, add_to_p)
    new_p <- t(t(new_p_num) / colSums(new_p_num))
    # Check for convergence
    if (!any(abs(new_p - prev_p) > tol)) {
      conv <- TRUE
    } else {
      it <- it + 1
      prev_p <- new_p
    }
  }

  if (conv) {
    # Calculate observed-data log-likelihood of new_p
    od_ll <- NA #loglik_p(bspline_coeff = new_p, time = time, event = event, bspline = bspline, data = data, try_event_times = NULL)
    # Predict survival for all observed event times w/ corresponding covar
    ## Based on SMLE at convergence
    bspline_coeff <- cbind(dist_x, new_p)
    # \hat{p}_{kj}B_{j}^{q}(Z_i)
    ## consider all xk, all Zi
    pBspline <- bspline_coeff[rep(x = 1:nrow(bspline_coeff), times = nrow(data)), -1] *
      data[rep(x = 1:nrow(data), each = nrow(bspline_coeff)), bspline]
    sum_pBspline <- rowSums(pBspline) # These estimate P(X=xk|Z)
    allZ <- data[, covar]
    allX <- bspline_coeff[, 1]
    ## If x was log-transformed, these estimates need to be back-transformed
    ### to the original scale of x
    if (logTransform) {
      #sum_pBspline <- 1 / exp(bspline_coeff[rep(x = 1:nrow(bspline_coeff), times = nrow(data)), 1]) * sum_pBspline
      #allX <- exp(bspline_coeff[, 1])
      df_Sx <- sieveSurv.predict(newdata = data.frame(t = log(data[uncens, time]), z = data[uncens, covar]),
                                 time = "t", covar = "z", all_covar = allZ, all_event_times = allX,
                                 sum_pBspline = sum_pBspline)
    } else {
      df_Sx <- sieveSurv.predict(newdata = data.frame(t = data[uncens, time], z = data[uncens, covar]),
                                 time = "t", covar = "z", all_covar = allZ, all_event_times = allX,
                                 sum_pBspline = sum_pBspline)
    }
    return(list(surv_uncensor = df_Sx,
                coeff = bspline_coeff,
                od_loglik = od_ll,
                conv = conv,
                conv_msg = ""))
  } else {
    df_Sx <- cbind(data[uncens, c(time, event, covar, bspline)], Sx = NA)
    if (it >= max_iter) {
      return(list(surv = df_Sx,
                  coeff = cbind(x = dist_x, p = NA),
                  od_loglik = NA,
                  conv = conv,
                  conv_msg = "max_iter reached"))
    }
    return(list(surv = df_Sx,
                od_loglik = NA,
                coeff = cbind(x = dist_x, p = NA),
                conv = conv,
                conv_msg = "unknown"))
  }
}