intsurv: Integrative Survival Modeling

Documented in cox_cure cox_cure.fit

##
## intsurv: Integrative Survival Models
## Copyright (C) 2017-2021  Wenjie Wang <wang@wwenjie.org>
##
## This file is part of the R package intsurv.
##
## The R package intsurv is free software: You can redistribute it and/or
## modify it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or any later
## version (at your option). See the GNU General Public License at
## <https://www.gnu.org/licenses/> for details.
##
## The R package intsurv is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
##


##' Cox Cure Rate Model
##'
##' For right-censored data, fit a regular Cox cure rate model (Kuk and Chen,
##' 1992; Sy and Taylor, 2000) via an EM algorithm.  For right-censored data
##' with uncertain event status, fit the Cox cure model proposed by Wang et
##' al. (2020).
##'
##' @param surv_formula A formula object starting with \code{~} for the model
##'     formula in survival model part.  For Cox model, no intercept term is
##'     included even if an intercept is specified or implied in the model
##'     formula.  A model formula with an intercept term only is not allowed.
##' @param cure_formula A formula object starting with \code{~} for the model
##'     formula in incidence model part.  For logistic model, an intercept term
##'     is included by default and can be excluded by adding \code{+ 0} or
##'     \code{- 1} to the model formula.
##' @param time A numeric vector for the observed survival times.
##' @param event A numeric vector for the event indicators.  \code{NA}'s are
##'     allowed and represent uncertain event status.
##' @param data An optional data frame, list, or environment that contains the
##'     covariates and response variables (\code{time} and \code{event}),
##'     included in the model. If they are not found in data, the variables are
##'     taken from the environment of the specified formula, usually the
##'     environment from which this function is called.
##' @param subset An optional logical vector specifying a subset of observations
##'     to be used in the fitting process.
##' @param contrasts An optional list, whose entries are values (numeric
##'     matrices or character strings naming functions) to be used as
##'     replacement values for the contrasts replacement function and whose
##'     names are the names of columns of data containing factors.  See
##'     \code{contrasts.arg} of \code{\link[stats]{model.matrix.default}} for
##'     details.
##' @param bootstrap An integer representing the number of bootstrap samples for
##'     estimating standard errors of the coefficient estimates.  The bootstrap
##'     procedure will not run with \code{bootstrap = 0} by default.  If
##'     \code{bootstrap > 0}, the specified number of bootstrap samples will be
##'     used in estimating standard errors.
##' @param firth A logical value indicating whether to use Firth's
##'     bias-reduction method (Firth, 1993) in the logistic model component.
##'     The default value is \code{FALSE} for fitting a regular logistic model.
##'     Notice that this argument is experimental and only available for regular
##'     Cox cure rate model currently.
##' @param surv_start,cure_start An optional numeric vector representing the
##'     starting values for the survival model component or the incidence model
##'     component.  If \code{surv_start = NULL} is specified, the starting
##'     values will be obtained from fitting a regular Cox to events only.
##'     Similarly, if \code{cure_start = NULL} is specified, the starting values
##'     will be obtained from fitting a regular logistic model to the
##'     non-missing event indicators.
##' @param surv_offset,cure_offset An optional numeric vector representing the
##'     offset term in the survival model compoent or the incidence model
##'     component.  The function will internally try to find values of the
##'     specified variable in the \code{data} first.  Alternatively, one or more
##'     \code{offset} terms can be specified in the formula (by
##'     \code{stats::offset()}).  If more than one offset terms are specified,
##'     their sum will be used.
##' @param em_max_iter A positive integer specifying the maximum iteration
##'     number of the EM algorithm.  The default value is \code{200}.
##' @param em_rel_tol A positive number specifying the tolerance that determines
##'     the convergence of the EM algorithm in terms of the convergence of the
##'     covariate coefficient estimates.  The tolerance is compared with the
##'     relative change between estimates from two consecutive iterations, which
##'     is measured by ratio of the L1-norm of their difference to the sum of
##'     their L1-norm.  The default value is \code{1e-5}.
##' @param surv_max_iter,cure_max_iter A positive integer specifying the maximum
##'     iteration number of the M-step routine related to the survival model
##'     component or the incidence model component.  The default value is
##'     \code{200}.
##' @param surv_rel_tol,cure_rel_tol A positive number specifying the tolerance
##'     that determines the convergence of the M-step related to the survival
##'     model component or the incidence model component in terms of the
##'     convergence of the covariate coefficient estimates.  The tolerance is
##'     compared with the relative change between estimates from two consecutive
##'     iterations, which is measured by ratio of the L1-norm of their
##'     difference to the sum of their L1-norm.  The default value is
##'     \code{1e-5}.
##' @param tail_completion A character string specifying the tail completion
##'     method for conditional survival function.  The available methods are
##'     \code{"zero"} for zero-tail completion after the largest event times (Sy
##'     and Taylor, 2000), \code{"exp"} for exponential-tail completion (Peng,
##'     2003), and \code{"zero-tau"} for zero-tail completion after a specified
##'     \code{tail_tau}.  The default method is the zero-tail completion
##'     proposed by Sy and Taylor (2000).
##' @param tail_tau A numeric number specifying the time of zero-tail
##'     completion.  It will be used only if \code{tail_completion =
##'     "zero-tau"}.  A reasonable choice must be a time point between the
##'     largest event time and the largest survival time.
##' @param pmin A numeric number specifying the minimum value of probabilities
##'     for sake of numerical stability.  The default value is \code{1e-5}.
##' @param early_stop A logical value specifying whether to stop the iteration
##'     once the negative log-likelihood unexpectedly increases, which may
##'     suggest convergence on likelihood, or indicate numerical issues or
##'     implementation bugs.  The default value is \code{TRUE}.
##' @param verbose A logical value.  If \code{TRUE}, a verbose information will
##'     be given along iterations for tracing the convergence.  The default
##'     value is \code{FALSE}.
##' @param ... Other arguments for future usage.  A warning will be thrown if
##'     any invalid argument is specified.
##'
##' @return
##'
##' \code{cox_cure} object for regular Cox cure rate model or
##' \code{cox_cure_uncer} object for Cox cure rate model with uncertain events.
##'
##' @references
##'
##' Firth, D. (1993). Bias reduction of maximum likelihood
##' estimates. \emph{Biometrika}, 80(1), 27--38.
##'
##' Kuk, A. Y. C., & Chen, C. (1992). A mixture model combining logistic
##' regression with proportional hazards regression. \emph{Biometrika}, 79(3),
##' 531--541.
##'
##' Peng, Y. (2003). Estimating baseline distribution in proportional hazards
##' cure models. \emph{Computational Statistics & Data Analysis}, 42(1-2),
##' 187--201.
##'
##' Sy, J. P., & Taylor, J. M. (2000). Estimation in a Cox proportional hazards
##' cure model. \emph{Biometrics}, 56(1), 227--236.
##'
##' Wang, W., Luo, C., Aseltine, R. H., Wang, F., Yan, J., & Chen, K. (2020).
##' Suicide Risk Modeling with Uncertain Diagnostic Records. \emph{arXiv
##' preprint arXiv:2009.02597}.
##'
##' @seealso
##'
##' \code{\link{cox_cure_net}} for regularized Cox cure rate model with
##' elastic-net penalty.
##'
##' @example inst/examples/cox_cure.R
##'
##' @export
cox_cure <- function(surv_formula,
                     cure_formula,
                     time, event,
                     data, subset,
                     contrasts = NULL,
                     bootstrap = 0,
                     firth = FALSE,
                     surv_start = NULL,
                     cure_start = NULL,
                     surv_offset = NULL,
                     cure_offset = NULL,
                     em_max_iter = 200,
                     em_rel_tol = 1e-5,
                     surv_max_iter = 30,
                     surv_rel_tol = 1e-5,
                     cure_max_iter = 30,
                     cure_rel_tol = 1e-5,
                     tail_completion = c("zero", "exp", "zero-tau"),
                     tail_tau = NULL,
                     pmin = 1e-5,
                     early_stop = TRUE,
                     verbose = FALSE,
                     ...)
{
    ## warning on `...`
    warn_dots(...)

    ## record function call
    call0 <- match.call()

    ## prepare to call function prep_cure_model
    this_call <- match.call(expand.dots = FALSE)
    ## time is also a function name.  rename to avoid potential issues.
    names(this_call)[which(names(this_call) == "time")] <- "obs_time"
    names(this_call)[which(names(this_call) == "event")] <- "obs_event"
    this_call$eval_env <- parent.frame()
    matched_call <- match(names(formals(prep_cure_model)),
                          names(this_call), nomatch = 0L)
    model_call <- this_call[c(1L, matched_call)]
    model_call[[1L]] <- quote(prep_cure_model)
    model_list <- eval(model_call)

    ## get design matrix and responses
    obs_time <- model_list$surv$obs_time
    obs_event <- model_list$surv$obs_event
    surv_x <- model_list$surv$xMat
    cure_x <- model_list$cure$xMat
    surv_offset <- model_list$surv$offset
    cure_offset <- model_list$surv$offset

    ## cox model does not have an intercept
    surv_is_intercept <- colnames(surv_x) == "(Intercept)"
    surv_has_intercept <- any(surv_is_intercept)
    if ((ncol(surv_x) - as.integer(surv_has_intercept)) == 0L) {
        stop("No covariate is specified in 'formula'.")
    }
    ## remove the possible intercept term
    if (surv_has_intercept) {
        surv_x <- surv_x[, which(! surv_is_intercept), drop = FALSE]
    }

    ## logistic model can have an intercept
    cure_is_intercept <- colnames(cure_x) == "(Intercept)"
    cure_has_intercept <- any(cure_is_intercept)
    cure_only_intercept <- all(cure_is_intercept)
    cure_standardize <- ! cure_only_intercept
    ## remove the possible intercept term
    if (cure_has_intercept) {
        cure_x <- cure_x[, which(! cure_is_intercept), drop = FALSE]
        cure_intercept <- TRUE
    } else {
        cure_intercept <- FALSE
    }

    ## check time
    if (anyNA(obs_time)) {
        stop("Found NA's in 'time'.")
    }
    ## check event
    if (all(obs_event[! is.na(obs_event)] < 1)) {
        stop("No event can be found.")
    }
    ## starting values
    if (is.null(surv_start)) {
        surv_start <- 0
    } else if (length(surv_start) != ncol(surv_x)) {
        stop("The length of 'surv_start' is inappropriate.")
    }
    if (is.null(cure_start)) {
        cure_start <- 0
    } else if (length(cure_start) != ncol(cure_x) +
               as.integer(cure_intercept)) {
        stop("The length of 'cure_start' is inappropriate.")
    }
    ## offset terms
    if (is.null(surv_offset)) {
        surv_offset <- rep(0, nrow(surv_x))
    } else if (length(surv_offset) != nrow(surv_x)) {
        stop("The length of 'surv_offset' is inappropriate.")
    }
    if (is.null(cure_offset)) {
        cure_offset <- rep(0, nrow(cure_x))
    } else if (length(cure_offset) != nrow(cure_x)) {
        stop("The length of 'cure_start' is inappropriate.")
    }
    ## on tail completion
    all_tails <- c("zero", "exp", "zero-tau")
    tail_completion <- match(match.arg(tail_completion, all_tails),
                             all_tails)
    ## prepare tail_tau
    if (is.null(tail_tau)) {
        tail_tau <- - 1
    }
    ## more checks if tail completion after a specified tau
    if (tail_completion == 3L) {
        if (tail_tau < max(obs_time[! is.na(obs_event) & obs_event > 0])) {
            stop("The specified 'tail_tau' cannot be less than",
                 "the largest event time.")
        } else if (tail_tau > max(obs_time)) {
            warning("The specified 'tail_tau' is greater than",
                    "the largest survival time.")
        }
    }

    ## call the routine
    if (anyNA(obs_event)) {
        out <- coxph_cure_uncer(
            time = obs_time,
            event = obs_event,
            cox_x = surv_x,
            cure_x = cure_x,
            cure_intercept = cure_intercept,
            bootstrap = bootstrap,
            cox_start = surv_start,
            cure_start = cure_start,
            cox_offset = surv_offset,
            cure_offset = cure_offset,
            cox_standardize = TRUE,
            cure_standardize = cure_standardize,
            em_max_iter = em_max_iter,
            cox_mstep_max_iter = surv_max_iter,
            cox_mstep_rel_tol = surv_rel_tol,
            cure_mstep_max_iter = cure_max_iter,
            cure_mstep_rel_tol = cure_rel_tol,
            tail_completion = tail_completion,
            tail_tau = tail_tau,
            pmin = pmin,
            early_stop = early_stop,
            verbose = verbose
        )
        ## add class
        class(out) <- "cox_cure_uncer"
    } else {
        out <- rcpp_coxph_cure(
            time = obs_time,
            event = obs_event,
            cox_x = surv_x,
            cure_x = cure_x,
            cure_intercept = cure_intercept,
            bootstrap = bootstrap,
            firth = firth,
            cox_start = surv_start,
            cure_start = cure_start,
            cox_offset = surv_offset,
            cure_offset = cure_offset,
            cox_standardize = TRUE,
            cure_standardize = cure_standardize,
            em_max_iter = em_max_iter,
            cox_mstep_max_iter = surv_max_iter,
            cox_mstep_rel_tol = surv_rel_tol,
            cure_mstep_max_iter = cure_max_iter,
            cure_mstep_rel_tol = cure_rel_tol,
            tail_completion = tail_completion,
            tail_tau = tail_tau,
            pmin = pmin,
            early_stop = early_stop,
            verbose = verbose
        )
        ## add class
        class(out) <- "cox_cure"
    }
    ## add bootstrap se if available
    if (bootstrap <= 0) {
        out$bootstrap$surv_coef_mat <- out$bootstrap$cure_coef_mat <- NULL
    } else {
        out$bootstrap$surv_coef_se <- apply(out$bootstrap$surv_coef_mat,
                                            2L, se_interQ)
        out$bootstrap$cure_coef_se <- apply(out$bootstrap$cure_coef_mat,
                                            2L, se_interQ)
    }
    ## add covariate names back
    if (! is.null(surv_var_names <- colnames(surv_x))) {
        names(out$surv_coef) <- surv_var_names
    } else {
        names(out$surv_coef) <- paste0("x", seq_along(out$surv_coef))
    }
    if (! is.null(cure_var_names <- colnames(cure_x))) {
        names(out$cure_coef) <- c({if (cure_intercept) "(Intercept)" else NULL},
                                  colnames(cure_x))
    } else {
        names(out$cure_coef) <-
            if (cure_intercept) {
                c("(Intercept)",
                  paste0("z", seq_along(out$cure_coef[- 1L])))
            } else {
                paste0("z", seq_along(out$cure_coef))
            }
    }
    ## add function call
    out$call <- call0
    ## return
    out
}


##' @rdname cox_cure
##'
##' @param surv_x A numeric matrix for the design matrix of the survival model
##'     component.
##' @param cure_x A numeric matrix for the design matrix of the cure rate model
##'     component.  The design matrix should exclude an intercept term unless we
##'     want to fit a model only including the intercept term.  In that case, we
##'     need further set \code{cure_intercept = FALSE} to not standardize the
##'     intercept term.
##' @param cure_intercept A logical value specifying whether to add an intercept
##'     term to the cure rate model component.  If \code{TRUE} by default, an
##'     intercept term is included.
##' @param surv_standardize A logical value specifying whether to standardize
##'     the covariates for the survival model part.  If \code{FALSE}, the
##'     covariates will be standardized internally to have mean zero and
##'     standard deviation one.
##' @param cure_standardize A logical value specifying whether to standardize
##'     the covariates for the cure rate model part.  If \code{TRUE} (by
##'     default), the covariates will be standardized internally to have mean
##'     zero and standard deviation one.
##'
##' @export
cox_cure.fit <- function(surv_x, cure_x,
                         time, event,
                         cure_intercept = TRUE,
                         bootstrap = 0,
                         firth = FALSE,
                         surv_start = NULL,
                         cure_start = NULL,
                         surv_offset = NULL,
                         cure_offset = NULL,
                         surv_standardize = TRUE,
                         cure_standardize = TRUE,
                         em_max_iter = 200,
                         em_rel_tol = 1e-5,
                         surv_max_iter = 30,
                         surv_rel_tol = 1e-5,
                         cure_max_iter = 30,
                         cure_rel_tol = 1e-5,
                         tail_completion = c("zero", "exp", "zero-tau"),
                         tail_tau = NULL,
                         pmin = 1e-5,
                         early_stop = TRUE,
                         verbose = FALSE,
                         ...)
{
    ## warning on `...`
    warn_dots(...)

    ## record function call
    this_call <- match.call()

    ## check time
    if (anyNA(time)) {
        stop("Found NA's in 'time'.")
    }
    ## check event
    if (all(event[! is.na(event)] < 1)) {
        stop("No event can be found.")
    }
    ## starting values
    if (is.null(surv_start)) {
        surv_start <- 0
    } else if (length(surv_start) != ncol(surv_x)) {
        stop("The length of 'surv_start' is inappropriate.")
    }
    if (is.null(cure_start)) {
        cure_start <- 0
    } else if (length(cure_start) != ncol(cure_x) +
               as.integer(cure_intercept)) {
        stop("The length of 'cure_start' is inappropriate.")
    }
    ## offset terms
    if (is.null(surv_offset)) {
        surv_offset <- rep(0, nrow(surv_x))
    } else if (length(surv_offset) != nrow(surv_x)) {
        stop("The length of 'surv_offset' is inappropriate.")
    }
    if (is.null(cure_offset)) {
        cure_offset <- rep(0, nrow(cure_x))
    } else if (length(cure_offset) != nrow(cure_x)) {
        stop("The length of 'cure_start' is inappropriate.")
    }
    ## on tail completion
    all_tails <- c("zero", "exp", "zero-tau")
    tail_completion <- match(match.arg(tail_completion, all_tails),
                             all_tails)
    ## prepare tail_tau
    if (is.null(tail_tau)) {
        tail_tau <- - 1
    }
    ## more checks if tail completion after a specified tau
    if (tail_completion == 3L) {
        if (tail_tau < max(time[! is.na(event) & event > 0])) {
            stop("The specified 'tail_tau' cannot be less than",
                 "the largest event time.")
        } else if (tail_tau > max(time)) {
            warning("The specified 'tail_tau' is greater than",
                    "the largest survival time")
        }
    }

    ## call the routine
    if (anyNA(event)) {
        out <- coxph_cure_uncer(
            time = time,
            event = event,
            cox_x = surv_x,
            cure_x = cure_x,
            cure_intercept = cure_intercept,
            bootstrap = bootstrap,
            cox_start = surv_start,
            cure_start = cure_start,
            cox_offset = surv_offset,
            cure_offset = cure_offset,
            cox_standardize = surv_standardize,
            cure_standardize = cure_standardize,
            em_max_iter = em_max_iter,
            cox_mstep_max_iter = surv_max_iter,
            cox_mstep_rel_tol = surv_rel_tol,
            cure_mstep_max_iter = cure_max_iter,
            cure_mstep_rel_tol = cure_rel_tol,
            tail_completion = tail_completion,
            tail_tau = tail_tau,
            pmin = pmin,
            early_stop = early_stop,
            verbose = verbose
        )
        ## add class
        class(out) <- "cox_cure_uncer"
    } else {
        out <- rcpp_coxph_cure(
            time = time,
            event = event,
            cox_x = surv_x,
            cure_x = cure_x,
            cure_intercept = cure_intercept,
            bootstrap = bootstrap,
            firth = firth,
            cox_start = surv_start,
            cure_start = cure_start,
            cox_offset = surv_offset,
            cure_offset = cure_offset,
            cox_standardize = surv_standardize,
            cure_standardize = cure_standardize,
            em_max_iter = em_max_iter,
            cox_mstep_max_iter = surv_max_iter,
            cox_mstep_rel_tol = surv_rel_tol,
            cure_mstep_max_iter = cure_max_iter,
            cure_mstep_rel_tol = cure_rel_tol,
            tail_completion = tail_completion,
            tail_tau = tail_tau,
            pmin = pmin,
            early_stop = early_stop,
            verbose = verbose
        )
        ## add class
        class(out) <- "cox_cure"
    }
    ## add bootstrap se if available
    if (bootstrap <= 0) {
        out$bootstrap$surv_coef_mat <- out$bootstrap$cure_coef_mat <- NULL
    } else {
        out$bootstrap$surv_coef_se <- apply(out$bootstrap$surv_coef_mat,
                                            2L, se_interQ)
        out$bootstrap$cure_coef_se <- apply(out$bootstrap$cure_coef_mat,
                                            2L, se_interQ)
    }
    ## add covariate names back
    if (! is.null(surv_var_names <- colnames(surv_x))) {
        names(out$surv_coef) <- surv_var_names
    } else {
        names(out$surv_coef) <- paste0("x", seq_along(out$surv_coef))
    }
    if (! is.null(cure_var_names <- colnames(cure_x))) {
        names(out$cure_coef) <- c({if (cure_intercept) "(Intercept)" else NULL},
                                  colnames(cure_x))
    } else {
        names(out$cure_coef) <-
            if (cure_intercept) {
                c("(Intercept)",
                  paste0("z", seq_along(out$cure_coef[- 1L])))
            } else {
                paste0("z", seq_along(out$cure_coef))
            }
    }
    ## add function call
    out$call <- this_call
    ## return
    out
}