R/xhaz.R
In xhaz: Excess Hazard Modelling Considering Inappropriate Mortality Rates

Documented in xhaz

#' @title xhaz function
#'
#' @description Fits the excess hazard models proposed by Esteve et al. (1990) <doi:10.1002/sim.4780090506>,
#'  with the possibility to account for time dependent covariates. Fits also the
#'  non-proportional excess hazard model proposed by Giorgi et al. (2005) <doi:10.1002/sim.2400>.
#'  In addition, fits excess hazard models with possibility to rescale
#'  (Goungounga et al. (2019) <doi:10.1186/s12874-019-0747-3>) or to correct the background mortality with a
#'  proportional (Touraine et al. (2020) <doi:10.1177/0962280218823234>) or non-proportional (Mba et al. (2020) <doi:10.1186/s12874-020-01139-z>)
#'  effect.
#'
#' @param formula a formula object of the  function with the response on the left
#'  of a \code{~} operator and the terms on the right. The response must be a
#'  survival object as returned by the \code{Surv} function (time in first and status
#'  in second).
#'
#' @note `time` is OBLIGATORY in YEARS.
#'
#'
#' @param data a data frame in which to interpret the variables named in the
#'  formula
#'
#' @param ratetable a rate table stratified by age, sex, year (if missing,
#'  `ratedata` is used)
#'
#' @param rmap a list that maps data set names to the ratetable names.
#'
#' @param baseline an argument to specify the baseline hazard: if it follows a
#'  piecewise constant, \code{baseline = "constant"} is used and corresponds to
#'  the baseline in Esteve et al. model; if the baseline follows a quadratic b-splines,
#'  \code{baseline = "bsplines"}  is used, corresponding to the baseline excess
#'  hazard in Giorgi et al model.
#'
#' @param pophaz indicates three possibles arguments in character: classic or
#'  rescaled and corrected. If \code{pophaz = "classic"} chosen, fits the model
#'  that do not require to rescale or to correct the background mortality (i.e. the Esteve
#'  et al. model or Giorgi et al. model); if \code{pophaz = "rescaled"}  or
#'  \code{pophaz = "corrected"} chosen, fits the models that require to rescale
#'  or to correct the background mortality.
#'
#' @param only_ehazard a boolean argument (by default, \code{only_ehazard=FALSE}).
#'  If \code{only_ehazard = TRUE}, \code{pophaz = "classic"} must be provided and
#'  the total value of the log-likelihood will not account for the cumulative population hazard.
#'
#' @param add.rmap character that indicates the name in character of the additional
#' demographic variable from `data` to be used for correction of the life table,
#' in particular when one is in the presence of an insufficiently stratified life
#' table (see Touraine et al. model). This argument is not used if
#' \code{pophaz = "classic"} or \code{pophaz = "rescaled"}.
#'
#'
#' @param add.rmap.cut a list containing arguments to specify the modeling
#'  strategy for breakpoint positions, which allows a non-proportional effect of
#'  the correction term acting on the background mortality. By default
#'  \code{list(breakpoint = FALSE)}, i.e. a proportional effect of the correction
#'  term acting on the background mortality is needed; in this case, all the other
#'  argument of the list are not working for the model specification;
#'
#'  if \code{list(breakpoint = TRUE, cut = c(70))}, the chosen cut-point(s) is (are)
#'  the numeric value(s) proposed. If \code{list(breakpoint = TRUE, cut = NA)},
#'  there is the same number of breakpoints as the number of NA, with their possible
#'  positions specified as here by \code{probs},
#'  i.e. \code{list(breakpoint = TRUE, cut = NA, probs = seq(0, 1, 0.25))}.
#'  That corresponds to a numeric vector of probabilities with values between 0 and 1
#'  as in \code{quantile} function.
#'  \code{criterion} is used to choose the best model, using the AIC or the
#'  BIC (the default criterion). If needed, all the fitted models are printed
#'  by the user by adding in the list \code{print_stepwise = FALSE}.
#'
#'
#' @param interval a vector indicating either the location of the year-scale time
#' intervals for models with piecewise constant function, or the location of the
#' knots for models with B-splines functions for their baseline hazard (see the
#' appropriate specification in \code{baseline} argument). The first component of
#' the vector is 0, and the last one corresponds to the maximum time fellow-up of the study.
#'
#' @param ratedata a data frame of the hazards mortality in general population.
#'
#' @param subset an expression indicating which subset of the data should be used
#' in the modeling. All observations are included by default
#'
#' @param na.action as in the \code{coxph} function, a missing-data filter function.
#'
#'
#' @param init a list of initial values for the parameters to estimate. For each
#'  elements of the list, give the name of the covariate followed by the vector
#'  of the fixed initials values

#' @param control a list of control values used to control the optimization
#' process. In this list, `eps`, is a convergence criteria (by default, \code{eps=10^-4}),
#' `iter.max` is the  maximum number of iteration (by default, \code{iter.max=15}),
#' and \code{level}, is the level used for the confidence intervals (by default, \code{level=0.95}).
#'
#'
#' @param optim a Boolean argument (by default, \code{optim = FALSE}).
#' If \code{optim = TRUE}), the maximization algorithm uses the \code{optim} function
#'
#' @param scale a numeric argument to specify whether the life table contains death rates
#' per day (default \code{scale = 365.2425}) or death rates per year (\code{scale = 1}).
#'
#' @param trace a Boolean argument, if \code{trace = TRUE}), tracing information
#' on the progress of the optimization is produced
#'
#'
#' @param speedy a Boolean argument, if \code{speedy = TRUE}, optimization is done in a
#' parallel mode
#'
#'
#' @param nghq number of nodes and weights for Gaussian quadrature
#'
#' @param method corresponds to \code{optim} function argument.
#'
#' @param ... other parameters used with the \code{xhaz} function
#'
#'
#' @details Use the \code{Surv(time_start, time_stop, status)} notation for time
#'  dependent covariate with the appropriate organization of the data set (see
#'  the help page of the \code{Surv} function)
#'
#'  Only two interior knots are possible for the model with B-splines functions
#'  to fit the baseline (excess) hazard. Determination of the intervals might be
#'  user's defined or automatically computed according to the quantile of the
#'  distribution of deaths. Use NA for an automatic determination (for example,
#'   \code{interval = c(0, NA, NA, 5)}).
#'
#'
#' @keywords xhaz
#'
#'
#' @return An object of class \code{constant} or \code{bsplines},
#' according to the type of functions chosen to fit the baseline hazard of
#' model (see details for argument \code{baseline}). This object is a list containing
#' the following components:
#'
#'
#' \item{coefficients}{estimates found for the model}
#'
#' \item{varcov}{the variance-covariance matrix}
#'
#' \item{loglik}{for the Estève et al. model: the log-likelihood of the null
#' model, i.e without covariate, and the log-likelihood of the full model,
#' i.e with all the covariates declared in the formula; for the Giorgi et al.
#' model: the log-likelihood of the full model}
#'
#' \item{cov.test}{for the Esteve et al.model: the log-likelihood of the null
#' model, i.e without covariate, and the log-likelihood of the full model,
#' i.e with all the covariates declared in the formula; for the Giorgi et al.
#' model: the log-likelihood of the full model}
#'
#' \item{message}{a character string returned by the optimizer
#' see details in \code{optim} help page}
#'
#' \item{convergence}{an integer code as in \code{optim} when `"L-BFGS-B"` method
#' is used.}
#'
#' \item{n}{the number of individuals in the dataset}
#'
#' \item{n.events}{the number of events in the dataset. Event are considered
#' as death whatever the cause}
#'
#' \item{level}{the confidence level used}
#'
#' \item{interval}{the intervals used to split time for piecewise baseline excess
#'  hazard, or knots positions for Bsplines baseline}
#'
#' \item{terms}{the representation of the terms in the model}
#'
#' \item{call}{the function `call` based on model}
#'
#' \item{pophaz}{the assumption considered for the life table used in the
#' excess hazard model}
#'
#' \item{add.rmap}{the additional variable for which the life table is not
#' stratified}
#'
#' \item{ehazardInt}{the cumulative hazard of each individuals calculated from
#' the ratetable used in the model}
#'
#' \item{ehazard}{the individual expected hazard values from the ratetable
#' used to fit the model}
#'
#' \item{data}{the dataset used to run the model}
#'
#' \item{time_elapsed}{the time to run the model}
#'
#'
#'
#'
#' @author Juste Goungounga, Darlin Robert Mba, Nathalie Graffeo, Roch Giorgi
#'
#' @references Goungounga JA, Touraine C, Graff\'eo N, Giorgi R; CENSUR working
#'  survival group. Correcting for misclassification and selection effects in
#'  estimating net survival in clinical trials. BMC Med Res Methodol. 2019 May
#'  16;19(1):104. doi: 10.1186/s12874-019-0747-3. PMID: 31096911; PMCID:
#'  PMC6524224. (\href{https://pubmed.ncbi.nlm.nih.gov/31096911/}{PubMed})
#'
#'  Touraine C, Graff\'eo N, Giorgi R; CENSUR working survival group. More
#'  accurate cancer-related excess mortality through correcting background
#'  mortality for extra variables. Stat Methods Med Res. 2020 Jan;29(1):122-136.
#'  doi: 10.1177/0962280218823234. Epub 2019 Jan 23. PMID: 30674229.
#'  (\href{https://pubmed.ncbi.nlm.nih.gov/30674229/}{PubMed})
#'
#'  Mba RD, Goungounga JA, Graff\'eo N, Giorgi R; CENSUR working survival group.
#'  Correcting inaccurate background mortality in excess hazard models through
#'  breakpoints. BMC Med Res Methodol. 2020 Oct 29;20(1):268. doi:
#'  10.1186/s12874-020-01139-z. PMID: 33121436; PMCID: PMC7596976.
#'  (\href{https://pubmed.ncbi.nlm.nih.gov/33121436/}{PubMed})
#'
#'  Giorgi R, Abrahamowicz M, Quantin C, Bolard P, Esteve J, Gouvernet J, Faivre
#'  J. A relative survival regression model using B-spline functions to model
#'  non-proportional hazards. Statistics in Medicine 2003; 22: 2767-84.
#'  (\href{https://pubmed.ncbi.nlm.nih.gov/12939785/}{PubMed})
#'
#'
#' @examples
#'\donttest{
#' library("numDeriv")
#' library("survexp.fr")
#' library("splines")
#' library("statmod")
#' data("simuData","rescaledData", "dataCancer")
#' # load the data sets 'simuData', 'rescaledData' and 'dataCancer'.
#'
#'# Esteve et al. model: baseline excess hazard is a piecewise function
#'#                      linear and proportional effects for the covariates on
#'#                      baseline excess hazard.
#'
#'
#' fit.estv1 <- xhaz(formula = Surv(time_year, status) ~ agec + race,
#'                   data = simuData,
#'                   ratetable = survexp.us,
#'                   interval = c(0, NA, NA, NA, NA, NA, 6),
#'                   rmap = list(age = 'age', sex = 'sex', year = 'date'),
#'                   baseline = "constant", pophaz = "classic")
#'
#'
#' fit.estv1
#'
#'
#' # Touraine et al. model: baseline excess hazard is a piecewise function
#' #                        with a linear and proportional effects for the
#' #                        covariates on the baseline excess hazard.
#' # An additionnal cavariate (here race) missing in the life table is
#' # considered by the model.
#'
#'
#' fit.corrected1 <- xhaz(formula = Surv(time_year, status) ~ agec + race,
#'                        data = simuData,
#'                        ratetable = survexp.us,
#'                        interval = c(0, NA, NA, NA, NA, NA, 6),
#'                        rmap = list(age = 'age', sex = 'sex', year = 'date'),
#'                        baseline = "constant", pophaz = "corrected",
#'                        add.rmap = "race")
#'
#'
#'
#' fit.corrected1
#'
#' # extension of Touraine et al model: baseline excess hazard is a piecewise
#' # constant function with a linear and proportional effects for the covariates
#' # on the baseline excess hazard.
#'
#' # An additionnal cavariate (here race) missing in the life table is
#' # considered by the model with a breakpoint at 75 years
#'
#' fit.corrected2 <- xhaz(formula = Surv(time_year, status) ~ agec + race,
#'                        data = simuData,
#'                        ratetable = survexp.us,
#'                        interval = c(0, NA, NA, NA, NA, NA, 6),
#'                        rmap = list(age = 'age', sex = 'sex', year = 'date'),
#'                        baseline = "constant", pophaz = "corrected",
#'                        add.rmap = "race",
#'                         add.rmap.cut = list(breakpoint = TRUE, cut = 75))
#'
#'
#'
#' fit.corrected2
#'
#'
#' #Giorgi et al model: baseline excess hazard is a quadratic Bsplines
#' #                    function with two interior knots and allow here a
#' #                    linear and proportional effects for the covariates on
#' #                    baseline excess hazard.
#'
#'
#' fitphBS <- xhaz(formula = Surv(time_year, status) ~ agec + race,
#'                 data = simuData,
#'                 ratetable = survexp.us,
#'                 interval = c(0, NA, NA, 6),
#'                 rmap = list(age = 'age', sex = 'sex', year = 'date'),
#'                 baseline = "bsplines", pophaz = "classic")
#'
#' fitphBS
#'
#'
#'
#'
#'
#' # Application on `dataCancer`.
#' #Giorgi et al model: baseline excess hazard is a quadratic Bspline
#' #                    function with two interior knots and allow here a
#' #                    linear and proportional effect for the variable
#' #                    "immuno_trt" plus a non-proportional effect
#' #                    for the variable "ageCentre" on baseline excess hazard.
#'
#'
#' fittdphBS <- xhaz(formula = Surv(obs_time_year, event) ~ qbs(ageCentre) + immuno_trt,
#'                   data = dataCancer,
#'                   ratetable = survexp.fr,
#'                   interval = c(0, 0.5, 12, 15),
#'                   rmap = list(age = 'age', sex = 'sexx', year = 'year_date'),
#'                   baseline = "bsplines", pophaz = "classic")
#'
#' fittdphBS
#'
#'
#'
#'
#' # Application on `rescaledData`.
#' # rescaled model: baseline excess hazard is a piecewise function with a
#' # linear and proportional effects for the covariates on baseline excess hazard.
#'
#' # A scale parameter on the expected mortality of general population is
#' # considered to account for the non-comparability source of bias.
#'
#' rescaledData$timeyear <- rescaledData$time/12
#' rescaledData$agecr <- scale(rescaledData$age, TRUE, TRUE)
#'
#' fit.res <- xhaz(formula = Surv(timeyear, status) ~ agecr + hormTh,
#'                 data = rescaledData,
#'                 ratetable = survexp.fr,
#'                 interval = c(0, NA, NA, NA, NA, NA, max(rescaledData$timeyear)),
#'                 rmap = list(age = 'age', sex = 'sex', year = 'date'),
#'                 baseline = "constant", pophaz = "rescaled")
#'
#'  fit.res
#' }
#'
#' @import survival
#' @import stats
#' @import parallel
#' @import optimParallel
#' @import statmod
#' @import splines
#' @import survexp.fr
#'
#' @export
xhaz <- function(formula = formula(data),
                 data = sys.parent(),
                 ratetable,
                 rmap = list(age = NULL, sex = NULL, year = NULL),
                 baseline = c("constant", "bsplines"),
                 pophaz = c("classic", "rescaled", "corrected"),
                 only_ehazard = FALSE,
                 add.rmap = NULL,
                 add.rmap.cut = list(
                   breakpoint = FALSE,
                   cut = NA,
                   probs = NULL,
                   criterion = "BIC",
                   print_stepwise = FALSE
                 ),
                 interval,
                 ratedata = sys.parent(),
                 subset,
                 na.action,
                 init,
                 control = list(eps = 1e-4,
                                iter.max = 800,
                                level = 0.95),
                 optim = TRUE,
                 scale = 365.2425,
                 trace = 0,
                 speedy = FALSE,
                 nghq = 12,
                 method = "L-BFGS-B",
                 ...) {

  m_int <- match.call(expand.dots = FALSE)

  if ((
    add.rmap.cut$breakpoint == TRUE &
    !is.na(add.rmap.cut$cut[1]) & is.null(add.rmap.cut$probs)
  )) {
    if (!missing(rmap)) {
      rcall <- substitute(rmap)
      if (!is.call(rcall) || rcall[[1]] != as.name("list"))
        stop("Invalid rcall argument")
    }
    else
      rcall <- NULL

    breakpoint_with_cut(
      formula = formula,
      data = data,
      ratetable = ratetable,
      rmap = rmap,
      baseline = baseline,
      pophaz = pophaz,
      only_ehazard = only_ehazard,
      add.rmap = add.rmap,
      add.rmap.cut = add.rmap.cut,
      interval = interval,
      ratedata ,
      subset ,
      na.action ,
      init ,
      control ,
      optim ,
      scale ,
      trace ,
      speedy ,
      nghq,
      m_int,
      rcall,
      method,
      ...
    )
  } else if ((
    add.rmap.cut$breakpoint == TRUE &
    is.na(add.rmap.cut$cut[1]) & !is.null(add.rmap.cut$probs)
  )) {
    if (!missing(rmap)) {
      rcall <- substitute(rmap)
      if (!is.call(rcall) || rcall[[1]] != as.name("list"))
        stop("Invalid rcall argument")
    }
    else
      rcall <- NULL

    xhaz2(
      formula = formula,
      data = data,
      ratetable = ratetable,
      rmap = rmap,
      baseline = baseline,
      pophaz = pophaz,
      only_ehazard = only_ehazard,
      add.rmap = add.rmap,
      add.rmap.cut = add.rmap.cut,
      splitting  = TRUE,
      interval = interval,
      ratedata = ratedata,
      subset = subset,
      na.action = na.action,
      init = init,
      control = control,
      optim = optim,
      scale = scale,
      trace = trace,
      speedy = speedy,
      nghq = nghq,
      m_int = m_int,
      rcall = rcall,
      method = method,
      ...
    )

  } else if (add.rmap.cut$breakpoint == FALSE) {
    if (!missing(rmap)) {
      rcall <- substitute(rmap)
      if (!is.call(rcall) || rcall[[1]] != as.name("list"))
        stop("Invalid rcall argument")
    }
    else
      rcall <- NULL

    without_breakpoint_without_cut(
      formula = formula,
      data = data,
      ratetable = ratetable,
      rmap = rmap,
      baseline = baseline,
      pophaz = pophaz,
      only_ehazard = only_ehazard,
      add.rmap = add.rmap,
      add.rmap.cut = add.rmap.cut,
      interval = interval,
      splitting = FALSE,
      ratedata = ratedata,
      subset = subset,
      na.action = na.action,
      init = init,
      control = control,
      optim = optim,
      scale = scale,
      trace = trace,
      speedy = speedy,
      nghq = nghq,
      m_int = m_int,
      rcall = rcall,
      method = method,
      ...
    )
  }
}