R/multiRec.R
In multiRec: Analysis of Multi-Type Recurrent Events

Documented in multiRec

#' Fit the multi-type recurrent event model
#'
#' multi-type events, may be recurrent or terminating
#'
#' @param ... one or more models (see details)
#' @param data the dataset
#' @param na.action function, a function which indicates what should happen when
#'   the data contain NAs. The default is na.omit.
#' @param eventVar character, the name of the event variable in the data= dataset
#' @param startTimeVar character, the name of the variable that holds the start
#'   time of each interval in the data= dataset
#' @param stopTimeVar character, the name of the variable that holds the stop
#'   time of each interval in the data= dataset
#' @param idVar character, the name of the id variable which uniquely identifies
#'   individuals in the data= dataset
#' @param link the link used to model the hazard
#' @param maxit integer, the maximum number of iteration for the optimization
#'   method. Defaults to 500 for Nelder-Mead, 100 for all other methods.
#' @param noEvent character, the value(s) of the event variable that indicate
#'   that no event occurred in an interval
#' @param robust logical, if TRUE robust standard errors are used
#' @param method character, the optimization method see the optim function for
#'   details
#' @param method.seed numeric, the seed used
#' @param SANN.init integer, if this is greater than 0 it indicates the number
#'   of simulated annealing iterations used to refine the initial estimate of
#'   the parameters before calling the method specified in method=.
#' @param hazardPrefix character, the prefix used to denote prior event
#'   functions in the formulas for the hazard
#' @param fitDetails logical, if TRUE the returned fit object contains
#'   additional information used to create fit diagnostics.
#' @param hessian character, use the Hessian calculated in optim or calculated
#'   separately in numDeriv::hessian().
#' @param trace logical, if TRUE additional information is printed by the
#'   algorithm
#'
#' @details
#' models for the hazard are of the form
#'
#' lhs ~ rhs
#'
#' where lhs is one of the possible events and rhs is the usual model right
#' hand side, with one addition: the pseudo-function "nPrior." can be
#' used to indicate a dependence of the hazard on prior events. This function
#' has the form
#'
#' nPrior.event(~formula, alpha=value)
#'
#' where
#' \describe{
#'  \item{event}{should be replaced by the appropriate event. For example,
#'  "nPrior.stroke" indicates a dependence of the hazard on the number of
#'  strokes prior to the current interval}
#'  \item{formula}{is a possibly empty formula which expresses the possible
#'  covariate effects on the extent to which prior events impact the hazard. For
#'  example "nPrior.stroke(~hba1c+sbp)" indicates a dependence on the number of
#'  prior strokes which may be modulated by HbA1c (variable hba1c) and systolic
#'  blood pressure (variable sbp)}
#'  \item{value}{is the power to which the number of prior events should
#'  be raised. The default is 1, i.e. the hazard depends simply on the number of
#'  prior events. A value of 0.5 means that the hazard depends on the square
#'  root of the number of prior events, i.e. that additional events have
#'  a diminishing effect on the hazard.  Specifying alpha=NA means that
#'  the power is unknown and should be estimated.}
#' }
#'
#' For example a model of myocardial infaction (event='mi') might be
#'
#' mi ~ bmi + nPrior.mi(alpha=0.5) + nPrior.stroke(~hba1c+sbp)
#'
#' to indicate that the hazard of mi depends on body mass index (variable bmi),
#' on the square root of the number of prior MI's and on the number of prior
#' strokes, modulated by hba1c and systolic blood pressure.
#'
#' @importFrom stats as.formula ave model.matrix optim na.omit reformulate
#' @importFrom utils head
#'
#' @return an object of type agMTRE
#' @export
#' @examples
#' # Fit a model for the multiRecCVD2 dataset
#' fit = multiRec(hf   ~ nPrior.afib(~male),
#'                afib ~ age + nPrior.afib() + nPrior.hf(),
#'                data=multiRecCVD2,
#'                idVar='id',
#'                link='log')
#'
#' # Fit the model with robust variances
#' fit = multiRec(hf   ~ nPrior.afib(~male),
#'                afib ~ age + nPrior.afib() + nPrior.hf(),
#'                data=multiRecCVD2,
#'                idVar='id',
#'                link='log',
#'                robust = TRUE)   # Request robust (sandwich) variances
#'
#' # Use a specified power
#' fit = multiRec(hf   ~ nPrior.afib(~male),
#'                afib ~ age + nPrior.afib(alpha=0.5) + nPrior.hf(),
#'                data=multiRecCVD2,
#'                idVar='id',
#'                link='log',
#'                robust = TRUE)
#'
#' # Display the coefficients
#' coef(fit)
#' confint(fit)
#'
#' # Display intervals with a poor fit
#' boxplot(resid(fit))
multiRec = function(...,
                    data,
                    eventVar='event',
                    startTimeVar='tstart',
                    stopTimeVar='tstop',
                    idVar='id',
                    na.action=na.omit,
                    link=c('log','identity','yj'),
                    SANN.init=0,
                    noEvent=c('',NA),
                    robust=FALSE,
                    method=c("BFGS", "Nelder-Mead", "CG", "SANN"),
                    method.seed=NULL,
                    hazardPrefix='nPrior.',
                    maxit,
                    trace=FALSE,
                    fitDetails=FALSE,
                    hessian=c('optim','numDeriv')) {

  hessian = match.arg(hessian)
  method  = match.arg(method)

  # ----------------------------------------------------------------------------
  # Initialize
  # ----------------------------------------------------------------------------
  resetIndices()

  if (missing(maxit)) {
    if (method=='Nelder-Mead') maxit=500
    else maxit=100
  }

  if (method=='SANN' && !is.null(method.seed)) { oldseed <- if (exists('.Random.seed', envir = .GlobalEnv)) get('.Random.seed', envir = .GlobalEnv) else NULL; on.exit(if (!is.null(oldseed)) assign('.Random.seed', oldseed, envir = .GlobalEnv), add = TRUE); set.seed(method.seed) }

  # ----------------------------------------------------------------------------
  # handle the link
  # ----------------------------------------------------------------------------
  linkName = match.arg(link)
  dlink = list('identity'=identityLink,
               'log'=logLink,
               'yj'=yjLink)[[linkName]]
  iLink = list('identity'=identityInvLink,
               'log'=logInvLink,
               'yj'=yjInvLink)[[linkName]]

  link = list(
    link       = linkName,
    linkfun    = dlink,
    linkinv    = iLink,
    param.ix   = if (linkName=='yj') getIndices(1) else NULL,
    param.init = 0
  )

  # ----------------------------------------------------------------------------
  # Check the dataset
  #
  # Apply the missing value function. There should be no missing values once
  # this is done.
  # ----------------------------------------------------------------------------
  if (missing(data)) halt('data= not specified')
  if (!inherits(data,'data.frame')) halt('data= is not a data.frame')

  # ----------------------------------------------------------------------------
  # Check that the main variables are in the dataset
  # ----------------------------------------------------------------------------
  if (!eventVar %in% names(data)) halt('eventVar=%s in not in the dataset',as.character(eventVar))
  if (!startTimeVar %in% names(data)) halt('startTimeVar=%s in not in the dataset',as.character(startTimeVar))
  if (!stopTimeVar %in% names(data)) halt('stopTimeVar=%s in not in the dataset',as.character(stopTimeVar))
  if (!idVar %in% names(data)) halt('idVar=%s in not in the dataset',as.character(idVar))

  # ----------------------------------------------------------------------------
  # Handle missing values
  #
  # We first restrict the dataset to the variables we need (missing values in
  # unused variables should not matter)
  # ----------------------------------------------------------------------------
  models  = list(...)
  used    = intersect(unique(unlist(lapply(models,all.vars))),names(data))
  used    = c(used,eventVar,startTimeVar,stopTimeVar,idVar)

  data = na.action(data[,used])
  if (any(is.na(data))) halt('The dataset contains missing values.')
  omit = attr(data,'na.action')

  # ----------------------------------------------------------------------------
  # Sanity checks on the main variables
  # ----------------------------------------------------------------------------
  event  = trimws(data[[eventVar]])
  start  = data[[startTimeVar]]
  stop   = data[[stopTimeVar]]
  id     = data[[idVar]]

  # The id variable should not have missing values
  if (anyNA(id)) halt('Id variable %s has missing values', idVar)

  # check for ties
  endTime = paste0(id,':',stop)
  if (any(duplicated(endTime))) halt('Tied events found. This model does not allow ties')

  # Check that the start variable is numeric, not missing and not negative
  if (!is.numeric(start)) halt('Time variable %s is not numeric',startTimeVar)
  if (anyNA(start)) halt('Time variable %s has missing values', startTimeVar)
  if (any(start<0)) halt('Time variable %s has negative values',startTimeVar)  # do we care if some are negative?

  # Check that the stop variable is numeric, not missing and not negative
  if (!is.numeric(stop)) halt('Time variable %s is not numeric',stopTimeVar)
  if (anyNA(stop)) halt('Time variable %s has missing values', stopTimeVar)
  if (any(stop<0)) halt('Time variable %s has negative values',stopTimeVar)  # do we care if some are negative?

  # Calculate the interval widths
  width = stop-start
  if (any(width<0)) halt('Start time (%s) is after stop time (%s) for observation %d',
                         stopTimeVar, startTimeVar, which(width<0)[1])
  if (any(width==0)) halt('Time variable %s is the same as %s for observation %d',
                          startTimeVar, stopTimeVar,which(width==0)[1])

  # ----------------------------------------------------------------------------
  # Make sure the dataset is sorted by id and time
  # ----------------------------------------------------------------------------
  data = data[order(id,start),]

  # ----------------------------------------------------------------------------
  # What events do we have?
  # ----------------------------------------------------------------------------
  eventNames = unique(event)
  eventNames = eventNames[!eventNames %in% noEvent] # drop non-events
  nEvents    = table(event)

  # ----------------------------------------------------------------------------
  # Construct the event counts
  # --------------------------
  #
  # evCounts is a list with an entry for each event type. Each entry is a
  # vector of event counts. Specifically, the value in these vectors is the
  # number of events of a specific type that occurred strictly before the
  # current time.
  # ----------------------------------------------------------------------------
  nDots = max(nchar(gsub('^(\\.*).*$','\\1',names(data),perl=TRUE)))
  uniquePrefix = paste0(strrep('.',nDots+1),'N.')

  for (i in seq_along(eventNames)) {
    eventName = eventNames[i]
    data[[paste0(uniquePrefix,eventName)]] = ave(event==eventName, id, FUN=function(x) cumsum(c(0,x[-length(x)])))
  }

  # ----------------------------------------------------------------------------
  # Construct the list of model pieces
  # ----------------------------------------------------------------------------
  # Given a model with several hazards, each with intrinsic and prior components
  # such as
  #
  # h1 = i1 + p11 + p12 + p13
  # h2 = i2 + p21 + p22 + p23
  # h3 = i3 + p31 + p32 + p33
  #
  # This section of code constructs a list called "hazards" with one element for
  # each hazard.  Each element is itself a list of components, one for the
  # intrinsic term, and one for each of the prior terms. So the list "hazards"
  # has the following structure:
  #
  # hazards =     ┌──────────────────────────────────────┐
  #               |1  ┌────┐  ┌─────┐  ┌─────┐  ┌─────┐  │
  #               |   | i1 ├──┤ p11 ├──┤ p12 ├──┤ p13 |  │
  #           ┌───┤   └────┘  └─────┘  └─────┘  └─────┘  │
  #           │   └──────────────────────────────────────┘
  #           |
  #           |   ┌──────────────────────────────────────┐
  #           └──>|2  ┌────┐  ┌─────┐  ┌─────┐  ┌─────┐  |
  #               |   | i2 ├──┤ p21 ├──┤ p22 ├──┤ p23 |  |
  #           ┌───┤   └────┘  └─────┘  └─────┘  └─────┘  |
  #           |   └──────────────────────────────────────┘
  #           |
  #           |   ┌──────────────────────────────────────┐
  #           └──>|3  ┌────┐  ┌─────┐  ┌─────┐  ┌─────┐  |
  #               |   | i3 ├──┤ p31 ├──┤ p32 ├──┤ p33 |  |
  #               |   └────┘  └─────┘  └─────┘  └─────┘  |
  #               └──────────────────────────────────────┘
  #
  # At the top level, the list has one sublist for each hazard model.  Each
  # of these sublists contains entries for the intrinsic term (if any) and
  # the prior terms (if any).
  #
  # The entry for an intrisic term is a list with the following elements:
  #   * event:   the string '(Intrinsic)'
  #   * formula: a formula
  #   * X:       the corresponding design matrix
  #   * beta.ix: the vector of indices of the betas for this matrix inside the
  #              parameter vector, with length(beta.ix)=ncol(X)
  #
  # The entry for a prior term is a list with the following elements:
  #   * event:      the name of the prior event this piece corresponds to
  #   * formula:    the formula for the intrinsic term, or the term inside a
  #                 nPrior.event() call, as a character string
  #   * X:          the corresponding design matrix
  #   * beta.ix:    the vector of indices of the betas for this matrix inside
  #                 the parameter vector, with length(beta.ix)=ncol(X)
  #   * counts:     the vector of counts, with length(counts)=nrow(X)
  #   * alpha:      the power. This is a number if a fixed power was specified,
  #                 NA if the power is to be estimated
  #   * alpha.ix:   the index of alpha in the parameter vector. Only used if
  #                 alpha=NA.
  #   * alpha.init: the initial value for the unknown parameter. Only used if
  #                 alpha=NA.
  #
  # The parameter vector
  # --------------------
  # All model parameters end up in a single vector because that's the way the
  # optim function likes it. The parameter vector will generally contain three
  # groups of parameters
  #   - parameters associated with the design matrices (i.e. betas)
  #   - parameters associated with the link
  #   - parameters associated with the event count functions
  #
  # optim will pass a single parameter vector and the various parts of the code
  # will have to find extract what they need from this parameter vector.  They
  # do this through the ".ix" elements described above
  # ----------------------------------------------------------------------------

  # Create a "prior" function for each of the possible event types. This
  # is used to extract the information from the nPrior.event() function.
  priorFn = 'function(formula=NULL, alpha=1) return(list(event="%s", formula=formula, alpha=alpha))'

  for (eventName in eventNames) {
    f = str2lang(sprintf(priorFn, eventName))
    assign(paste0(hazardPrefix,eventName),eval(f),envir=environment())
  }

  hazards = list()
  for (m in seq_along(models)) {
    model = models[[m]]
    # make sure we have a formula
    if (!inherits(model,'formula')) {
      # is it a misnamed argument?
      if (!is.null(names(models)) && nzchar(names(models)[m])) {
        halt('multiRec argument "%s=" is not recognized. Is it misspelled?',names(models)[m])
      }
      else halt('Expecting a formula, but "%s" was found',
                deparse1(model))
    }

    # Isolate the event for which this is the hazard
    hazardName = as.character(model[[2]])
    if (!hazardName %in% eventNames) halt(paste('There is a hazard model for %s events,',
                                                'but there are no "%s" events in the data'),
                                          hazardName,hazardName)

    # If there are "nPrior." terms, make sure they all correspond to actual events
    funcs = setdiff(all.vars(model,functions=TRUE),all.vars(model,functions=FALSE))
    prior = funcs[startsWith(funcs,hazardPrefix)]
    if (length(prior)>0) {
      priorEv = substring(prior,nchar(hazardPrefix)+1)
      unknown = which(!priorEv %in% eventNames)
      n       = length(unknown)
      if (n>0) {
        h = sprintf('"%s"',prior[unknown])
        e = sprintf('"%s"',priorEv[unknown])

        if (n>1){
          h = paste(paste(h[-n],collapse=', '),'and',h[n])
          e = paste(paste(e[-n],collapse=', '),'or',e[n])
        }
        halt('The hazard model for %s has %s but there are no %s events in the data',
                                   hazardName,h,e)
      }
    }

    # extract the terms from the model formula and separate intrinsic terms
    # from prior ones
    terms   = labels(terms(model))
    isPrior = startsWith(terms,hazardPrefix)

    if (all(isPrior)) intrinsicFormula = ~1
    else intrinsicFormula = reformulate(terms[!isPrior])

    priorFormulas = lapply(terms[isPrior], str2lang)

    X.intrinsic = model.matrix(intrinsicFormula,data=data)
    hazardInfo = list('(Intrinsic)' = list(
      event = '(Intrinsic)',
      formula=intrinsicFormula,
      X = X.intrinsic,
      beta.ix = getIndices(ncol(X.intrinsic))
    ))

    # The prior event elements of the model
    # These will be of the form "nPrior.<eventName>(<formula>, .alpha=<value>)".
    # Since the formula may contain nested parentheses, it's simpler to process
    # it using R's parse than (say) regular expressions. So we convert them to
    # '<eventName>', <formula>, .alpha=<value>)" and run them through
    # parse.
    for (i in seq_along(priorFormulas)) {
      priorName = as.character(priorFormulas[[i]][[1]])
      term      = eval(priorFormulas[[i]], envir=data, enclos=environment())
      alpha     = term$alpha
      formula   = term$formula

      if (is.null(formula)) formula = ~1
      else {
        # Check that the formula a) is a formula and b) that it has no left hand
        # side
        if (!inherits(formula,'formula')) {
          halt('The %s term in the model for %s does not provide a formula',
               priorName, hazardName)
        }
        else if (length(formula)==3) {
          halt('The formula for the %s term in the model for %s should not have a right hand side',
               priorName, hazardName)
        }
      }

      X        = model.matrix(formula,data=data)

      out    = list(
        event     = term$event,
        formula   = formula,
        countName = paste0(uniquePrefix,term$event),
        X         =  X,
        beta.ix   = getIndices(ncol(X)),
        alpha     = alpha
      )

      if (is.na(alpha)) {
        out$alpha.ix   = getIndices(1)
        out$alpha.init = 1

        # Check to make sure that there are enough events
        n = nEvents[[term$event]]
        if (n<5) {
          if (n==1) howMany = sprintf('there is only one %s event',term$event)
          else howMany = sprintf('there are only %d %s events',n,term$event)
          warn(paste('The %s term in the model for %s estimates alpha, but %s',
                     'in the data. The estimate of alpha may be unreliable.'),
               priorName, hazardName, howMany)
        }

      }
      hazardInfo[[paste0(hazardPrefix,term$event)]] = out
    }
    rm(i)


    # Check that the variables in the model all exist in the dataset
    vars    = unique(unlist(lapply(hazardInfo, function(item) all.vars(item$formula))))
    unknown = setdiff(vars, names(data))
    if (length(unknown)>0) halt.n(length(unknown),
                                  'Variable %s is not in the dataset',
                                  'Variables %s are not in the dataset',
                                  paste.and(unknown))

    # Add to the hazards
    attr(hazardInfo,'hazardName') = hazardName
    attr(hazardInfo,'call') = deparse1(model)
    hazards[[hazardName]] = hazardInfo
  }

  # ----------------------------------------------------------------------------
  # Final checks
  #
  # Any events with no hazard formula?
  # ----------------------------------------------------------------------------
  noHazard = setdiff(eventNames,names(hazards))
  if (length(noHazard)>0) halt.n(length(noHazard),
                                 'Variable %1$s contains values %2$s but there is no corresponding hazard formula',
                                 'Variable %1$s contains values %2$s but there are no corresponding hazard formulas',
                                 eventVar, paste.and(noHazard))

  # ----------------------------------------------------------------------------
  # Construct the model object
  # ----------------------------------------------------------------------------
  selector = cbind(1:nrow(data),match(event,names(hazards)))
  modelObj = list(hazards=hazards,
                  id = id,
                  link=link,
                  w=width,
                  selector=selector,
                  timeVar=startTimeVar,
                  time2Var=stopTimeVar,
                  eventVar=eventVar,
                  uniquePrefix=uniquePrefix,
                  data=data)

  # ----------------------------------------------------------------------------
  # Get initial estimates
  # ----------------------------------------------------------------------------
  init = init(modelObj)

  # ----------------------------------------------------------------------------
  # If requested, use SANN to improve the initial estimates
  #
  # Note that this need not converge since all we want is initial parameter
  # estimates to feed into the next optim call
  # ----------------------------------------------------------------------------
  if (SANN.init>0) {
    if (!is.null(method.seed)) { oldseed <- if (exists('.Random.seed', envir = .GlobalEnv)) get('.Random.seed', envir = .GlobalEnv) else NULL; on.exit(if (!is.null(oldseed)) assign('.Random.seed', oldseed, envir = .GlobalEnv), add = TRUE); set.seed(method.seed) }
    fit0 = try(optim(par=init,
                     fn=likelihood,
                     method = 'SANN',
                     modelObj=modelObj,
                     hessian = FALSE,
                     control=list(maxit=SANN.init,fnscale=-1,temp=10,tmax=10)),
               silent=TRUE)
    init=fit0$par
  }

  # ----------------------------------------------------------------------------
  # Get an improved estimate
  # ----------------------------------------------------------------------------
  fit = try(optim(par=init,
                  fn=likelihood,
                  method = method,
                  modelObj=modelObj,
                  hessian = TRUE,
                  control=list(trace=trace,
                               REPORT=1,
                               maxit=maxit,
                               fnscale=-1)),
            silent=TRUE)

  if (inherits(fit,'try-error')) {
    emsg = attr(fit,'condition')

    # Create a fake fit object
    fit = list(par=init,
               value=0,
               count=c('function'=1, 'gradient'=1),
               convergence=1,
               message=emsg)
    warning(emsg)
  }

  # Calculate additional estimates, residuals
  obj = makeFitObject(fit=fit, modelObj=modelObj, robust=robust,
                      init=init,
                      fitDetails=fitDetails,
                      hessian=hessian,
                      omit=omit)
  return(obj)
}