primr: Patient rule-induction method

Documented in cv.trajectory jump.prim

###############################################################################
#
#               Functions for peeling trajectory analysis                    
#
###############################################################################

#' Peeling trajectory jump
#'
#' Identifies a jump in the peeling trajectory of \code{object}.
#'
#' @param object A \code{prim} object resulting from a call to \code{peeling}.
#' @param rel.support Logical indicating if the trajectory difference 
#'    should be relative to the support for finding the jump (default to TRUE).
#'
#' @details Computes the (relative) trajectory differences of \code{object}:
#'    \deqn{\frac{yfun[k] - yfun[k - 1]}{support[k - 1] - support[k]}}{(yfun[k] - yfun[k - 1])/(support[k - 1] - support[k])}
#'    and returns its maximum value. The rationale is that the biggest jump
#'    in peeling trajectory gives a good cut-off point for the peeling 
#'    algorithm. 
#'    If \code{rel.support = FALSE}, the denominator is not used in the
#'    differences calculation.
#'
#' @return A list with elements:
#'    \item{trajectory.difference}{Numeric vector of the computed (relative) 
#'      differences.}
#'    \item{npeel.opt}{Integer giving the npeel value of the highest 
#'      difference.}
#'    \item{final.box}{The extracted box corresponding to \code{npeel.opt}.
#'      See \code{\link{extract.box}}.}
#' @references
#'    Masselot P., Chebana F., Campagna C., Lavigne E., Ouarda T.B.M.J., 
#'      Gosselin P. On threshold identification for weather-health warning 
#'      systems. \emph{Submitted}.
#'
#' @examples
#'    # A simple bump
#'    set.seed(12345)
#'    x <- matrix(runif(2000), ncol = 2, dimnames = list(NULL, c("x1", "x2")))
#'    y <- 2 * x[,1] + 5 * x[,2] + 10 * (x[,1] >= .8 & x[,2] >= .5) + 
#'      rnorm(1000)
#'    # Peeling with alpha = 0.05 and beta.stop = 0.05
#'    peel_res <- peeling(y, x, beta.stop = 0.05)
#'    # Automatically choose the best box
#'    chosen <- jump.prim(peel_res)
#'
#' @export
jump.prim <- function(object, rel.support = TRUE)
{
  traj.change <- diff(object$yfun)
  if (rel.support) traj.change <- traj.change / -diff(object$support)  
  opt.sup <- which.max(traj.change)
  optimal.box <- extract.box(object, npeel = opt.sup)
  optimal.box$limits <- optimal.box$limits[[1]]
  return(list(trajectory.difference = traj.change, npeel.opt = opt.sup,
    final.box = optimal.box))
}



#' Cross-validated peeling trajectory
#'
#' Performs k-fold cross-validation on peeling for choosing the stopping 
#'    criterion.
#'
#' @param y Numeric vector of response values.
#' @param x Numeric or categorical data.frame of input values.
#' @param grid Vector of stopping supports
#'    for peeling trajectory prediction.
#'    If NULL (the default), an initial peeling is carried out on the 
#'    whole data and its \code{support} element is used.
#' @param folds An integer vector giving the fold index of each observation.
#'    If NULL (the default)
#'    \code{nfolds} folds are randomly generated. Directly using \code{folds}
#'    is useful for nonstandard folds such as blocks. Note that \code{folds} is 
#'    recycled if necessary.
#' @param nfolds Integer giving the number of folds to create if \code{folds}
#'    is NULL.
#' @param ... Additional arguments to be passed to \code{\link{peeling}}.
#'
#' @details The \code{cv.trajectory} function splits the provided data into
#'    \code{nfolds} several folds. The peeling is carried out on 
#'    \code{nfolds - 1} folds and the objective function is computed on the
#'    remaining fold. This process is repeated excluding each fold successively
#'    and the resulting trajectories are averaged at each value in 
#'    \code{grid}.
#'
#'    Folds can be given either directly through the argument \code{folds} or 
#'    randomly generated using the argument \code{nfolds}. 
#'
#' @return A \code{cv.prim} object that can be used in methods for \code{prim}
#'    objects (e.g. \code{\link{plot_trajectory}}). Contains the elements:
#'    \item{support}{The support grid provided in the argument 
#'      \code{grid} or generated if the latter is \code{NULL}.}
#'    \item{yfun}{The cross-validated objective function values at each
#'      \code{support} value.}
#'    \item{se.yfun}{Cross-validation standard errors associated with 
#'      \code{yfun} values.}
#'    \item{x,y}{The input and response data used.}
#'    \item{numeric.vars}{A logical vector indicating, for each input variable,
#'      if it was considered as a numeric variable.}
#'    \item{alpha, peeling.side, obj.fun}{The value of the arguments used for
#'      peeling. Useful for prim methods.}
#' 
#' @seealso \code{\link{peeling}} for the peeling algorithm used in the
#'    function. \code{\link{plot_trajectory}} to analyse the cross-validated
#'    trajectory.
#'
#' @references
#'    Friedman, J.H., Fisher, N.I., 1999. Bump hunting in high-dimensional data.
#'      Statistics and Computing 9, 123-143. 
#'      https://doi.org/10.1023/A:1008894516817
#'
#' @examples
#'    # A simple bump
#'    set.seed(12345)
#'    x <- matrix(runif(2000), ncol = 2, dimnames = list(NULL, c("x1", "x2")))
#'    y <- 2 * x[,1] + 5 * x[,2] + 10 * (x[,1] >= .8 & x[,2] >= .5) + rnorm(1000)
#'    
#'    # 10-fold cross-validation
#'    cv_res <- cv.trajectory(y, x)
#'    
#'    # Display the cross-validated trajectory
#'    plot_trajectory(cv_res, type = "b", pch = 16, col = "cornflowerblue", 
#'      support = 0.1, npeel = which.max(cv_res$yfun), 
#'      abline.pars = list(lwd = 2, col = "indianred"), 
#'      xlab = "", xlim = c(0, 0.2), ylim = c(10, 18))
#'
#' @export
cv.trajectory <- function(y, x, grid = NULL, 
  folds = NULL, nfolds = 10, ...)
{
  x <- as.data.frame(x)
  p <- ncol(x)
  n <- nrow(x)
  if (is.null(grid)){
    all_peel <- peeling(y, x, ...)
    grid <- all_peel$support
  }
  nsup <- length(grid)
  if (is.null(folds)){
    folds <- sample(rep_len(1:nfolds, n))
  } else {
    folds <- rep_len(folds, n)
    nfolds <- length(unique(folds))
  }
  fun.mat <- matrix(NA, nrow = nsup, ncol = nfolds)
  for (i in 1:nfolds){
      train_ind <- folds != i
      test_ind <- folds == i
      peeli <- peeling(y[train_ind], x[train_ind,], ...)
      traji <- predict.prim(peeli, newy = y[test_ind], newx = x[test_ind,],
        npeel = 0:peeli$npeel)        
      sup.inds <- sapply(grid, function(b){ 
        max(which(peeli$support >= b))
      })
      fun.mat[,i] <- traji$yfun[sup.inds]
  }
  final.traj <- apply(fun.mat, 1, mean, na.rm=T)
  n.traj <- apply(fun.mat, 1, function(x) sum(!is.na(x)))
  sd.traj <- apply(fun.mat, 1, stats::sd, na.rm=T)
  out <- list(support = grid, yfun = final.traj, se.yfun = sd.traj,
    npeel = nsup, x = x, y = y, alpha = peeli$alpha, 
    peeling.side = peeli$peeling.side, numeric.vars = peeli$numeric.vars,
    obj.fun = peeli$obj.fun)
  class(out) <- "cv.prim"
  return(out)
}