primr: Patient rule-induction method

Documented in peeling

###############################################################################
#
#                   Peeling functions                             
#
###############################################################################

#' Top-down peeling
#'
#' Iteratively peels a dataset for bump hunting.
#'
#' @param y Numeric vector of response values.
#' @param x Numeric or categorical data.frame of input values.
#' @param alpha The peeling fraction of the algorithm. A value between 0 and 1
#'    giving the proportion of peeled observations at each step.
#' @param beta.stop The stopping support of the algorithm. A value 
#'    between 0 and 1 giving the proportion of remaining data below
#'    which the algorithm stops.
#' @param obj.fun The function of \code{y} to be maximized. Can be a user 
#'    defined function (see details).
#' @param peeling.side A numeric vector for side constraints on the peeling of 
#'    each input variable. -1 indicates peeling only the 'left' of the box
#'    (i.e. increasing the lower limit only), 1 indicate peeling only the
#'    'right' and 0 for no constraint.
#'
#' @details The function \code{peeling} carries out the top-down peeling 
#'    which is the first step of the PRIM algorithm. At each iteration 
#'    it peels a proportion \code{alpha} of data from one side of the domain
#'    in order to increase the value of the function \code{obj.fun} applied
#'    to the response \code{y}. The algorithm iterates the peeling until 
#'    the support of the box (i.e. the proportion of remaining observations)
#'    is below the value \code{beta.stop}.
#'
#'    Many function can be used in \code{obj.fun} including user defined 
#'    functions. User defined function should take two arguments: \code{y}
#'    and \code{x} representing corresponding variables 
#'    and \code{inbox} which is a boolean
#'    vector indicating the observations inside the current box.
#'    Note that a classical function can also be passed to \code{obj.fun}
#'    such as \code{mean}, \code{var} or \code{median}. In this case
#'    the function is created internally to fit the above structure. 
#'    For more functions more complicated than the basic ones,
#'    it is recommended that the user set its own function as stated
#'    above.  
#'    
#'    The function also allows directed peeling, i.e. to contraint the peeling
#'    occuring on a single side of some input variables. Thus when 
#'    \code{peeling.side = -1}, only the lower part of the variable is peeled
#'    (the "left" of the domain) and when \code{peeling.side = 1}, only the
#'    upper part of the variable is peeled. Note that a vector can be passed,
#'    thus applying different constraints to the input variables.
#'
#' @return A \code{prim} object which is a list with the following elements:
#'    \item{npeel}{The number of peeling iteration performed.}
#'    \item{support}{A vector of length \code{npeel + 1} containing the support 
#'      of each successivepeeled box.}
#'    \item{yfun}{A vector of length \code{npeel + 1} containing the objective 
#'      function value of each successive peeled box.}
#'    \item{limits}{A list of length \code{npeel + 1} containing the limits 
#'      of each successive box. Each limit is a list with one element per input 
#'      variable.}
#'    \item{x,y}{The input and response data used in the algorithm.}
#'    \item{numeric.vars}{A logical vector indicating, for each input variable,
#'      if it was considered as a numeric variable.}
#'    \item{alpha, peeling.side, obj.fun}{The value of the arguments used for
#'      peeling. Useful for prim methods.}
#'    \item{npaste}{Number of pasting iteration performed. Should be 0 here,
#'      but useful for \code{\link{pasting}}.}
#'
#'    Note that the first box in a \code{prim} object is the starting box
#'      containing the whole dataset. This is why the \code{limits},
#'      \code{yfun} and \code{support} elements have length \code{npeel + 1}.
#'
#' @seealso \code{\link{extract.box}} to extract information about a 
#'    particular box in a \code{prim} object. 
#'    \code{\link{plot_trajectory}} and \code{\link{plot_box}} to explore
#'    the peeling trajectory. \code{\link{jump.prim}} to automatically
#'    choose the best box. \code{\link{predict.prim}} to predict if new data
#'    falls into particular boxes. \code{\link{pasting}} to carry out the
#'    pasting refining the edges of the chosen box.
#'
#' @references
#'    Friedman, J.H., Fisher, N.I., 1999. Bump hunting in high-dimensional data.
#'      Statistics and Computing 9, 123-143. 
#'      https://doi.org/10.1023/A:1008894516817
#'
#' @examples
#'    # A simple bump
#'    set.seed(12345)
#'    x <- matrix(runif(2000), ncol = 2, dimnames = list(NULL, c("x1", "x2")))
#'    y <- 2 * x[,1] + 5 * x[,2] + 10 * (x[,1] >= .8 & x[,2] >= .5) + 
#'      rnorm(1000)
#'    # Peeling with alpha = 0.05 and beta.stop = 0.05
#'    peel_res <- peeling(y, x, beta.stop = 0.05)
#'    # Automatically choose the best box
#'    chosen <- jump.prim(peel_res)
#'    # Plot the resulting box
#'    plot_box(peel_res, pch = 16, ypalette = hcl.colors(10), 
#'      support = chosen$final.box$support, box.args = list(lwd = 2))
#'
#'    # Examples of directed peeling
#'    set.seed(12345)
#'    x <- matrix(runif(2000), ncol = 2, dimnames = list(NULL, c("x1", "x2")))
#'    y <- 10 * (x[,1] <= .2 & x[,2] <= .2) + 10 * (x[,1] >= .8 & x[,2] >= .8) +
#'      rnorm(1000)
#'    # Left peeling
#'    peel_left <- peeling(y, x, peeling.side = -1)
#'    chosen <- jump.prim(peel_left)
#'    plot_box(peel_left, pch = 16, ypalette = hcl.colors(10), 
#'      support = chosen$final.box$support, box.args = list(lwd = 2),
#'      main = "Left peeling")
#'    # Right peeling
#'    peel_right <- peeling(y, x, peeling.side = 1)
#'    chosen <- jump.prim(peel_right)
#'    plot_box(peel_right, pch = 16, ypalette = hcl.colors(10), 
#'      support = chosen$final.box$support, box.args = list(lwd = 2),
#'      main = "Right peeling")
#'
#'    # User-defined objective function to minimize the mean
#'    set.seed(3333)
#'    x <- matrix(runif(2000), ncol = 2, dimnames = list(NULL, c("x1", "x2")))
#'    y <- - 10 * (x[,1] <= .2 & x[,2] <= .2) + 10 * (x[,1] >= .8 & x[,2] >= .8) +
#'      rnorm(1000)
#'    peel_res <- peeling(y, x, obj.fun = function(x) -mean(x))
#'    chosen <- jump.prim(peel_res)
#'    plot_box(peel_res, pch = 16, ypalette = hcl.colors(10), 
#'      support = chosen$final.box$support, box.args = list(lwd = 2))
#'
#'    # User-defined function maximizing the slope of a linear regression
#'    set.seed(5555)
#'    x <- runif(500)
#'    ym <- 0.5 * x + 5 * (x - 0.7) * (x >= 0.7)
#'    y <- ym + rnorm(500, sd = 0.1)    
#'    peel_res <- peeling(y, x, beta.stop = 0.1, 
#'      obj.fun = function(y, x, inbox){
#'        dat <- data.frame(y, x)
#'        coef(lm(y ~ x, data = dat[inbox,]))[2]
#'    })   
#'    par(mfrow = c(1,2))
#'    plot_trajectory(peel_res, type = "b", pch = 16, col = "cornflowerblue", 
#'      support = 0.3, abline.pars = list(lwd = 2, col = "indianred"))
#'    plot_box(peel_res, pch = 16, ypalette = hcl.colors(10), 
#'      support = 0.3, box.args = list(lwd = 2))
#'    lines(sort(x), ym[order(x)], col = "red", lwd = 2)
#'    
#' @export
peeling <- function(y, x, alpha = 0.05, beta.stop = 0.01, 
  obj.fun = mean, peeling.side = 0)  
{
    x <- as.data.frame(x)
    p <- ncol(x)
    n <- nrow(x)
    numeric.vars <- sapply(x, is.numeric)
    nnumeric <- sum(numeric.vars)
    x[!numeric.vars] <- lapply(x[!numeric.vars], as.factor)
    peeling.side <- rep_len(peeling.side, nnumeric)
    nstep <- ceiling(log(beta.stop)/log(1-alpha))
    support <- yfun <- numeric(nstep + 1)
    limits <- vector("list", nstep+1)
    support[1] <- 1    
    obj.fun <- construct_objfun(obj.fun)
    yfun[1] <- do.call(obj.fun, list(y = y, x = x, inbox = rep(T, n)))
    limits[[1]] <- vector("list", p)
    limits[[1]][numeric.vars] <- lapply(x[,numeric.vars, drop = F], range)
    limits[[1]][!numeric.vars] <- lapply(x[,!numeric.vars, drop = F], unique)
    count <- 1
    while (support[count] > beta.stop){
      new.peel <- peel(y = y, x = x, alpha = alpha, obj.fun = obj.fun, 
        peeling.side = peeling.side, limits = limits[[count]], 
        numeric.vars = numeric.vars)
      count <- count + 1
      support[count] <- new.peel$support
      yfun[count] <- new.peel$yfun
      limits[[count]] <- new.peel$limits
      if (count == (nstep + 1) || 
        length(unique(y[in.box(x, limits[[count]])])) == 1){ 
          break
      }
    }
    if (count < nstep){
      irem <- count:nstep + 1
      support <- support[-irem]
      yfun <- yfun[-irem]
      limits <- limits[-irem]
    }  
    out <- list(npeel = count - 1, support = support, yfun = yfun, 
      limits = limits, x = x, y = y, numeric.vars = numeric.vars, alpha = alpha, 
      peeling.side = peeling.side, obj.fun = obj.fun,
      npaste = 0)
    class(out) <- "prim"
    return(out)
}

peel <- function(y, x, alpha = 0.05, obj.fun = mean, limits, 
  numeric.vars = rep(TRUE, ncol(x)), peeling.side = rep(0,sum(numeric.vars)))
{
    p <- ncol(x)
    n <- nrow(x)
    yfun <- -Inf
    nnumeric <- sum(numeric.vars)
    numinds <- which(numeric.vars)
    inbox <- in.box(x, limits)
    yin <- y[inbox]
    xin <- x[inbox,, drop = FALSE]
    for (j in 1:p){
      if (numeric.vars[j]){
        boxes <- list(c(alpha, 1), c(0, 1 - alpha))
        boxes <- boxes[peeling.side[numinds == j] != c(1, -1)]
      } else {
        if (length(limits[[j]]) > 1){
          boxes <- limits[[j]]
        } else {
          next
        }
      }            
      for (k in 1:length(boxes)){
        if (numeric.vars[j]){
          newlims <- stats::quantile(xin[,j], boxes[[k]])
          movinglim <- which(boxes[[k]] %in% c(alpha,1 - alpha))
          if (newlims[movinglim] == limits[[j]][movinglim]){ 
          # To manage the case in which there are many ties 
            newlims[movinglim] <- sort(unique(xin[,j]), 
              decreasing = as.logical(movinglim - 1))[2]
          }
          inboxk <- x[,j] >= newlims[1] & x[,j] <= newlims[2]
        } else {
          inboxk <- x[,j] != boxes[k]
          newlims <- limits[[j]][-k]
        }
        jyfun <- do.call(obj.fun, list(y = y, x = x, inbox = inbox & inboxk))
        if (jyfun > yfun){
           finbox <- inbox & inboxk
           yfun <- jyfun
        }
      }      
    }
    # Final readjustment of limits
    final.x <- x[finbox,,drop = FALSE] 
    new.limits <- Map(function(xj, numj){
      if (numj){
        out <- range(xj)
      } else {
        out <- unique(xj)
      }
    }, final.x, numeric.vars)
    return(list(limits = new.limits, yfun = yfun, support = mean(finbox)))
}