R/SurvTreeLaplaceHazards_ranger.R
In discSurv: Discrete Time Survival Analysis

Documented in survTreeLaplaceHazardRanger

#' Laplace Hazards for a Competing Risk Survival Tree Object
#'
#' Predicts the laplace-smoothed hazards of discrete survival data based
#' on a survival tree from class "ranger". Currently only single-risk data is supported.
#' @param treeModel Fitted tree object as generated by "ranger" ("class data.frame"). Must be a single ranger tree.
#' @param rangerdata Original training data with which \emph{treeModel} was fitted ("class data.frame"). Must be in
#' long format.
#' @param newdata Data in long format for which hazards are to be computed ("class data.frame"). Must 
#' contain the same columns that were used for tree fitting.
#' @param lambda Smoothing parameter for laplace-smoothing ("class data.frame"). Must be a non-negative 
#' number. A value of zero corresponds to no smoothing.
#' @return A m by k matrix with m being the length of newdata and k being the 
#' number of classes in \emph{treeModel}. Each row corresponds to the smoothed hazard 
#' of the respective observation.
#'
#' @keywords survival
#' @examples
#' library(pec)
#' library(caret)
#' library(ranger)
#' data(cost)
#' # Take subsample and convert time to years
#' cost$time <- ceiling(cost$time/365)
#' costSubTrain <- cost[1:50,]
#' costSubTest <- cost[51:70,]
#' # Specify column names for data augmentation
#' timeColumn<-"time"
#' eventColumn<-"status"
#' costSubTrainLong <- dataLong(costSubTrain, timeColumn, eventColumn)
#' costSubTestLong <- dataLong(costSubTest, timeColumn, eventColumn)
#' #create tree
#' formula <- y ~ timeInt + diabetes + prevStroke + age + sex
#' rangerTree <- ranger(formula, costSubTrainLong, num.trees = 1, mtry = 5, 
#' classification = TRUE, splitrule = "hellinger", replace = FALSE, 
#' sample.fraction = 1, max.depth = 5)
#' #compute laplace-smoothed hazards
#' laplHaz <- survTreeLaplaceHazardRanger(rangerTree, costSubTrainLong, 
#' costSubTestLong, lambda = 1)
#' laplHaz
#' @export survTreeLaplaceHazardRanger
survTreeLaplaceHazardRanger <- function(treeModel, rangerdata, newdata, lambda)
{
  if(!is.data.frame(newdata)) {stop("Argument *newdata* is not in the correct format! Please specify as data.frame object.")}
  if(!is.data.frame(rangerdata)) {stop("Argument *rangerdata* is not in the correct format! Please specify as data.frame object.")}
  if(lambda < 0|!is.numeric(lambda)|length(lambda)!=1)
  {
    stop("Lambda must be a non-negative number.")
  }
  if(!all(unique(treeModel$frame$var)[-which(unique(treeModel$frame$var) == "<leaf>")]
          %in% colnames(newdata)))
  {
    stop("Newdata does not contain the same covariates as the tree model.")
  }
  if(is.null(rangerdata$y))
  {
    rangerdata$y <- as.numeric(factor(rangerdata$responses)) - 1
  }
  #get predicted nodes of original data frame
  pred_nodes_rangerdata <- cbind(predict(treeModel, rangerdata, type = "terminalNodes")$predictions, rangerdata$y)
  #get event and node count
  n_ev <- sort(unique(rangerdata$y)) - 1
  uq_row <- sort(unique(pred_nodes_rangerdata[, 1]))
  #get absolute and laplace-smoothed relative frequency of observations per event per node
  ev_node <- expand.grid(uq_row, n_ev)
  freq_vals <- apply(ev_node, 1, function(x) length(which(rowSums(sweep(pred_nodes_rangerdata, 2, x, "!="))==0)))
  freq_table <- matrix(freq_vals + lambda, nrow = length(uq_row), byrow = TRUE)
  freq_table <- t(apply(freq_table, 1, function(x) x / sum(x)))
  #get predicted nodes for new data
  pred_nodes <- factor(predict(treeModel,data = newdata, type = "terminalNodes")$predictions,levels = uq_row)
  hazards_fitted <- freq_table[pred_nodes, ]
  return(hazards_fitted)
}