R/xgboostClassification.R

#' Classification using the xgboost algorithm.
#'
#' @title xgboost classification
#'
#' @param object An instance of class \code{"\linkS4class{MSnSet}"}.
#' @param assessRes An instance of class
#'     \code{"\linkS4class{GenRegRes}"}, as generated by
#'     \code{\link{xgboostOptimisation}}.
#' @param scores One of \code{"prediction"}, \code{"all"} or
#'     \code{"none"} to report the score for the predicted class
#'     only, for all classes or none.
#' @param max_depth If \code{assessRes} is missing, a \code{max_depth} must be
#'     provided.
#' @param gamma If \code{assessRes} is missing, a \code{gamma} must
#'     be provided.
#' @param nrounds The maximum number of boosting iterations passed to
#'     \code{\link[xgboost]{xgb.train}}. Default is 1000.
#' @param fcol The feature meta-data containing marker definitions.
#'     Default is \code{markers}.
#' @param ... Additional parameters passed to
#'     \code{\link[xgboost]{xgb.train}} from package \code{xgboost}.
#'
#' @return An instance of class \code{"\linkS4class{MSnSet}"} with
#'     \code{xgboost} and \code{xgboost.scores} feature variables storing the
#'     classification results and scores respectively.
#'
#' @export
#'
#' @examples
#' library(pRolocExtra)
#' library(pRolocdata)
#' data(tan2009r1)
#' # reducing iterations
#' params <- xgboostOptimisation(tan2009r1, times = 3)
#' params
#' plot(params)
#' f1Count(params)
#' levelPlot(params)
#' getParams(params)
#' res <- xgboostClassification(tan2009r1, params)
#' getPredictions(res, fcol = "xgboost")
#' getPredictions(res, fcol = "xgboost", t = 0.75)
#' plot2D(res, fcol = "xgboost")
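#' ## scores can also be reported for all classes, e.g.
#' res2 <- xgboostClassification(tan2009r1, params, scores = "all")
#' head(fData(res2)$xgboost.all.scores)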

xgboostClassification <- function(object, assessRes,
                                  scores = c("prediction", "all", "none"),
                                  max_depth, gamma, nrounds = 1000,
                                  fcol = "markers", ...)
{
  scores <- match.arg(scores)
  if (missing(assessRes)) {
    if (missing(max_depth) || missing(gamma))
      stop("First run 'xgboostOptimisation' or set 'max_depth' and 'gamma' manually.")
    params <- c(max_depth = max_depth, gamma = gamma)
  }
  else {
    params <- pRoloc::getParams(assessRes)
    if (is.na(params["max_depth"]))
      stop("No 'max_depth' found.")
    if (is.na(params["gamma"]))
      stop("No 'gamma' found.")
  }

  # split the data into a training set (labelled markers) and a test set (unknowns)
  trainInd <- which(fData(object)[, fcol] != "unknown")
  testInd <- which(fData(object)[, fcol] == "unknown")
  trainSet <- pRoloc:::subsetAsDataFrame(object, fcol, train = TRUE)
  testSet <- pRoloc:::subsetAsDataFrame(object, fcol, train = FALSE)
  .trainSep <- pRoloc:::separateDataSet(trainSet, fcol)
  .testSep <- pRoloc:::separateDataSet(testSet, fcol)

  # convert to xgb.DMatrix objects, the input format expected by xgboost
  train_xg <- xgboost::xgb.DMatrix(data = as.matrix(.trainSep$theData),
                                   label = as.integer(.trainSep$theLabels) - 1)
  test_xg <- xgboost::xgb.DMatrix(data = as.matrix(.testSep$theData),
                                  label = as.integer(.testSep$theLabels) - 1)
  params_xg <- list(
    booster = "gbtree",
    eta = 0.001,
    max_depth = params["max_depth"],
    gamma = params["gamma"],
    subsample = 0.75,
    colsample_bytree = 1,
    lambda = 1, alpha = 0,
    objective = "multi:softprob",
    eval_metric = "mlogloss",
    num_class = length(levels(.trainSep$theLabels))
  )
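
  # train the multi-class model, stopping early if the training log-loss
  # does not improve for 10 consecutive rounds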
  model <- xgboost::xgb.train(params = params_xg,
                              data = train_xg,
                              nrounds = nrounds,
                              early_stopping_rounds = 10,
                              watchlist = list(train = train_xg),
                              verbose = 0,
                              ...)

  # predict class membership probabilities for the test set
  ans <- as.data.frame(predict(model, test_xg, reshape = TRUE))
  colnames(ans) <- levels(.trainSep$theLabels)
  predictedLabels <- factor(apply(ans, 1,
                                  function(x) colnames(ans)[which.max(x)]),
                            levels = levels(.trainSep$theLabels))
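  # combine the original marker labels and the new predictions into a
  # single feature variable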
  temp <- rep("", length(trainInd) + length(testInd))
  i <- 1:length(trainInd)
  temp[trainInd[i]] <- as.character(.trainSep$theLabels[i])
  i <- 1:length(testInd)
  temp[testInd[i]] <- as.character(predictedLabels[i])
  fData(object)$xgboost <- temp

  # store the requested classification scores
  if (scores == "all") {
    nbLabels <- length(levels(.trainSep$theLabels))
    tempScores <- matrix(0, nrow = length(trainInd) + length(testInd),
                         ncol = nbLabels)
    colnames(tempScores) <- levels(.trainSep$theLabels)
    ## markers get a score of 1 for their known class
    for (i in seq_along(trainInd))
      tempScores[trainInd[i], as.character(.trainSep$theLabels[i])] <- 1
    ## unknowns get the predicted class probabilities
    tempScores[testInd, ] <- as.matrix(ans)
    scoreMat <- tempScores
    colnames(scoreMat) <- paste0(colnames(scoreMat), ".xgboost.scores")
    fData(object)$xgboost.all.scores <- scoreMat
  }
  else if (scores == "prediction") {
    tempScores <- rep(0, length(trainInd) + length(testInd))
    ## markers get a score of 1, unknowns the score of their predicted class
    tempScores[trainInd] <- 1
    tempScores[testInd] <- apply(ans, 1, max)
    fData(object)$xgboost.scores <- tempScores
  }
  object@processingData@processing <-
    c(processingData(object)@processing,
      paste0("Performed xgboost prediction (",
             paste(paste(names(params), params, sep = "="),
                   collapse = " "),
             ") ", date()))
  if (validObject(object))
    return(object)
}