R/xgez.R

Defines functions xg_load_data xg_train xg_gs xg_predict xg_auto_ml xg_bo

Documented in xg_auto_ml xg_bo xg_gs xg_load_data xg_predict xg_train

###############################################################################
#                                    __  __                                   #
#                             ___ ___\ \/ /__ _                               #
#                            / _ \_  /\  // _` |                              #
#                           |  __// / /  \ (_| |                              #
#                           \___/___/_/\_\__, |                               #
#                                        |___/                                #
#                        Easy Xgboost Implementation                          #
###############################################################################

# Load Data -----

#' Load Data
#'
#' \code{xg_load_data} returns a list with all the prepared element for loading
#' and preparing the data for xgboost modelling. The model principally relies
#' on two categories of input: numeric (\emph{num}) and category (\emph{cat}).
#'
#' @param file \strong{Character}. The link to the file containing the data.
#'  The data are imported with the \code{fread} function from
#'  the \code{data.table} package (\link[data.table]{fread}), so the format
#'  must be consistent with a csv file.
#' @param inputs \strong{Character vector}. Vector of the column names for
#' the inputs of the model. Only those columns will be used for the model.
#' Using the "auto" value will use as inputs all the columns from the
#' table except the one labelled as output.
#' @param output \strong{Character}. A single string specifying the name
#' of the output column for the model training.
#' @param inputs.class \strong{Character vector}. A vector specifying the
#' classes for the input column. If set to "auto", the classes will be
#' determined from the output of the \link[data.table]{fread} function.
#' Else, it must me a vector whose size is exactly the number of input and
#' whose values can only be \emph{num} (for numerical inputs) and \emph{cat}
#'  (for categorical inputs).
#' @param output.class \strong{Character}. Class for output. If set to "auto",
#' the class will be determide from the output of the \link[data.table]{fread}
#' function. Else, it must be equal to \emph{num} or \emph{cat} for numerical
#' or categorical inputs.
#' @param train.size \strong{Numeric}. Size for training set for the future
#' model. Can go from 0 (no training set: will produce an error) to 1 (no test
#' set).
#' @param seed \strong{Numeric}. Seed for reproducibility of the results.
#' @param na.handle \strong{Character}. Way to handle na value in numeric
#' inputs. Five possibilities have been implemented:
#' \itemize{
#'  \item{\emph{inf}}: replace missing values with \code{Inf}.
#'  \item{\emph{mean}}: replace missing values with the mean of the column.
#'  \item{\emph{median}}: replace missing values with the median of the column.
#'  \item{\emph{max}}: replace missing values with the max of the column.
#'  \item{\emph{min}}: replace missing values with the min of the column.
#' }
#' @param max.levels \strong{Numeric}. Maximum number of levels admitted for
#' a category. This parameters is here to make sure that the model does not
#' have to many input data when transformed into a one-hot encoded matrix.
#'
#' @return A list with following values:
#' \itemize{
#'  \item{\strong{train}}: training set for the model, with a matrix for the
#'  input values and a vector for the target variables.
#'  \item{\strong{test}}: test set for the model, on the same format that the
#'  training set
#'  \item{\strong{formula}}: the formula used for constructing the model matrix
#'  and that is applied when running the model.
#'  \item{\strong{template}}: an empty \code{data.table} that has saved all the
#'  input values and that is used to appropriately format data when using
#'  the prediction function.
#'  \item{\strong{data}}: A data.table with the cleaned data and an additional
#'  logical column, \emph{train}, that indicates which data are used in the
#'  training data set.
#'  \item{\strong{na.handle}}: passed to reapply to prediction
#' }
#'
#' @import data.table
#'
#' @examples
#' d <- xg_load_data(system.file("extdata", "titanic.csv", package = "ezXg"),
#'                inputs = c("Pclass", "Sex", "Age", "SibSp",
#'                           "Parch", "Fare", "Embarked"),
#'                output = "Survived")
#'
#' @export
xg_load_data <- function(file,
                         inputs = "auto",
                         output,
                         inputs.class = "auto",
                         output.class = "auto",
                         train.size = 1,
                         seed = 1,
                         na.handle = "inf",
                         max.levels = 50){
  # Load the data
  d <- data.table::fread( file, stringsAsFactors = T)

  # Compute the input names in case their value is "auto"
  if (inputs[1] == "auto"){
    inputs <- colnames(d)[colnames(d) != output]
  }

  # Validation for inputs
  # Check that na.handle is in the accepted values
  if (!(na.handle %in% c("inf", "min", "max", "mean", "median"))){
    stop(na.handle,
         " is an invalid value for na.handle. Please check documentation
         for authorized values.")
  }
  # Check that the inputs classes are consistent
  if (inputs.class[1] != "auto"){
    if (length(inputs.class) != length(inputs)){
      stop("Unconsitent number of inputs and classes for inputs. The values
           should be the same")
    }
    if (!all(inputs.class %in% c("num", "cat"))){
      stop("Invalid input classes. Authorized values are 'num' and 'cat'.")
    }
  }
  # Check consistency for output
  if (length(output.class) != 1 | length(output) != 1){
    stop("Only one output may be trained at a time.")
    if (!all(output.class %in% c("num", "cat"))){
      stop("Invalid output class. Authorized values are 'num' and 'cat'.")
    }
  }
  # Check train size consistency
  if (train.size > 1 | train.size <= 0){
    stop("Unvalid train size. The value should be between 0 and 1.")
  }

  # Select relevant columns
  d <- d[, .SD, .SDcols = c(inputs, output)]

  # Input data type management
  if (inputs.class[1] != "auto"){
    # On numerical values
    sc <- inputs[inputs.class == "num"]
    if (length(sc) > 0){
      d[, (sc) := lapply(.SD, function(x) as.numeric(gsub(",",
                                                          ".",
                                                          x,
                                                          fixed = T))),
        .SDcols = sc]
    }
    # On categorical values
    sc <- inputs[inputs.class == "cat"]
    if (length(sc) > 0){
      d[, (sc) := lapply(.SD, function(x) factor(x, exclude = NULL)),
        .SDcols = sc]
    }
  }

  # Output data management
  # Apply numerical transformation on output
  if (output.class == "num"){
    d[, (output) := lapply(.SD,
                           function(x) as.numeric(gsub(",",
                                                       ".",
                                                       x,
                                                       fixed = T))),
      .SDcols = output]
  }
  # Apply categorical transformation on output
  if (output.class == "cat"){
    d[, (output) := lapply(.SD, function(x) factor(x, exclude = NULL)),
      .SDcols = output]
  }

  # Test if the number of levels is consistent
  n.lev <- unlist(d[, lapply(.SD, function(x) length(levels(x)))])
  if (!all(n.lev <= max.levels)){
    stop(cat("Too many levels on category: ",
             names(n.lev)[n.lev > max.levels][1],
             ". Max authorized value is ",
             max.levels, "."))
  }

  # NA management
  # selection of the numeric column
  sc <- sapply(d, class)
  sc <- names(sc[sc %in% c("integer", "numeric")])
  if (length(sc) > 0){
    # case infinity
    if (na.handle == "inf"){
      d[, (sc) := lapply(.SD, function(x) ifelse(is.na(x), Inf, x)),
        .SDcols = sc]
    } else {
      # case function
      fn <- function(x) {
        x[is.na(x)] <- eval(parse(text = paste0(na.handle,
                                                "(x, na.rm = T)")))
        return(x)
      }
      d[, (sc) := lapply(.SD, fn), .SDcols = sc]
    }
  }
  # Remove remaining NA
  d <- na.omit(d)

  # Create data partition
  # Set seed (for reproducibility)
  set.seed(seed)
  if (class(d[, get(output)]) == "factor"){
    d[, train := sample(.N) / .N <= train.size, by = output]
    t <- which(d$train)
  } else{
    d[order(get(output)), quantile :=  cut(1:.N,
                                           quantile(1:.N,
                                                    seq(0, 1, 0.05)))]
    d[, train := sample(.N) / .N <= train.size, by = quantile]
    t <- which(d$train)
    d[, quantile := NULL]
  }

  # Create model matrix
  formula <- paste0("~-1+", paste(inputs, collapse = "+"))
  m.d <- model.matrix(as.formula(formula), d)
  # Creation of the xgb Matrix
  dtrain <- list(data = m.d[t, ],
                 label = d[t, get(output)])
  # Creation of the test matrix
  dtest <- list(data = m.d[-t, ],
                label = d[-t, get(output)])

  # Return values
  return(list(train = dtrain,
              test = dtest,
              formula = formula,
              template = d[0, inputs, with = F],
              na.handle = na.handle,
              data = d))
}

# Calibrate Model -----

#' Calibrate Model
#'
#' \code{xg_train} trains an xgboost model in order on the data structure
#' generated by the \link[ezXg]{xg_load_data} function.
#'
#' @param data \strong{Object}. A data structure created by the call of the
#' \link[ezXg]{xg_load_data} function.
#' @param eta \strong{Numeric}. Eta parameter for xgboost calibration.
#' See \link[xgboost]{xgb.train} for more details.
#' @param gamma \strong{Numeric}. Gamma parameter for xgboost calibration.
#' See \link[xgboost]{xgb.train} for more details.
#' @param max_depth \strong{Numeric}. Max_depth parameter for xgboost
#' calibration. See \link[xgboost]{xgb.train} for more details.
#' @param colsample_bytree \strong{Numeric}. Colsample_bytree parameter
#' for xgboost calibration. See \link[xgboost]{xgb.train} for more details.
#' @param min_child_weight \strong{Numeric}. Min_child_weight parameter for
#' xgboost calibration. See \link[xgboost]{xgb.train} for more details.
#' @param nrounds \strong{Numeric}. Nrounds parameter for xgboost calibration.
#' See \link[xgboost]{xgb.train} for more details.
#' @param nthread \strong{Numeric}. Nthread parameter for xgboost calibration.
#' See \link[xgboost]{xgb.train} for more details.
#' @param verbose \strong{Numeric}. Verbose parameter for xgboost calibration.
#' See \link[xgboost]{xgb.train} for more details.
#' @param cv \strong{Numeric}. Number of folds in cross validation. If this
#' parameter is set to 1, this means that cross-validation will not be
#' performed.
#' @param seed \strong{Numeric}. Seed for computation reproducability.
#' @param objective \strong{Character}. Objective function for the
#' optimization. . Eta parameter for xgboost calibration. See
#'  \link[xgboost]{xgb.train} for more details. Can be set to \emph{auto}
#'  in order to let the function choose the better model regarding the
#'  output variable.
#'
#' @return A trained model with following values:
#' \itemize{
#'  \item{\strong{model}}: calibrated model as returned by the
#'  \link[xgboost]{xgb.train} function.
#'  \item{\strong{ntree}}: optimal number of tree according to the test set.
#'  \item{\strong{formula}}: the formula used for constructing the model matrix
#'  and that is applied when running the model.
#'  \item{\strong{template}}: an empty \code{data.table} that has saved all the
#'  input values and that is used to appropriately format data when using
#'  the prediction function.
#'  \item{\strong{labels}}: The possible labels for prediction when performing
#'  a classification task with xgboost.
#'  \item{\strong{na.handle}}: passed to reapply to prediction
#' }
#' In case the parameter \emph{cv} is set to anithing but 1, the function
#' returns a 1 line data.table with the average error on the
#' cross-validation.
#'
#' @import data.table
#' @import xgboost
#'
#' @examples
#' d <- xg_load_data(system.file("extdata", "titanic.csv", package = "ezXg"),
#'                inputs = c("Pclass", "Sex", "Age", "SibSp",
#'                           "Parch", "Fare", "Embarked"),
#'                output = "Survived",
#'                train.size = 0.8)
#' md <- xg_train(d)
#'
#' @export
xg_train <- function(data,
                     eta = 0.3,
                     gamma = 0,
                     max_depth = 6,
                     colsample_bytree = 1,
                     min_child_weight = 1,
                     nrounds = 100,
                     nthread = 2,
                     verbose = 1,
                     cv = 1,
                     seed = 1,
                     objective = "auto"){
  # Copy data
  d <- data
  # Seed value
  set.seed(seed)

  # definition of the objective function
  if (objective == "auto"){
    if (is.numeric(d$train$label)){
      # Regression
      if (all(d$train$label %in% c(0, 1))){
        objective <- "reg:logistic"
      } else {
        objective <- "reg:linear"
      }
    } else {
      # Classification
      if ( length(levels(d$train$label)) == 2){
        objective <- "binary:logistic"
      } else {
        objective <- "multi:softprob"
      }
    }
  }

  # Labels defintion in case of classification
  if (objective %in% c("reg:linear", "reg:logistic")){
    lab <- NULL
  } else if (objective %in% c("binary:logistic", "multi:softprob")){
    lab <- levels(d$train$label)
    num_class <- length(lab)
    d$train$label <- as.numeric(d$train$label) - 1
    d$test$label <- as.numeric(d$test$label) - 1
  }

  # Definition of the parameters for running the xgboost model
  params <- list(eta = eta,
                 gamma = gamma,
                 max_depth = max_depth,
                 colsample_bytree = colsample_bytree,
                 min_child_weight = min_child_weight,
                 nrounds = nrounds,
                 nthread = nthread,
                 objective = objective)
  if (objective == "multi:softprob"){
    params$num_class <- num_class
  }

  # Creation of the xgb matrices
  dtrain <- xgb.DMatrix(data = d$train$data, label = d$train$label)
  dtest <- xgb.DMatrix(data = d$test$data, label = d$test$label)

  # Cross validation or not
  if (cv < 2){
    # No cross validation
    if (length(d$test$label) > 0){
      # Case of a test dataset
      # Creation of the watchlist
      watchlist <- list(train = dtrain, test = dtest)
      # Run the model
      bst <- xgb.train(data = dtrain,
                       params = params,
                       nrounds = nrounds,
                       watchlist = watchlist,
                       verbose = verbose)
      # Return the output
      return(
        list(model = bst,
             ntree = as.numeric(which.min(unlist(bst$evaluation_log[, 3]))),
             template = d$template,
             formula = d$formula,
             labels = lab,
             na.handle = data$na.handle)
      )
    } else {
      # Case of no test dataset
      # Creation of the watchlist
      watchlist <- list(train = dtrain)
      # Run the model
      bst <- xgb.train(data = dtrain,
                       params = params,
                       nrounds = nrounds,
                       watchlist = watchlist,
                       verbose = verbose)
      # Return the output
      return(
        list(model = bst,
             ntree = nrounds,
             template = d$template,
             formula = d$formula,
             labels = lab,
             na.handle = data$na.handle)
      )
    }
  } else{
    # case of a cross validation
    # Create partition
    Xtrain <- d$train$data
    ytrain <- data.table(lab = d$train$label)
    if (class(ytrain$lab) == "factor"){
      ytrain[, class := sample(1:cv, .N, replace = T), by = lab]
    } else{
      ytrain[order(lab),
             quantile :=  cut(1:.N, quantile(1:.N, seq(0, 1, 0.05)))]
      ytrain[, class := sample(1:cv, .N, replace = T), by = quantile]
    }
    # Loop on model
    rez <- data.table()
    for (i in 1:cv){
      if (verbose > 0){
        cat("           Fold", i, "/", cv, "\n")
        Sys.sleep(1)
      }
      # Training and test matrices
      dtrain <- xgb.DMatrix(data = Xtrain[ytrain$class != i, ],
                            label = ytrain[class != i, (lab)])
      dtest <- xgb.DMatrix(data = Xtrain[ytrain$class == i, ],
                           label = ytrain[class == i, (lab)])
      watchlist <- list(train = dtrain, test = dtest)
      # Model training
      bst <- xgb.train(data = dtrain,
                       params = params,
                       nrounds = nrounds,
                       watchlist = watchlist,
                       verbose = verbose)
      # Model validation
      rez <- rbindlist(list(rez,
                            data.table(cv = i,
                                       tree = as.numeric(which.min(unlist(bst$evaluation_log[, 3]))),
                                       train = as.numeric(bst$evaluation_log[which.min(unlist(bst$evaluation_log[, 3])), 2]),
                                       test = min(unlist(bst$evaluation_log[, 3])))))
    }
    # Return value
    return(cbind(as.data.table(params), rez))
  }
}

# Grid Search -----

#' Grid search
#'
#' \code{xg_gs} use coordinate descent
#'  (\url{https://en.wikipedia.org/wiki/Coordinate_descent}) in order
#'  to select the best set of parameters for an xgboost model. At the end
#'  of the coordinate descent algorithm, a full search on each of the
#'  individual parameter vectors is made in order to potentially improve
#'  the selection.
#'
#' @param data \strong{Object}. A data structure created by the call of the
#' \link[ezXg]{xg_load_data} function.
#' @param eta \strong{Numeric Vectors}. Eta parameter list for grid search.
#' See \link[xgboost]{xgb.train} for more details.
#' @param gamma \strong{Numeric Vector}. Gamma parameter list for grid search.
#' See \link[xgboost]{xgb.train} for more details.
#' @param max_depth \strong{Numeric Vector}. Max_depth parameter list for grid
#'  search. See \link[xgboost]{xgb.train} for more details.
#' @param colsample_bytree \strong{Numeric Vector}. Colsample_bytree parameter
#' list for grid search. See \link[xgboost]{xgb.train} for more details.
#' @param min_child_weight \strong{Numeric Vector}. Min_child_weight parameter
#' list for grid search. See \link[xgboost]{xgb.train} for more details.
#' @param nrounds \strong{Numeric}. Nrounds parameter for xgboost calibration.
#' See \link[xgboost]{xgb.train} for more details.
#' @param nthread \strong{Numeric}. Nthread parameter for xgboost calibration.
#' See \link[xgboost]{xgb.train} for more details.
#' @param verbose \strong{Logical}. Verbose parameter for grid search.
#' @param cv \strong{Numeric}. Number of folds in cross validation. Needs
#' to be more than 2.
#' @param seed \strong{Numeric}. Seed for computation reproducability.
#' @param objective \strong{Character}. Objective function for the
#' optimization. . Eta parameter for xgboost calibration. See
#'  \link[xgboost]{xgb.train} for more details. Can be set to \emph{auto}
#'  in order to let the function choose the better model regarding the
#'  output variable.
#'
#' @return The optimization results with the following fields:
#' \itemize{
#'  \item{\strong{param}}: the optimal set of parameters.
#'  \item{\strong{err}}: the error associated to the optimal parameter
#'  set.
#'  \item{\strong{results}}: the history of the results for the
#'  cross-validation with all the tested sets of parameters.
#' }
#'
#' @import data.table
#' @import xgboost
#' @importFrom stats as.formula median model.matrix na.omit predict quantile
#'
#' @examples
#' d <- xg_load_data(system.file("extdata", "titanic.csv", package = "ezXg"),
#'                inputs = c("Pclass", "Sex", "Age", "SibSp",
#'                           "Parch", "Fare", "Embarked"),
#'                output = "Survived",
#'                train.size = 0.8)
#' t <- xg_gs(d)
#'
#' @export
xg_gs <- function(data,
                  eta = c(0.05, 0.10, 0.15, 0.20, 0.25, 0.30),
                  gamma = c(0, 0.1, 0.2, 0.3, 0.4, 0.5),
                  max_depth = c(1, 3, 4, 5, 6, 8, 10, 12, 15),
                  colsample_bytree = c(0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1),
                  min_child_weight = c(1, 3, 5, 7),
                  nrounds = 100,
                  nthread = 2,
                  cv = 5,
                  seed = 1,
                  verbose = TRUE,
                  objective = "auto"){

  # Check the parameters
  if (cv < 2) stop("Invalid number of cross validation")

  # Optimization function
  fn <- function(x){
    md <- xg_train(data,
                   cv = cv,
                   verbose = 0,
                   eta = x[1],
                   gamma = x[2],
                   max_depth = x[3],
                   colsample_bytree = x[4],
                   min_child_weight = x[5],
                   nthread = nthread,
                   nrounds = nrounds,
                   seed = seed,
                   objective = objective
    )
    return(mean(md$test))
  }

  # Definition of the grid
  grid <- list(eta = eta,
               gamma = gamma,
               max_depth = max_depth,
               colsample_bytree = colsample_bytree,
               min_child_weight = min_child_weight)

  # Initial point
  ini.pos <- list( eta = round(median(1:length(eta))),
                   gamma = round(median(1:length(gamma))),
                   max_depth = round(median(1:length(max_depth))),
                   colsample_bytree = round(median(1:length(colsample_bytree))),
                   min_child_weight = round(median(1:length(min_child_weight))))

  # Initialization of the error
  ini.err <- fn(sapply(1:5, function(x) grid[[x]][ini.pos[[x]]]))
  best.pos <- ini.pos
  best.err <- ini.err
  rez <- cbind(as.data.table(ini.pos), data.table(err = ini.err))

  # Print the initial state
  if (verbose){
    cat("Initializing with value:\n")
    cat("             eta:", grid$eta[ini.pos$eta], "\n")
    cat("             gamma:", grid$gamma[ini.pos$gamma], "\n")
    cat("             max_depth:", grid$max_depth[ini.pos$max_depth], "\n")
    cat("             colsample_bytree:",
        grid$colsample_bytree[ini.pos$colsample_bytree], "\n")
    cat("             min_child_weight:",
        grid$min_child_weight[ini.pos$min_child_weight], "\n")
    cat("with error :", best.err, "\n")
  }

  # While loop as long as there is improvement
  imp <- T
  while (imp){
    temp.rez <- data.table()
    for (i in 1:5){
      if (ini.pos[[i]] > 1){
        pos <- ini.pos
        pos[[i]] <- pos[[i]] - 1
        err <- fn(sapply(1:5, function(x) grid[[x]][pos[[x]]]))
        temp.rez <- rbind(temp.rez,
                          cbind(as.data.table(pos), data.table(err = err)))
        if (err < best.err){
          best.pos <- pos
          best.err <- err
        }
      }
      if (ini.pos[[i]] < length(grid[[i]])){
        pos <- ini.pos
        pos[[i]] <- pos[[i]] + 1
        err <- fn(sapply(1:5, function(x) grid[[x]][pos[[x]]]))
        temp.rez <- rbind(temp.rez,
                          cbind(as.data.table(pos), data.table(err = err)))
        if (err < best.err){
          best.pos <- pos
          best.err <- err
        }
      }
    }
    rez <- rbind(rez, temp.rez)
    if (best.err < ini.err){
      val.modif <- names(which(unlist(ini.pos) - unlist(best.pos) != 0))
      if (verbose){
        cat(val.modif,
            "switched from",
            grid[[val.modif]][ini.pos[[val.modif]]],
            "to",
            grid[[val.modif]][best.pos[[val.modif]]],
            "for new error",
            best.err,
            "\n"
        )
      }
      ini.pos <- best.pos
      ini.err <- best.err
    } else{
      imp <- F
    }
  }
  # From the best position, check all values on univariates.
  for (i in 1:5){
    topass <- best.pos[[i]]
    for (j in 1:length(grid[[i]])){
      if (j != topass){
        pos <- best.pos
        pos[[i]] <- j
        err <- fn(sapply(1:5, function(x) grid[[x]][pos[[x]]]))
        rez <- rbind(rez,
                     cbind(as.data.table(pos), data.table(err = err)))
        if (err < best.err){
          val.modif <- names(which(unlist(pos) - unlist(best.pos) != 0))
          if (verbose){
            cat(val.modif,
                "switched from",
                grid[[val.modif]][best.pos[[val.modif]]],
                "to",
                grid[[val.modif]][pos[[val.modif]]],
                "for new error",
                err,
                "\n"
            )
          }
          best.pos <- pos
          best.err <- err
        }
      }
    }
  }

  # Select the best parameters
  best.param <- sapply(1:5, function(x) grid[[x]][best.pos[[x]]])
  names(best.param) <- names(best.pos)
  for (i in 1:5){
    rez[, c(names(rez)[i]) := grid[[i]][unlist(rez[, i, with = F])]]
  }

  # Return the results
  return(list(param = best.param,
              err = best.err,
              results = rez))
}

# Predict Function -----

#' Predict on new values
#'
#' \code{xg_predict} use the previously trained xgboost model to perform
#' prediction on new data.
#'
#' @param model \strong{Object} A model object created by the function
#' \link[ezXg]{xg_train}.
#' @param data \strong{data.frame}. A data.frame or data.table structure with
#' column names equal to the input names.
#'
#' @return The prediction with the following fields:
#' \itemize{
#'  \item{\strong{pred}}: the prediction for the model.
#'  \item{\strong{proba}}: (optionnal) the associated probabilities for the
#'  prediction in case of a classification
#' }
#'
#' @import data.table
#' @import xgboost
#'
#' @examples
#' d <- xg_load_data(system.file("extdata", "titanic.csv", package = "ezXg"),
#'                inputs = c("Pclass", "Sex", "Age", "SibSp",
#'                           "Parch", "Fare", "Embarked"),
#'                output = "Survived",
#'                train.size = 0.8)
#' md <- xg_train(d)
#' p <- xg_predict(md, d$data[train == FALSE])
#'
#' @export
xg_predict <- function(model,
                       data){
  # formating data
  col <- sapply(model$template, class)
  data <- data.table(data)
  col2cat <- names(which(col == "factor"))
  for (c in col2cat){
    data[, (c) := factor(as.character(get(c)),
                         levels = levels(model$template[, get(c)]))]
  }
  col2num <- names(which(col == "numeric"))
  if (length(col2num) > 0){
    data[, (col2num) := lapply(.SD,
                               function(x) as.numeric(gsub(",",
                                                           ".",
                                                           x,
                                                           fixed = T))),
         .SDcols = col2num]
  }

  # selection of the numeric column
  sc <- sapply(data, class)
  sc <- names(sc[sc %in% c("integer", "numeric")])
  if (length(sc) > 0){
    # case infinity
    if (model$na.handle == "inf"){
      data[, (sc) := lapply(.SD, function(x) ifelse(is.na(x), Inf, x)),
           .SDcols = sc]
    } else {
      # case function
      fn <- function(x) {
        x[is.na(x)] <- eval(parse(text = paste0(model$na.handle,
                                                "(x, na.rm = T)")))
        return(x)
      }
      data[, (sc) := lapply(.SD, fn), .SDcols = sc]
    }
  }

  # Model matrix
  m.d <- model.matrix(as.formula(model$formula), data)

  # Prediction
  if (model$model$params$objective %in% c("reg:linear", "reg:logistic")){
    # Case of regression
    return(list(pred = predict(model$model, m.d, ntreelimit = model$ntree)))
  } else if (model$model$params$objective == "binary:logistic"){
    # Case of binary classification
    p <- predict(model$model, m.d, ntreelimit = model$ntree)
    proba <- p
    pred <- ifelse(p < 0.5, model$labels[1], model$labels[2])

    # Return
    return(list(pred = pred,
                proba = proba))
  } else if (model$model$params$objective == "multi:softprob"){
    # Case of multi-class classification
    p <- predict(model$model, m.d, ntreelimit = model$ntree)
    proba <- matrix(p, ncol = model$model$params$num_class, byrow = TRUE)
    pred <- model$labels[apply(proba, 1, which.max)]

    # Return
    return(list(pred = pred,
                proba = proba))
  }
}

# Auto Machine Learning Function -----

#' Auto Machine Learning
#'
#' \code{xg_auto_ml} compiles all the functions for calibrating a xgboost
#' model into a single one.
#'
#' @param data \strong{Character}. The link to the file containing the data.
#'  The data are imported with the \code{fread} function from
#'  the \code{data.table} package (\link[data.table]{fread}), so the format
#'  must be consistent with a csv file.
#' @param param \strong{Character}. A link to a YAML file with all the
#' the parameters for calibrating the model. See
#' system.file("extdata", "ex_param.yml", package = "ezXg") for an example.
#'
#' @return A calibrated model directly usable for prediction, consistent
#' with a model produced by \link[ezXg]{xg_train}.
#'
#' @import data.table
#' @import xgboost
#' @import yaml
#'
#' @examples
#' md <- xg_auto_ml(system.file("extdata", "titanic.csv", package = "ezXg"),
#'                  system.file("extdata", "ex_param.yml", package = "ezXg"))
#'
#' @export
xg_auto_ml <- function(data,
                       param){

  # Load parameters
  p <- yaml::read_yaml(param)
  # Add default parameters
  p.def <- system.file("extdata", "xgparam.yml", package = "ezXg")
  p.def <- yaml::read_yaml(p.def)
  for (n in names(p.def)){
    if (!(n %in% names(p))){
      p[[n]] <- p.def[[n]]
    } else {
      for (m in names(p.def[[n]])){
        if (!(m %in% names(p[[n]]))){
          p[[n]][[m]] <- p.def[[n]][[m]]
        }
      }
    }
  }

  # Check that parameters are all here
  if (!("output" %in% names(p[["model"]]))){
    stop("Output parameters not given in configuration file")
  }

  # Load data
  d <- xg_load_data(file = data,
                    inputs = p$model$inputs,
                    output = p$model$output,
                    inputs.class = p$model$inputs.class,
                    output.class = p$model$output.class,
                    train.size = p$global$train.size,
                    seed = p$global$seed,
                    na.handle = p$model$na.handle,
                    max.levels = p$global$max.levels)
  # Print status
  if (p$global$verbose){
    cat("---- Data Loaded \n",
        "     with inputs:\n        ",
        paste0(paste(names(d$data)[!(names(d$data) %in%
                                       c(p$model$output, "train"))],
                     sapply(d$data, class)[!(names(d$data) %in%
                                               c(p$model$output, "train"))],
                     sep = " | "),
               collapse = "\n         "),
        "\n      and output:\n         ",
        paste(p$model$output,
              sapply(d$data, class)[names(d$data) %in% p$model$output],
              sep = " | "),
        "\n      training size:\n         ",
        nrow(d$data[train == T]), "observations",
        "\n      test size:\n         ",
        nrow(d$data[train == FALSE]), "observations\n")
  }

  # Grid Search
  if (p$param$cv > 1){
    # Print beginning of step
    if (p$global$verbose){
      cat("---- Initializing grid search\n")
    }
    # Launch grid search
    g <- xg_gs(data = d,
               eta = p$param$eta,
               gamma = p$param$gamma,
               max_depth = p$param$max_depth,
               colsample_bytree = p$param$colsample_bytree,
               min_child_weight = p$param$min_child_weight,
               nrounds = p$param$nrounds,
               nthread = p$global$nthread,
               cv = p$param$cv,
               verbose = p$global$verbose,
               objective = p$param$objective)
    # End of step
    # Print beginning of step
    if (p$global$verbose){
      cat("---- Grid search ended with selected parameters:\n        ",
          paste0(paste(names(g$param),
                       g$param,
                       sep = " : "),
                 collapse = "\n         "),
          "\n     For error:\n        ",
          g$err,
          "\n"
      )
    }
  }

  # Train the model

  # Select parameters
  if (exists("g")){
    prm <- g$param
  } else{
    prm <- c(eta = p$param$eta[1],
             gamma = p$param$gamma[1],
             max_depth = p$param$max_depth[1],
             colsample_bytree = p$param$colsample_bytree[1],
             min_child_weight = p$param$min_child_weight[1])
  }
  # Print parameters
  if (p$global$verbose){
    cat("---- Training model with parameters:\n        ",
        paste0(paste(names(prm),
                     prm,
                     sep = " : "),
               collapse = "\n         "),
        "\n         nrounds :",
        p$param$nrounds,
        "\n"
    )
  }
  # Train model
  md <- xg_train(data = d,
                 eta = prm[1],
                 gamma = prm[2],
                 max_depth = prm[3],
                 colsample_bytree = prm[4],
                 min_child_weight = prm[5],
                 nrounds = p$param$nrounds,
                 nthread = p$global$nthread,
                 verbose = 0,
                 cv = 1,
                 seed = p$global$seed,
                 objective = p$param$objective
  )
  # Print results
  if (p$global$verbose){
    fin <- md$model$evaluation_log[md$ntree, ]
    cat("---- Model trained with results:\n        ",
        paste0(paste(names(fin),
                     fin,
                     sep = " : "),
               collapse = "\n         "),
        "\n         objective :",
        md$model$params$objective,
        "\n"
    )
  }

  # Retrain on full database
  if (p$global$retrain.full){
    # Print step
    if (p$global$verbose){
      cat("---- Retraining model on full database\n")
    }
    # Load data
    d <- xg_load_data(file = data,
                      inputs = p$model$inputs,
                      output = p$model$output,
                      inputs.class = p$model$inputs.class,
                      output.class = p$model$output.class,
                      train.size = 1,
                      seed = p$global$seed,
                      na.handle = p$model$na.handle,
                      max.levels = p$global$max.levels)
    # Train model
    md <- xg_train(data = d,
                   eta = prm[1],
                   gamma = prm[2],
                   max_depth = prm[3],
                   colsample_bytree = prm[4],
                   min_child_weight = prm[5],
                   nrounds = md$ntree,
                   nthread = p$global$nthread,
                   verbose = 0,
                   cv = 1,
                   seed = p$global$seed,
                   objective = p$param$objective
    )
  }

  # Return
  return(list(
    data = d,
    res = md,
    prm = prm
  ))
}

# Bayesian Optimization -----

#' Bayesian Optimization
#'
#' \code{xg_bo} use the bayesian optimization algorithm implemented in
#'  the \code{rBayesianOptimization} package in order
#'  to select the best set of parameters for an xgboost model.
#'
#' @param data \strong{Object}. A data structure created by the call of the
#' \link[ezXg]{xg_load_data} function.
#' @param eta \strong{Numeric Vectors}. Eta parameter min and max bounds.
#' @param gamma \strong{Numeric Vector}. Gamma parameter min and max bounds.
#' @param max_depth \strong{Numeric Vector}. Max_depth parameter min and max bounds.
#' Use "L" suffix to indicate integer hyperparameter.
#' @param colsample_bytree \strong{Numeric Vector}. Colsample_bytree parameter
#' min and max bounds.
#' @param min_child_weight \strong{Numeric Vector}. Min_child_weight parameter
#' min and max bounds. Use "L" suffix to indicate integer hyperparameter.
#' @param nrounds \strong{Numeric}. Nrounds parameter for xgboost calibration.
#' See \link[xgboost]{xgb.train} for more details.
#' @param nthread \strong{Numeric}. Nthread parameter for xgboost calibration.
#' See \link[xgboost]{xgb.train} for more details.
#' @param verbose \strong{Logical}. Verbose parameter for grid search.
#' @param cv \strong{Numeric}. Number of folds in cross validation. Needs
#' to be more than 2.
#' @param seed \strong{Numeric}. Seed for computation reproducability.
#' @param objective \strong{Character}. Objective function for the
#' optimization. . Eta parameter for xgboost calibration. See
#'  \link[xgboost]{xgb.train} for more details. Can be set to \emph{auto}
#'  in order to let the function choose the better model regarding the
#'  output variable.
#' @param ... Other parameters from \link[rBayesianOptimization]{BayesianOptimization}.
#'
#' @return The optimization results with the following fields:
#' \itemize{
#'  \item{\strong{param}}: the optimal set of parameters.
#'  \item{\strong{err}}: the error associated to the optimal parameter
#'  set.
#'  \item{\strong{results}}: the history of the results for the
#'  cross-validation with all the tested sets of parameters.
#' }
#'
#' @import data.table
#' @import xgboost
#' @import rBayesianOptimization
#' @importFrom stats as.formula median model.matrix na.omit predict quantile
#'
#' @examples
#' d <- xg_load_data(system.file("extdata", "titanic.csv", package = "ezXg"),
#'                inputs = c("Pclass", "Sex", "Age", "SibSp",
#'                           "Parch", "Fare", "Embarked"),
#'                output = "Survived",
#'                train.size = 0.8)
#' t <- xg_bo(d)
#'
#' @export
xg_bo <- function(data,
                  eta = c(0.05, 0.3),
                  gamma = c(0, 0.5),
                  max_depth = c(1L,15L),
                  colsample_bytree = c(0.5,1),
                  min_child_weight = c(1L, 7L),
                  nrounds = 100,
                  nthread = 2,
                  cv = 5,
                  seed = 1,
                  objective = "auto",
                  init_grid_dt = NULL,
                  kernel = list(type =
                                  "exponential", power = 2),
                  init_points = 10,
                  n_iter = 50,
                  acq = "ucb",
                  kappa = 2.576,
                  eps = 0.0,
                  verbose = TRUE){

  # Check the parameters
  if (cv < 2) stop("Invalid number of cross validation")

  # Optimization function
  fn <- function(eta, gamma, max_depth, colsample_bytree, min_child_weight){
    md <- xg_train(data,
                   cv = cv,
                   verbose = 0,
                   eta = eta,
                   gamma = gamma,
                   max_depth = max_depth,
                   colsample_bytree = colsample_bytree,
                   min_child_weight = min_child_weight,
                   nthread = nthread,
                   nrounds = nrounds,
                   seed = seed,
                   objective = objective
    )
    return(list(Score = - mean(md$test), Pred = 1))
  }

  # Optimization process
  opt <- BayesianOptimization(fn,
                              bounds = list(eta = eta,
                                            gamma = gamma,
                                            max_depth = max_depth,
                                            colsample_bytree = colsample_bytree,
                                            min_child_weight = min_child_weight),
                              init_grid_dt = init_grid_dt,
                              init_points = init_points,
                              n_iter = n_iter,
                              acq = acq,
                              kernel = kernel,
                              kappa = kappa,
                              eps = eps,
                              verbose = verbose)

  # Return the results
  return(list(param = opt$Best_Par,
              err = abs(opt$Best_Value),
              results = opt$History))
}
ArnaudBu/ezXg documentation built on Oct. 30, 2019, 4:59 a.m.