R/build_model.R

Defines functions get_predictions_rf get_predictions_linear get_predictions_xgb handle_new_test_levels train_rf_model train_linear_model train_model_xgb cross_validate_xgb guess_hyperparameters

Documented in cross_validate_xgb get_predictions_linear get_predictions_rf get_predictions_xgb guess_hyperparameters handle_new_test_levels train_linear_model train_model_xgb train_rf_model

#' This function guesses sensible parameters for setting up the
#' xgb model. If the user has supplied parameters, the function
#' chooses the ones least likely to lead to overfitting.
#'
#' Objective functions and eval metrics are always taken
#' to be the user-supplied values, if the user has supplied them;
#' otherwise they are guessed based on the type of the target variable.
#'
#' @param train_structure is the data structure produced by prepare_training_set
#' @param n_estimators minimum number of estimators
#' @param learning_rate maximum learning rate
#' @param depth minimum depth of trees
#' @param nrounds minimum number of rounds of training
#' @param objective_function the objective function to be used
#' @param eval_metric the evaluation metric
#' @export
guess_hyperparameters <- function(train_structure,
                                  n_estimators = 10,
                                  learning_rate = 0.05,
                                  depth = 2,
                                  nrounds = 100,
                                  objective_function = NA,
                                  eval_metric = NA){
  hyperparameters <- list()
  features <- Matrix::sparse.model.matrix(stats::as.formula(paste(train_structure$target_variable, "~ .")),
                                         data = train_structure$data)[,-1]
  width <- ncol(features)
  height <- nrow(features)
  hyperparameters[["depth"]] <- max(depth, floor(sqrt(width)))
  hyperparameters[["n_estimators"]] <- {floor(max(n_estimators,
                                                 exp(floor(log(height)))/
                                                   hyperparameters[["depth"]])) %>% log() %>% `-`(1) %>% exp() %>% floor()}
  hyperparameters[["learning_rate"]] <- min(learning_rate,
                                            1/(log(hyperparameters[["n_estimators"]]*
                                                     hyperparameters[["depth"]])
                                               )
                                            )
  hyperparameters[['alpha']] <- log(hyperparameters[["n_estimators"]]) * hyperparameters[["depth"]] * hyperparameters[["learning_rate"]]
  hyperparameters[['lambda']] <- log(hyperparameters[["n_estimators"]]) * hyperparameters[["depth"]] * hyperparameters[["learning_rate"]]
  class_target <- ("target_reference" %in% names(train_structure))
  if(class_target){
    hyperparameters[['rf_probability']] <- TRUE
    hyperparameters[["objective_function"]] <- "multi:softprob"
    hyperparameters[["eval_metric"]] <- "mlogloss"
    hyperparameters[["num_class"]] <- train_structure$data[[train_structure$target_variable]] %>% dplyr::n_distinct()
    if (nrow(train_structure[["target_reference"]]) > 2){
      hyperparameters[["glmnet_family"]] <- "multinomial"
    } else {
      hyperparameters[["glmnet_family"]] <- "binomial"
    }
  } else {
    hyperparameters[['rf_probability']] <- FALSE
    hyperparameters[["objective_function"]] <- "reg:linear"
    hyperparameters[["eval_metric"]] <- "mae"
    hyperparameters[["glmnet_family"]] <- "gaussian"
  }
  if(!is.na(objective_function)){
    hyperparameters[["objective_function"]] <- objective_function
  }
  if(!is.na(eval_metric)){
    hyperparameters[["eval_metric"]] <- eval_metric
  }
  hyperparameters[["nrounds"]] <- max(nrounds, floor(hyperparameters[["depth"]]*log(hyperparameters[["n_estimators"]])))
  hyperparameters[['rf_trees']] <- floor(sqrt(hyperparameters[["depth"]]) * exp(floor(log(height)/2)))
  hyperparameters[['rf_mtry']] <- min(floor(sqrt(ncol(train_structure$data) + floor(log(height)))), ncol(train_structure$data))
  return(hyperparameters)
}
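
# A minimal sketch of typical usage, assuming only what the function body
# needs: guess_hyperparameters reads `data` and `target_variable` (and, for
# classification, `target_reference`) from the structure that
# prepare_training_set produces, so a toy regression structure can be built
# by hand. All names below are hypothetical.
toy_structure <- list(data = data.frame(x1 = rnorm(100),
                                        x2 = rnorm(100),
                                        y  = rnorm(100)),
                      target_variable = "y")
toy_hp <- guess_hyperparameters(toy_structure)
toy_hp$objective_function  # "reg:linear" for a numeric target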


#' this function takes the train structure and hyperparameters, along with an
#' integer nfold, performs nfold-fold cross validation, and returns the result.
#' @param train_structure the data structure generated by prepare_training_set
#' @param hyperparameters the list generated by guess_hyperparameters function
#' @param nfold number of folds of CV
#' @export
cross_validate_xgb <- function(train_structure, hyperparameters, nfold = 5){
  # print("defining the xgb parameters")
  xgb_params <- list("objective" = hyperparameters[["objective_function"]],
                     "eval_metric" = hyperparameters[["eval_metric"]],
                     "eta" = hyperparameters[["learning_rate"]],
                     "max_depth" = hyperparameters[["depth"]],
                     "n_estimators" = hyperparameters[["n_estimators"]],
                     "alpha" = hyperparameters[["alpha"]],
                     "lambda" = hyperparameters[["lambda"]])
  if("num_class" %in% names(hyperparameters)){
    xgb_params[["num_class"]] <- hyperparameters[["num_class"]]
  }
  # print("building the sparse model matrix")
  features <- Matrix::sparse.model.matrix(stats::as.formula(paste(train_structure$target_variable, "~ .")),
                                  data = train_structure$data)[,-1]
  # print("identifying the labels")
  lab <- train_structure$data[[train_structure$target_variable]]
  # print("building the xgb DMatrix for training")
  dtrain <- xgboost::xgb.DMatrix(data = features, label = lab)
  # print("running the cross validation")
  cv_model <- xgboost::xgb.cv(params = xgb_params,
                              data = dtrain,
                              verbose = FALSE,
                              nfold = nfold,
                              nrounds = hyperparameters[["nrounds"]],
                              prediction = TRUE)
  ret_struct <- list()
  ret_struct[["cv_model"]] <- cv_model
  # print("computing useful metrics")
  if("num_class" %in% names(hyperparameters)){
    OOF_prediction <- tibble::as_tibble(cv_model$pred) %>%
      dplyr::mutate(max_prob = max.col(., ties.method = "last")) %>%
      dplyr::mutate(label = lab + 1)
    cm <- caret::confusionMatrix(factor(OOF_prediction$max_prob),
                          factor(OOF_prediction$label),
                          mode = "everything")
    ret_struct[["confusion_matrix"]] <- cm
    ret_struct[["metric"]] <- cv_model$evaluation_log$test_mlogloss_mean[length(cv_model$evaluation_log$test_mlogloss_mean)]
  } else {
    ret_struct[["metric"]] <- cv_model$evaluation_log$test_mae_mean[length(cv_model$evaluation_log$test_mae_mean)]
  }
  return(ret_struct)
}
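
# Continuing the toy regression sketch above: a 3-fold CV returns the raw
# xgb.cv object plus the final held-out metric (MAE here, mlogloss for
# classification).
toy_cv <- cross_validate_xgb(toy_structure, toy_hp, nfold = 3)
toy_cv$metric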


#' this function returns a structure that contains the xgb model along with
#' everything else a predict function would need (normalization factors,
#' levels of variables, etc.)
#' @param train_structure the data structure generated by prepare_training_set
#' @param hyperparameters the list generated by guess_hyperparameters function
#' @export
train_model_xgb <- function(train_structure, hyperparameters){
  xgb_params <- list("objective" = hyperparameters[["objective_function"]],
                     "eval_metric" = hyperparameters[["eval_metric"]],
                     "eta" = hyperparameters[["learning_rate"]],
                     "max_depth" = hyperparameters[["depth"]],
                     "n_estimators" = hyperparameters[["n_estimators"]])
  if("num_class" %in% names(hyperparameters)){
    xgb_params[["num_class"]] <- hyperparameters[["num_class"]]
  }
  features <- Matrix::sparse.model.matrix(stats::as.formula(paste(train_structure$target_variable, "~ .")),
                                         data = train_structure$data)[,-1]
  lab <- train_structure$data[[train_structure$target_variable]]
  dtrain <- xgboost::xgb.DMatrix(data = features, label = lab)
  xgbmodel <- xgboost::xgb.train(params = xgb_params,
                                 data = dtrain,
                                 verbose = FALSE,
                                 nrounds = hyperparameters[["nrounds"]])
  model_structure <- list()
  model_structure[['models']] <- list()
  model_structure[['models']][['model_xgb']] <- xgbmodel
  if('num_class' %in% names(hyperparameters)){
    model_structure[['target_reference']] <- train_structure[['target_reference']]
  }
  model_structure[['normalize_by']] <- train_structure[['normalize_by']]
  model_structure[['levels']] <- train_structure[['levels']]
  model_structure[['target_variable']] <- train_structure[['target_variable']]
  return(model_structure)
}
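
# Continuing the toy sketch: train the final xgb model. The returned list
# bundles the model with the normalization factors and factor levels that the
# get_predictions_* helpers below expect (absent here, since the hand-built
# toy structure does not carry them).
toy_model <- train_model_xgb(toy_structure, toy_hp)
names(toy_model$models)  # "model_xgb"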

#' this function returns a structure that contains the linear model as well as
#' everything already contained in the train_model_xgb return structure
#' @param train_structure the data structure generated by prepare_training_set
#' @param hyperparameters the list generated by guess_hyperparameters function
#' @param model_structure the model structure created by train_model_xgb
#' @export
train_linear_model <- function(train_structure, model_structure, hyperparameters){
  doParallel::registerDoParallel(8) # NOTE: hardcoded to 8 workers; adjust to the available cores
  features <- Matrix::sparse.model.matrix(stats::as.formula(paste(train_structure$target_variable, "~ .")),
                                         data = train_structure$data)[,-1]
  lab <- train_structure$data[[train_structure$target_variable]]
  linear_model <- glmnet::cv.glmnet(features, lab, family = hyperparameters[["glmnet_family"]], parallel = TRUE)
  model_structure[['models']][['linear_model']] <- linear_model
  return(model_structure)
}
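
# The linear model is additive: it slots a cv.glmnet fit into the structure
# returned by train_model_xgb. Continuing the toy sketch:
toy_model <- train_linear_model(toy_structure, toy_model, toy_hp)
names(toy_model$models)  # "model_xgb" "linear_model"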

#' this function returns a structure that contains the rf model as well as
#' everything already contained in the train_model_xgb return structure
#' @param train_structure the data structure generated by prepare_training_set
#' @param hyperparameters the list generated by guess_hyperparameters function
#' @param model_structure the model structure created by train_model_xgb
#' @export
train_rf_model <- function(train_structure, model_structure, hyperparameters){
  rf_model <- ranger::ranger(dependent.variable.name = train_structure[["target_variable"]],
                             # formula = as.formula(paste(train_structure$target_variable, "~ .")),
                             data = train_structure$data,
                             num.trees = hyperparameters[['rf_trees']],
                             mtry = hyperparameters[['rf_mtry']],
                             probability = hyperparameters[['rf_probability']],
                             save.memory = TRUE)
  model_structure[['models']][['rf_model']] <- rf_model
  return(model_structure)
}
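
# Same pattern for the ranger forest; note that `rf_probability` must be
# FALSE for a numeric target, which guess_hyperparameters already set above.
toy_model <- train_rf_model(toy_structure, toy_model, toy_hp)
names(toy_model$models)  # "model_xgb" "linear_model" "rf_model"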

#' this function deals with categorical variables in the test set
#' which are not present in the training set.
#' @param model_structure the model structure created by train_model_xgb
#' @param features the sparse feature matrix for the test set, built after combining with the levels from the training set
#' @export
handle_new_test_levels <- function(model_structure, features){
  test_names <- colnames(features)
  train_names <- model_structure[['models']][['model_xgb']][['feature_names']]
  extra_test_names <- setdiff(test_names, train_names)
  features <- features[,!colnames(features) %in% extra_test_names]
  return(features)
}
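
# A self-contained illustration with mock objects (not fitted models): the
# "model" was trained on columns x1 and x2, so the unseen test dummy x3 is
# dropped.
mock_structure <- list(models = list(model_xgb = list(feature_names = c("x1", "x2"))))
mock_features <- matrix(1:6, nrow = 2, dimnames = list(NULL, c("x1", "x2", "x3")))
handle_new_test_levels(mock_structure, mock_features)  # keeps only x1 and x2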


#' this function takes the model structure generated by train_model_xgb, along with
#' a test set in the same format as the untransformed *input* df
#' to the prepare_training_set function, and returns a prediction vector
#' aligned with the rows of the untransformed df.
#' @param model_structure model structure created earlier
#' @param test_df the test data frame
#' @export
get_predictions_xgb <- function (model_structure, test_df)
{
  levels_df <- model_structure[["levels"]]
  test_cols <- colnames(test_df)
  level_cols <- colnames(levels_df)
  for (i in seq_along(level_cols)) {
    if (!(level_cols[i] %in% test_cols))
      levels_df[[level_cols[[i]]]] <- NULL
  }
  test_df[[model_structure[["target_variable"]]]] <- NULL
  test_df <- rationalize_categoricals(test_df)
  norm_test_df <- normalize_df(test_df, facs_df = model_structure[["normalize_by"]],
                               target_variable = model_structure[["target_variable"]])
  norm_test_df <- rbind(levels_df,norm_test_df)
  norm_test_df <- handle_missing_values(norm_test_df,
                                        target_variable = model_structure[["target_variable"]],
                                        train_facs = model_structure[["normalize_by"]])
  norm_test_df[[model_structure[["target_variable"]]]] <- 0
  features <- Matrix::sparse.model.matrix(stats::as.formula(paste(model_structure[["target_variable"]],"~ .")),
                                          data = norm_test_df, row.names = FALSE)[, -1]
  features <- handle_new_test_levels(model_structure = model_structure, features = features)
  # print(colnames(features))
  dtest <- xgboost::xgb.DMatrix(data = features)
  preds <- predict(model_structure[["models"]][['model_xgb']], dtest)
  if (model_structure[["models"]][['model_xgb']][["params"]][["objective"]] ==
      "multi:softprob") {
    prob_matrix <- matrix(preds, nrow = nrow(norm_test_df),
                          byrow = TRUE)
    predictions <- tibble::as_tibble(prob_matrix) %>% utils::tail(nrow(test_df))
    if (length(as.character(model_structure[["target_reference"]][[1]])) == 2) {
      predictions <- predictions %>%
        dplyr::mutate(V2 = 1 - V1)
    }
    colnames(predictions) <- as.character(model_structure[["target_reference"]][[1]])
    cat_df <- predictions %>% tibble::rownames_to_column("row_id") %>%
      dplyr::mutate(row_id = as.numeric(row_id)) %>%
      tidyr::gather(category, value, -row_id) %>%
      dplyr::group_by(row_id) %>%
      dplyr::slice(which.max(value)) %>%
      dplyr::arrange(row_id)
    predictions[["category"]] <- cat_df[["category"]]
  }
  else {
    # the test rows sit at the tail of norm_test_df (levels_df is prepended above)
    predictions <- tibble::tibble(prediction = utils::tail(preds, nrow(test_df)))
    colnames(predictions) <- model_structure[["target_variable"]]
  }
  return(predictions)
}
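
# Hedged usage sketch (not run): `new_df` is hypothetical and must look like
# the *raw* input to prepare_training_set; the target column, if present, is
# dropped internally, and rationalize_categoricals / normalize_df /
# handle_missing_values apply the matching transforms.
# preds <- get_predictions_xgb(model_structure, test_df = new_df)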

#' this function takes the model structure generated by train_linear_model, along with
#' a test set in the same format as the untransformed *input* df
#' to the prepare_training_set function, and returns a prediction vector
#' aligned with the rows of the untransformed df.
#' @param model_structure model structure created earlier
#' @param test_df the test data frame
#' @export
get_predictions_linear <- function(model_structure, test_df){
  levels_df <- model_structure[["levels"]]
  test_cols <- colnames(test_df)
  level_cols <- colnames(levels_df)
  for (i in seq_along(level_cols)) {
    if (!(level_cols[i] %in% test_cols))
      levels_df[[level_cols[[i]]]] <- NULL
  }
  test_df[[model_structure[["target_variable"]]]] <- NULL
  test_df <- rationalize_categoricals(test_df)
  norm_test_df <- normalize_df(test_df, facs_df = model_structure[["normalize_by"]],
                               target_variable = model_structure[["target_variable"]])
  norm_test_df <- rbind(levels_df,norm_test_df)
  norm_test_df <- handle_missing_values(norm_test_df,
                                        target_variable = model_structure[["target_variable"]],
                                        train_facs = model_structure[["normalize_by"]])
  norm_test_df[[model_structure[["target_variable"]]]] <- 0
  features <- Matrix::sparse.model.matrix(stats::as.formula(paste(model_structure[["target_variable"]],"~ .")),
                                          data = norm_test_df, row.names = FALSE)[, -1]
  features <- handle_new_test_levels(model_structure = model_structure, features = features)

  if (model_structure[["models"]][['model_xgb']][["params"]][["objective"]] ==
      "multi:softprob") {
    preds <- stats::predict(model_structure[["models"]][['linear_model']],
                            newx = features, type = "response")
    # predict() returns an n x 1 matrix of P(second level) for binomial fits
    # and an n x k x 1 array for multinomial fits; both are column-major, so
    # refill a plain unnamed matrix (the earlier byrow = TRUE reshape
    # scrambled the multinomial case)
    prob_matrix <- matrix(as.vector(preds), nrow = nrow(norm_test_df))
    predictions <- tibble::as_tibble(prob_matrix) %>% utils::tail(nrow(test_df))
    if (length(as.character(model_structure[["target_reference"]][[1]])) == 2) {
      predictions <- predictions %>%
        dplyr::mutate(V2 = V1) %>%
        dplyr::mutate(V1 = 1 - V2)
    }
    colnames(predictions) <- as.character(model_structure[["target_reference"]][[1]])
    cat_df <- predictions %>%
      tibble::rownames_to_column(var = "row_id") %>%
      dplyr::mutate(row_id = as.numeric(row_id)) %>%
      tidyr::gather(category, value, -row_id) %>%
      dplyr::group_by(row_id) %>%
      dplyr::slice(which.max(value)) %>%
      dplyr::arrange(row_id)
    predictions[["category"]] <- cat_df[["category"]]
  }
  else {
    preds <- stats::predict(model_structure[["models"]][['linear_model']],
                                       newx = features)
    # the test rows sit at the tail of norm_test_df (levels_df is prepended above)
    predictions <- tibble::tibble(prediction = utils::tail(as.vector(preds), nrow(test_df)))
    colnames(predictions) <- model_structure[["target_variable"]]
  }
  return(predictions)
}
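
# Same calling convention as get_predictions_xgb (not run): the xgb objective
# stored in the structure routes between the classification and regression
# branches.
# preds <- get_predictions_linear(model_structure, test_df = new_df)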


#' this function takes the model structure generated by train_rf_model, along with
#' a test set in the same format as the untransformed *input* df
#' to the prepare_training_set function, and returns a prediction vector
#' aligned with the rows of the untransformed df.
#' @param model_structure model structure created earlier
#' @param test_df the test data frame
#' @export
get_predictions_rf <- function(model_structure, test_df){
  levels_df <- model_structure[["levels"]]
  test_cols <- colnames(test_df)
  level_cols <- colnames(levels_df)
  for (i in seq_along(level_cols)) {
    if (!(level_cols[i] %in% test_cols))
      levels_df[[level_cols[[i]]]] <- NULL
  }
  test_df[[model_structure[["target_variable"]]]] <- NULL
  test_df <- rationalize_categoricals(test_df)
  norm_test_df <- normalize_df(test_df, facs_df = model_structure[["normalize_by"]],
                               target_variable = model_structure[["target_variable"]])
  norm_test_df <- rbind(levels_df,norm_test_df)
  norm_test_df <- handle_missing_values(norm_test_df,
                                        target_variable = model_structure[["target_variable"]],
                                        train_facs = model_structure[["normalize_by"]])
  norm_test_df[[model_structure[["target_variable"]]]] <- 0
  if (model_structure[["models"]][['model_xgb']][["params"]][["objective"]] ==
      "multi:softprob") {
    preds <- predict(model_structure[["models"]][['rf_model']],
                                              data = norm_test_df)
    prob_matrix <- preds[['predictions']]
    predictions <- tibble::as_tibble(prob_matrix) %>% utils::tail(nrow(test_df))
    class_list <- model_structure$models$rf_model$forest$class.values
    column_names_predictions <- list()
    for (counter_classes in seq_along(class_list)){
      column_names_predictions[counter_classes] <- model_structure[["target_reference"]][[1]][class_list[counter_classes]+1]
    }
    colnames(predictions) <- column_names_predictions
    cat_df <- predictions %>% tibble::rownames_to_column("row_id") %>%
      dplyr::mutate(row_id = as.numeric(row_id)) %>%
      tidyr::gather(category, value, -row_id) %>%
      dplyr::group_by(row_id) %>%
      dplyr::slice(which.max(value)) %>%
      dplyr::arrange(row_id)
    # print(cat_df)
    predictions[["category"]] <- cat_df[["category"]]
  } else {
    preds <- predict(model_structure[["models"]][['rf_model']],
                                       data = norm_test_df)
    preds <- preds[['predictions']]
    # the test rows sit at the tail of norm_test_df (levels_df is prepended above)
    predictions <- tibble::tibble(prediction = utils::tail(preds, nrow(test_df)))
    colnames(predictions) <- model_structure[["target_variable"]]
  }
  return(predictions)
}
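
# And likewise for the forest (not run): ranger's predict() returns a
# `predictions` element, which is a probability matrix in classification mode.
# preds <- get_predictions_rf(model_structure, test_df = new_df)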