R/helpers.r

Defines functions: generate_xgb_folds, generate_xgb_params, generate_reg_metrics, calc_deviance, generate_deviance_metrics, split_data

Documented in: generate_deviance_metrics, generate_reg_metrics, generate_xgb_folds, generate_xgb_params, split_data

#' Generates a set of cross-validation folds used in the xgb_reg, xgb_cat, and xgb_pois functions.
#'
#' @param cv_folds Number of cross-validation folds to create.
#' @param nrows Number of rows in the training data to assign to folds.
#' @return List of cross-validation fold indices, one element per fold.
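#'
#' @examples
#' # Illustrative sketch (argument values are arbitrary): build five folds for a
#' # 100-row data set. The result is a list of held-out row indices, the same
#' # list-of-index-vectors format that xgboost::xgb.cv accepts via its folds argument.
#' folds <- generate_xgb_folds(cv_folds = 5, nrows = 100)
#' sort(unlist(folds)) # every row index 1:100 appears in exactly one fold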

generate_xgb_folds <- function(cv_folds, nrows){
  # randomly assign each row to a fold, then collect the row indices belonging to each fold
  folds_idx <- sample(c(1:cv_folds), nrows, replace = T)
  folds <- list()
  for(i in unique(folds_idx)){
    folds[[i]] <- c(1:nrows)[folds_idx == i]
  }

  return(folds)
}

#' Generates a random set of hyperparameters used in the xgb_reg, xgb_cat, and xgb_pois functions.
#'
#' @param tune_rounds Specifies the number of random hyperparameter combos to return.
#' @return data.frame of hyperparameter combinations with tune_rounds rows.
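#'
#' @examples
#' # Illustrative sketch: draw ten candidate hyperparameter sets. The eval column
#' # starts as NA and is meant to hold an evaluation metric for each candidate.
#' params <- generate_xgb_params(tune_rounds = 10)
#' head(params)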

generate_xgb_params <- function(tune_rounds){
  # randomly samples xgboost hyperparameters based on the number of tune rounds;
  # the fixed seed makes the candidate set reproducible across runs
  set.seed(1234)
  out <- data.frame(eta               = runif(tune_rounds, 0.025, 0.5),
                    gamma             = runif(tune_rounds, 0.1, 1),
                    max_depth         = 0, # no depth limit; max_leaves controls tree size
                    subsample         = runif(tune_rounds, 0.5, 1),
                    colsample_bytree  = runif(tune_rounds, 0.5, 1.0),
                    min_child_weight  = sample(1:6, tune_rounds, replace = T),
                    max_leaves        = sample(c(5, 10, 25, 50, 100, 250, 500, 1000), tune_rounds, replace = T))
  out$eval <- NA # placeholder column for an evaluation metric
  return(out)
}

#' Generates bootstrap replicates of y and y_hat and calculates error metrics on the replicates.
#' Metrics are focused on common regression tasks.
#'
#' @param y True values.
#' @param y_hat Predicted values.
#' @param n Number of bootstrap replicates.
#' @return data.frame with error metrics.
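#'
#' @examples
#' # Illustrative sketch with simulated data; a small n keeps the example quick,
#' # the default of 10000 replicates is more typical in practice.
#' y     <- rnorm(100)
#' y_hat <- y + rnorm(100, sd = 0.5)
#' boot  <- generate_reg_metrics(y, y_hat, n = 200)
#' quantile(boot$rmse, c(0.025, 0.975)) # 95% bootstrap interval for RMSE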

generate_reg_metrics <- function(y, y_hat, n = 10000){
  out <- list()
  for(i in 1:n){
    samp         <- sample(1:length(y), length(y), replace = T)
    rmse         <- yardstick::rmse_vec(y[samp], y_hat[samp])
    mae          <- yardstick::mae_vec(y[samp], y_hat[samp])
    rsq          <- yardstick::rsq_vec(y[samp], y_hat[samp])
    spearman_cor <- cor(y[samp], y_hat[samp], method = "spearman")
    out[[i]]     <- data.frame(rmse, mae, rsq, spearman_cor)
  }
  out <- do.call(rbind, out)
  return(out)
}

#' Calculates residual deviance for Poisson regression. Zero y values are replaced with
#' 10^-10, which keeps the calculation in line with the residual deviance reported by glm.
#'
#' @param y True values.
#' @param y_hat Predicted values.
#' @return Residual deviance.
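#'
#' @examples
#' # Illustrative check: for a Poisson glm, calc_deviance() applied to the observed
#' # counts and fitted means should closely match the model's residual deviance
#' # (up to the small 10^-10 adjustment for zero counts).
#' x   <- runif(200)
#' y   <- rpois(200, lambda = exp(1 + x))
#' fit <- glm(y ~ x, family = poisson())
#' calc_deviance(y, fitted(fit))
#' deviance(fit)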
calc_deviance <- function(y, y_hat){
  y   <- ifelse(y <= 0, 10^-10, y)          # avoid log(0) for zero counts
  r   <- (y * log(y / y_hat) - (y - y_hat)) # half of each observation's deviance contribution
  dev <- 2 * sum(r)
  return(dev)
}

#' Generates bootstrap replicates of y and y_hat and calculates residual deviance.
#'
#' @param y True values.
#' @param y_hat Predicted values.
#' @param n Number of bootstrap replicates.
#' @return Vector of n length of residual deviance calculations.
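#'
#' @examples
#' # Illustrative sketch: bootstrap distribution of residual deviance for a Poisson glm,
#' # using a small n to keep the example quick.
#' x    <- runif(200)
#' y    <- rpois(200, lambda = exp(1 + x))
#' fit  <- glm(y ~ x, family = poisson())
#' devs <- generate_deviance_metrics(y, fitted(fit), n = 200)
#' quantile(devs, c(0.025, 0.975))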

generate_deviance_metrics <- function(y, y_hat, n = 1000){
  out <- numeric(n)
  for(i in 1:n){
    # draw a bootstrap sample of observation indices and recompute the deviance
    samp   <- sample(1:length(y), length(y), replace = T)
    out[i] <- calc_deviance(y[samp], y_hat[samp])
  }
  return(out)
}


#' Splits a data.frame into train and test sets.
#'
#' @param df data.frame to split into train and test.
#' @param perc_train Percent of data to place in the train data set.
#' @return A list with the train data.frame, test data.frame, and train indices.
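#'
#' @examples
#' # Illustrative sketch using a built-in data set: an 80/20 split of mtcars.
#' splits <- split_data(mtcars, perc_train = 0.8)
#' nrow(splits$df_train) # 25 rows (floor(32 * 0.8))
#' nrow(splits$df_test)  # remaining 7 rows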

split_data <- function(df, perc_train = 0.8){
  set.seed(1234) # fixed seed so the same split is produced on every call
  train_idx     <- sample(1:nrow(df), floor(nrow(df) * perc_train), replace = F)
  df_train      <- df[train_idx, ]
  df_test       <- df[-train_idx, ]

  #outputs
  out              <- list()
  out$df_train     <- df_train
  out$df_test      <- df_test
  out$train_idx    <- train_idx

  return(out)
}