# R/model_lgb.R

# Defines functions: generate_lgb_folds, generate_lgb_params, model_lgb_reg,
# interpret_results, lgb_feature_filter

#' Specifies cross validation folds for lightgbm models
#'
#' @param cv_folds number of cross validation folds
#' @param nrows training data row count
#' @return list of cross validation folds (row indices), one element per fold
#'   id that received at least one row

generate_lgb_folds <- function(cv_folds, nrows){
  # NOTE(review): fold assignment is random and unseeded here; callers that
  # need reproducible folds must set the RNG seed before calling.
  folds_idx <- sample(seq_len(cv_folds), nrows, replace = TRUE)

  # split() groups row indices by sampled fold id. Unlike the previous
  # `folds[[i]] <- ...` loop, this cannot leave NULL entries in the list when
  # a fold id happens to receive no rows (list positions are auto-filled with
  # NULL on out-of-order [[<- assignment, which would break lgb.cv).
  split(seq_len(nrows), folds_idx)
}

#' Automatically generates lightgbm hyperparameter sets for optimization
#' tuning guide: https://towardsdatascience.com/automated-machine-learning-hyperparameter-tuning-in-python-dfda59b72f8a
#'
#' @param tune_rounds number of parameter sets to generate
#' @return a dataframe with hyperparameters for tuning, plus NA placeholder
#'   columns `nrounds` and `eval` that the tuning loop fills in

generate_lgb_params <- function(tune_rounds){
  # Fixed seed so the same candidate grid is generated on every run.
  # NOTE(review): this mutates the global RNG state as a side effect.
  set.seed(1234)
  out <- data.frame(boosting_type    = "goss",
                    # integer-valued parameters
                    max_depth        = round(runif(tune_rounds, -100, 100)),
                    num_leaves       = round(runif(tune_rounds, 30, 150)),
                    min_data_in_leaf = round(runif(tune_rounds, 10, 30)),
                    # numeric parameters; learning_rate is drawn log-uniformly
                    # so small rates are sampled as densely as large ones
                    # (base-R equivalent of KScorrect::rlunif, dropping the
                    # third-party dependency)
                    learning_rate    = exp(runif(tune_rounds, log(0.005), log(0.2))),
                    reg_alpha        = runif(tune_rounds, 0.0, 1.0),
                    reg_lambda       = runif(tune_rounds, 0.0, 1.0))

  # lightgbm treats max_depth <= 0 as "no limit"; clamp negatives to 0
  out$max_depth <- pmax(out$max_depth, 0)

  # placeholders filled in by the tuning loop (best iteration / CV score)
  out$nrounds <- NA
  out$eval <- NA
  out
}

#' Automatically models regression objectives with random search hyperparameter optimization
#' and may be converted to bayesian hyperparameter optimization in the future.
#'
#' @param mdl_data a list of frames from the treat_numeric_data function.
#' @param target_var target variable for prediction (currently unused in this
#'   function; kept for interface consistency with the other modeling functions)
#' @param cv_folds number of cross validation folds
#' @param tune_rounds number of hyperoptimization tuning rounds
#' @param verbose print model iterations (TRUE/FALSE)
#' @param max_rounds int. Upper bound on boosting rounds; early stopping
#'   normally halts training well before this cap.
#' @param early_stopping_rounds stop training when the CV metric has not
#'   improved for this many rounds
#' @param obj lightgbm objective (default 'regression')
#' @return mdl_data list augmented with the fitted model, params tested,
#'   feature importance, and test-set metrics

model_lgb_reg <- function(mdl_data,
                          target_var,
                          cv_folds              = 5,
                          tune_rounds           = 50,
                          verbose               = TRUE,
                          max_rounds            = 10^5,
                          early_stopping_rounds = 10,
                          obj                   = 'regression'){

  #generate parameters (NA eval/nrounds placeholders are created there)
  params <- generate_lgb_params(tune_rounds)

  #specify fold indexes
  folds <- generate_lgb_folds(cv_folds, nrow(mdl_data$train))

  # random search: cross-validate each candidate parameter set
  for(i in seq_len(tune_rounds)){
    if(isTRUE(verbose)){
      print(glue::glue("Testing param set {i} of {tune_rounds}"))
    }
    lgb_mdl <- lightgbm::lgb.cv(params = list(
                                  boosting_type    = params$boosting_type[i],
                                  max_depth        = params$max_depth[i],
                                  num_leaves       = params$num_leaves[i],
                                  min_data_in_leaf = params$min_data_in_leaf[i],
                                  learning_rate    = params$learning_rate[i],
                                  # fixed typo: was params$reg_alph, which is
                                  # NULL, so reg_alpha was silently never passed
                                  reg_alpha        = params$reg_alpha[i],
                                  reg_lambda       = params$reg_lambda[i]
                                ),
                                obj                   = obj,
                                verbose               = -1,
                                data                  = mdl_data$train_lgb_data_set,
                                nrounds               = max_rounds,
                                early_stopping_rounds = early_stopping_rounds,
                                seed                  = 1234,
                                folds                 = folds)

    #record best CV score and the early-stopped iteration count
    params$eval[i]    <- lgb_mdl$best_score
    params$nrounds[i] <- lgb_mdl$best_iter
  }

  # single best row; which.min avoids selecting multiple rows on score ties
  # (the previous `params$eval == min(params$eval)` comparison could return
  # length-> 1 vectors inside the params list on ties)
  best <- which.min(params$eval)

  #fit best param set on the full training data; uses `obj` (previously
  #hardcoded to 'regression' here even when a different objective was passed)
  final_mdl <- lightgbm::lgb.train(data    = mdl_data$train_lgb_data_set,
                                   obj     = obj,
                                   verbose = -1,
                                   nrounds = params$nrounds[best],
                                   params  = list(
                                     boosting_type    = params$boosting_type[best],
                                     max_depth        = params$max_depth[best],
                                     num_leaves       = params$num_leaves[best],
                                     min_data_in_leaf = params$min_data_in_leaf[best],
                                     learning_rate    = params$learning_rate[best],
                                     reg_alpha        = params$reg_alpha[best],
                                     reg_lambda       = params$reg_lambda[best]
                                   ))

  #record params tested
  mdl_data$params_tested <- params

  #record best model and its feature importance
  mdl_data$final_mdl <- final_mdl
  mdl_data$importance <- lightgbm::lgb.importance(final_mdl)

  #generate holdout metrics on the test split
  mdl_data$test_yhat    <- predict(final_mdl, mdl_data$test_lgb)
  mdl_data$rmse         <- yardstick::rmse_vec(mdl_data$test_y, mdl_data$test_yhat)
  mdl_data$r2           <- yardstick::rsq_vec(mdl_data$test_y, mdl_data$test_yhat)
  mdl_data$spearman_cor <- cor(mdl_data$test_y, mdl_data$test_yhat, method = 'spearman')
  mdl_data$mae          <- yardstick::mae_vec(mdl_data$test_y, mdl_data$test_yhat)

  return(mdl_data)
}


#' Takes a mdl_data list from mdl_lgb_reg and returns interpretations for train and test sets
#'
#' @param mdl_data mdl_data list from mdl_lgb_reg (must contain `final_mdl`,
#'   `test_lgb`, and `train_lgb`)
#' @return mdl_data list with interpretations from train and test sets
interpret_results <- function(mdl_data){
  # interpret every row of the test and train sets with the final model;
  # lgb.interprete is namespace-qualified to match the lightgbm:: convention
  # used elsewhere in this file (the bare name fails unless the package is attached)
  print("Interpreting test data")
  mdl_data$test_interpreted <- lightgbm::lgb.interprete(mdl_data$final_mdl,
                                                        mdl_data$test_lgb,
                                                        seq_len(nrow(mdl_data$test_lgb)))
  print("Interpreting train data")
  mdl_data$train_interpreted <- lightgbm::lgb.interprete(mdl_data$final_mdl,
                                                         mdl_data$train_lgb,
                                                         seq_len(nrow(mdl_data$train_lgb)))
  return(mdl_data)
}


#' randomly samples columns and creates bootstrap replicates to find feature importances
#'
#' @param mdl_data mdl_data list (must contain a `train` data frame)
#' @param target_var target variable column name, excluded from the candidate features
#' @param col_perc percentage of columns to select for each filter iteration
#' @param n number of feature filter iterations
#' @param objective lightgbm objective to evaluate
#' @return dataframe of features with selection counts and the share of
#'   iterations in which each feature was important, sorted descending
#'
lgb_feature_filter <- function(mdl_data,
                               target_var,
                               col_perc   = 0.2,
                               n          = 300,
                               objective  = 'regression'){
  set.seed(1234)
  cnames <- colnames(mdl_data$train)
  cnames <- cnames[!(cnames %in% target_var)]
  train  <- mdl_data$train
  target <- train[, colnames(train) %in% target_var]
  # guarantee at least one column per iteration; floor() alone yields 0 for
  # small col_perc / few columns, which would make sample() error out
  col_sample <- max(1L, floor(col_perc * length(cnames)))

  # preallocate instead of growing the list inside the loop
  out <- vector("list", n)
  for(i in seq_len(n)){
    flush.console()
    print(paste0("Iteration ", i, " of ", n))

    #bootstrap rows and subsample columns for this replicate
    boot        <- sample(seq_len(nrow(train)), nrow(train), replace = TRUE)
    train_boot  <- train[boot, sample(cnames, col_sample, replace = FALSE)]
    target_boot <- target[boot]
    train_boot  <- lightgbm::lgb.Dataset(data  = as.matrix(train_boot),
                                         label = as.matrix(target_boot))

    #build a small model on the replicate
    mdl <- lightgbm::lgb.train(data      = train_boot,
                               objective = objective,
                               nrounds   = 50,
                               verbose   = -1)

    #flag features whose gain beats the mean gain of this replicate
    imp <- lightgbm::lgb.importance(mdl)
    imp <- data.frame(feature   = imp$Feature,
                      important = ifelse(imp$Gain > mean(imp$Gain), 1, 0))
    out[[i]] <- imp
  }
  out <- do.call(rbind, out)
  out <- out %>%
    dplyr::group_by(feature) %>%
    dplyr::summarise(
      selected_count   = dplyr::n(),
      importance_count = sum(important),
      importance_perc  = sum(important) / dplyr::n()
    ) %>%
    dplyr::ungroup() %>%
    dplyr::arrange(-importance_perc)

  return(out)
}
# prescient/opinionated_pipelines documentation built on Dec. 10, 2019, 12:15 a.m.