#' @title Bagged Ensemble GLMNET Model.
#'
#' @description This function creates a bagged ensemble generalized linear model via penalized maximum likelihood given a dataset.
#' This function effictively acts as a wrapper for the glmnet package.
#' The GLMNET model is a very versitile regression model that incorporates feature selection using shrinkage parametres and cross validation.
#'
#' @param y_index A column index representing the response variable of the model.
#'
#' @param train A dataset for the GLMNET model to be trained on.
#' The order and names of train set should be the exact same as the test set.
#'
#' @param valid_size A natural number indicating the number of observations to be randomly sampled from the training data for model validation.
#'
#' @param test A dataset for the GLMNET model to predict for.
#' The order and names of test set should be the exact same as the train set.
#'
#' @param alpha The elasticnet mixing parameter, a numeric value between 0 and 1.
#' When alpha is 1, a LASSO model is fitted.
#' When alpha is 0, a Ridge Regression model is fitted.
#' When alpha is not 0 or 1, an Elastic Nets model is fitted.
#' Default is 1.
#'
#' @param family A character object indicating the type of response variable in the model.
#' Either one of; "gaussian", "binomial", "poisson", "multinomial", "cox" or "mgaussian".
#' Default is gaussian.
#'
#' @param type The type of prediction required.
#' Either one of; "link", "response", "coefficients", "nonzero" or "class".
#' Default is "link"
#'
#' @param n A natural number indicating the number of GLMNET models to be built.
#'
#' @param r The number of rows to be bagged.
#' Note r < nrow(train).
#'
#' @param r_replace A logical object allow resampling when bagging rows.
#' Default is FALSE.
#'
#' @param c The number of columns to be bagged
#' Note c < ncol(train)
#'
#' @param c_replace A logical object allowing resampling when bagging columns
#' Default is FALSE.
#'
#' @param standardize A logical object indicating whether the predictor variables X should be standardised.
#' Default is TRUE.
#'
#' @param nfolds The number of cross-vaidate folds to perform.
#' Default is 10.
#'
#' @param plots A logical object indicating whether plots should be constructed for each bagged model.
#'
#' @param file_name A character object indicating the file name when saving the data frame.
#' The default is NULL.
#' The name must include the .csv suffixs.
#'
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'
#' @param seed Logical, indicating whether a random seed should be implemented.
#'
#' @return Outputs a list of information related to the ensemble GLMNET model.
#' The first object of the list is a data frame of the response observations, the corresponding predictions and the error associated with the prediction.
#' The second object of the list is a data frame of model performance metrics.
#' The third object of the list is a vector of predictions / classifications for the specified test set.
#'
#' @import glmnet
#'
#' @export
#'
#' @seealso ensemble_mars, ensemble_mlr
#'
#' @keywords glmnet, ensemble modelling, prediction, classification, evaluation
#'
#' @examples
#' # Example 1
#' # Numeric prediction example with Iris Data
#' data <- iris[sample(1:150, size = 150, replace = FALSE),]
#' # subset training data
#' train_data <- data[1:150,-5]
#' # bag rows only
#' ensemble_glmnet(y_index = 1, train = train_data, valid_size = 50, n = 10, r = 80, alpha = 0.5, family = "gaussian", plots = T)
#' # bag columns only
#' ensemble_glmnet(y_index = 1, train = train_data, valid_size = 50, n = 10, c = 2)
#' # bagg rows and columns
#' ensemble_glmnet(y_index = 1, train = train_data, valid_size = 50, n = 10, r = 80, c = 2)
#'
#' # Example 2
#' # Binomial classication example with irisData
#' data <- iris[sample(1:150, size = 150, replace = FALSE),]
#' # Dummy encode the Species
#' data <- derive_variables(dataset = data, type = "dummy", integer = TRUE, return_dataset = TRUE)
#' # Convert the response variable into a binary factor with two class
#' data$Species_setosa <- as.factor(data$Species_setosa)
#' # Extract the test data
#' test <- data[101:50,c(1,2,3,4,6,7)]
#' # move Species_setosa to the front of the data frame
#' data <- data[,c(5,1,2,3,4,6,7)]
#' # fit a LASSO model with no bagging
#' ensemble_glmnet(y_index = 1, train = data, valid_size = 50, n = 10, alpha = 1, family = "binomial", type = "class")
#' # fit a Ridge Regression model with bagged rows
#' ensemble_glmnet(y_index = 1, train = data, valid_size = 50, n = 20, r = 80, alpha = 0, family = "binomial", type = "class", plots = FALSE)
#' # fit an Elastic Nets model to predict the test data with no validation
#' ensemble_glmnet(y_index = 1, train = data, valid_size = NULL, test = test, n = 10, alpha = 1, family = "binomial", type = "class")
#'
#' # Example 3
#' # Validation set with Multinomial Class prediction example using iris
#' data <- iris
#' data <- data[sample(1:150, size = 150, replace = FALSE),c(5,1,2,3,4)]
#' # plots
#' ense <- glmnet(y = as.vector(data[,1]), x = as.matrix(data[,-1]), alpha = 1, family = "binomial")
#' plot(ense)
#' ense <- cv.glmnet(y = as.vector(data[,1]), nfolds = 10, x = as.matrix(data[,-1]), alpha = 1, family = "binomial")
#' plot(ense)
#' # raw calculations
#' predict.cv.glmnet(object = cv.glmnet(y = as.vector(data[1:100,1]), x = as.matrix(data[1:100,-1]), alpha = 1, family = "multinomial"), newx = as.matrix(data[101:50,-1]), type = "class")
#'
#' # Example 4
#' # Test set with numeric prediction using titanic
#' descriptive_statistics(dataset = titanic)
#' str(titanic)
#' data <- titanic[,c(6,2,3,7,8,10)]
#' data <- data[order(data$Age),]
#' data$Age
#' train <- data[1:714, ]
#' valid_size <- 50
#' test <- data[715:891, ]
#' ensemble_glmnet(y_index = 1, train = train, valid = valid_size, test = test, n = 10, r = 600, c = 4)
#' ensemble_glmnet(y_index = 1, train = train, valid = valid_size, n = 10)
#'
#' # Example 5
#' # Possion Prediction with IrisData
#' counts = rpois(n = 150, lambda = 3)
#' data <- iris[sample(1:150, 150, FALSE), ]
#' data = cbind(counts, data)
#' train <- data[1:100,]
#' # test <- data[101:150,]
#' test <- data[101:150, -1]
#' ensemble_glmnet(y_index = 1, train = train, test = test, valid_size = 50, family = "poisson", type = "response")
#'
ensemble_glmnet <- function(y_index,
train,
valid_size = NULL,
test = NULL,
alpha = 1,
family = c("gaussian", "binomial", "poisson", "multinomial", "cox", "mgaussian"),
type = c("link", "response", "coefficients", "nonzero", "class"),
n = 10,
nfolds = 10,
r = NULL,
r_replace = FALSE,
c = NULL,
c_replace = FALSE,
standardize = TRUE,
plots = FALSE,
seed = TRUE)
{
# set a random seed
if (seed == TRUE){
set.seed(1234)
}
#-- (1) Preliminaries --#
# create a list to store all the relevant information regarding the model
lasso_list <- list()
# Confirm correct choice for family
family <- match.arg(family)
# Confirm correct choice for family
type <- match.arg(type)
# Ensure the training data is a data frame
train <- as.data.frame(train)
#-- (2) Prepare Validation Data --#
if(!is.null(valid_size) & family == "gaussian"){
# Create a data frame to hold the validations and observations
validations <- as.data.frame(matrix(ncol = 3,
nrow = valid_size))
# Add appropriate column names to the validations data frame
colnames(validations) <- c("Obs.", "Pred.", "Err.")
} else if(!is.null(valid_size) & family == "binomial"){
# Create a data frame to hold the validations and observations
validations <- as.data.frame(matrix(ncol = 3,
nrow = valid_size))
# Add appropriate column names to the validations data frame
colnames(validations) <- c("Obs.", "Class.", "Err.")
} else if(!is.null(valid_size) & family == "poisson"){
# Create a data frame to hold the validations and observations
validations <- as.data.frame(matrix(ncol = 3,
nrow = valid_size))
# Add appropriate column names to the validations data frame
colnames(validations) <- c("Obs.", "Pred.", "Err.")
}
#-- (3) Prepare Test Data --#
# Create a data frame to hold the test predictions
if(!is.null(test)){
# Ensure the test data is a dataframe
test <- as.data.frame(test)
# Save the predictor variables from the testing data
# x_test <- test[, -y_index]
x_test <- test
predictions <- as.data.frame(matrix(ncol = n,
nrow = nrow(test)))
}
# create m a model index
mindex <- 1
# Use a for loop to create the models and bag the data
for( i in 1:n) {
#-----------------------------------------------------------------------------#
# Data Processing #
#-----------------------------------------------------------------------------#
#--(1) Process Training Data --#
if(!is.null(valid_size)){
# Randomly Subset the validation data from the Training data
valid <- train[sample(x = 1:nrow(train), size = valid_size, replace = FALSE), ]
}
# Extract the observed response variable from the remaining training data
y_train <- train[, y_index]
# Save the predictor variables from the remaining training data
x_train <- train[, -y_index]
#--(2) Process Validation Data --#
if(!is.null(valid_size)){
# Extract the observed responsevariable from the validation data
y_valid <- valid[, y_index]
# Save the predictor variables from the validation data
x_valid <- valid[, -y_index]
}
#-----------------------------------------------------------------------#
# Build the GLMNET models #
#-----------------------------------------------------------------------#
#-------------------------------------------------------#
# When r is NULL and c is NULL #
#-------------------------------------------------------#
# When r and c are both null, then a regular un bagged glmnet is fitted
if(is.null(r) & is.null(c)){
#-- (1) Bag Training Data --#
# subset the training response variable with the bagged rows
rand_y_train <- y_train
# subset the training predictor variables with the bagged rows and columns
rand_x_train <- x_train
#-- (2) Bag Validation Data --#
if(!is.null(valid_size)){
# subset the validation predictor variables with the columns
rand_x_valid <- x_valid
# subset the validation response variable with the columns
rand_y_valid <- y_valid[]
}
#-- (3) Bag Testing Data --#
if(!is.null(test)){
# subset the test predictor variables with the columns
rand_x_test <- x_test
}
#-----------------------------------------------------------#
# When r is not NULL and c is NULL #
#-----------------------------------------------------------#
# When r is not NULL and c is NULL, then bagging occurs on the rows
} else if(!is.null(r) & is.null(c)){
#-- (1) Bag Training Data --#
# randomly sample the rows of the training data
rindices <- sample(x = 1:nrow(x_train), size = r, replace = r_replace)
# subset the training response variable with the bagged rows
rand_y_train <- y_train[rindices]
# subset the training predictor variables with the bagged rows and columns
rand_x_train <- x_train[rindices, ]
#-- (2) Bag Validation Data --#
if(!is.null(valid_size)){
# subset the validation predictor variables with the columns
rand_x_valid <- x_valid[,]
# subset the validation response variable with the columns
rand_y_valid <- y_valid
}
#-- (3) Bag Testing Data --#
if(!is.null(test)){
# subset the test predictor variables with the columns
rand_x_test <- x_test[,]
}
#-----------------------------------------------------------#
# When r is NULL and c is not NULL #
#-----------------------------------------------------------#
# When r is NULL and c is not NULL, then bagging occurs on the columns
} else if(is.null(r) & !is.null(c)){
#-- (1) Bag Training Data --#
# randomly sample the columns of the training data
cindices <- sample(x = 1:ncol(x_train), size = c, replace = c_replace)
# subset the training response variable with the bagged rows
rand_y_train <- y_train
# subset the training predictor variables with the bagged rows and columns
rand_x_train <- x_train[, cindices]
#-- (2) Bag Validation Data --#
if(!is.null(valid_size)){
# subset the validation predictor variables with the columns
rand_x_valid <- x_valid[, cindices]
# subset the validation response variable with the columns
rand_y_valid <- y_valid
}
#-- (3) Bag Testing Data --#
if(!is.null(test)){
# subset the test predictor variables with the columns
rand_x_test <- x_test[,cindices]
}
#---------------------------------------------------------------#
# When r is not NULL and c is not NULL #
#---------------------------------------------------------------#
# When r is not NULL and c is not NULL, then bagging occurs on both the rowsand columns
} else if(!is.null(r) & !is.null(c)){
#-- (1) Bag Training Data --#
# randomly sample the rows of the training data
rindices <- sample(x = 1:nrow(x_train), size = r, replace = r_replace)
# randomly sample the columns of the training data
cindices <- sample(x = 1:ncol(x_train), size = c, replace = c_replace)
# subset the training response variable with the bagged rows
rand_y_train <- y_train[rindices]
# subset the training predictor variables with the bagged rows and columns
rand_x_train <- x_train[rindices, cindices]
#-- (2) Bag Validation Data --#
if(!is.null(valid_size)){
# subset the validation predictor variables with the columns
rand_x_valid <- x_valid[, cindices]
# subset the validation response variable with the columns
rand_y_valid <- y_valid
}
#-- (3) Bag Testing Data --#
if(!is.null(test)){
# subset the test predictor variables with the columns
rand_x_test <- x_test[, cindices]
}
}
#-------------------------------------------------------------#
# Fit the GLMNET Models #
#-------------------------------------------------------------#
Y <- as.vector(rand_y_train)
X <- as.matrix(rand_x_train)
if(plots == TRUE){
# fit Basic GLMNET model
GLMNETmodel <- glmnet(x = X,
y = Y,
family = family,
alpha = alpha,
standardize = standardize)
# Plot the coefficients plot
plot(x = GLMNETmodel, label = T)
# fit a cross validate GLMNET model
cvGLMNETmodel <- cv.glmnet(x = X,
y = Y,
nfolds = 10,
family = family,
alpha = alpha,
standardize = standardize)
# Plot the mean squared error log(lambda) plot
plot(cvGLMNETmodel)
}
#----------------------------------------------------#
# Make validations on the valid set using the glmnet #
#----------------------------------------------------#
if(!is.null(valid_size)){
mod_valid_pred <- predict.cv.glmnet(object = cv.glmnet(x = X,
y = Y,
nfolds = nfolds,
family = family,
alpha = alpha,
standardize = standardize),
newx = as.matrix(rand_x_valid),
type = type)
if(family == "gaussian"){
# Update the Validation Data Frame
validations[1:nrow(valid) + (nrow(valid) * (mindex - 1)), 1] <- as.vector(rand_y_valid)
validations[1:nrow(valid) + (nrow(valid) * (mindex - 1)), 2] <- as.vector(mod_valid_pred)
validations[1:nrow(valid) + (nrow(valid) * (mindex - 1)), 3] <- as.vector(rand_y_valid) - as.vector(mod_valid_pred)
} else if(family == "binomial"){
# Update the Validation Data Frame
validations[1:nrow(valid) + (nrow(valid) * (mindex - 1)), 1] <- as.vector(rand_y_valid)
validations[1:nrow(valid) + (nrow(valid) * (mindex - 1)), 2] <- as.vector(mod_valid_pred)
} else if(family == "possion"){
# Update the Validation Data Frame
validations[1:nrow(valid) + (nrow(valid) * (mindex - 1)), 1] <- as.vector(rand_y_valid)
validations[1:nrow(valid) + (nrow(valid) * (mindex - 1)), 2] <- as.vector(mod_valid_pred)
validations[1:nrow(valid) + (nrow(valid) * (mindex - 1)), 3] <- as.vector(rand_y_valid) - as.vector(mod_valid_pred)
}
}
#---------------------------------------------------#
# Make predictions on the test set using the glmnet #
#---------------------------------------------------#
if(!is.null(test) & family == "gaussian"){
# Make predictions using the test dataset
mod_test_pred <- predict.cv.glmnet(object = cv.glmnet(x = X,
y = Y,
nfolds = nfolds,
family = family,
alpha = alpha,
standardize = standardize),
newx = as.matrix(rand_x_test),
type = type)
# Update the Predictions Data Frame
predictions[,i] <- mod_test_pred
# assign a column name to the data frame
colnames(predictions)[i] <- paste("Model", mindex, sep = " ")
} else if (!is.null(test) & family == "binomial"){
# Make predictions using the test dataset
mod_test_pred <- predict.cv.glmnet(object = cv.glmnet(x = X,
y = Y,
nfolds = nfolds,
family = family,
alpha = alpha,
standardize = standardize),
newx = as.matrix(rand_x_test),
type = type)
# Update the Predictions Data Frame
predictions[,i] <- mod_test_pred
# assign a column name to the data frame
colnames(predictions)[i] <- paste("Model", mindex, sep = " ")
} else if(!is.null(test) & family == "poisson"){
# Make predictions using the test dataset
mod_test_pred <- predict.cv.glmnet(object = cv.glmnet(x = X,
y = Y,
nfolds = nfolds,
family = family,
alpha = alpha,
standardize = standardize),
newx = as.matrix(rand_x_test),
type = type)
# Update the Predictions Data Frame
predictions[,i] <- mod_test_pred
# assign a column name to the data frame
colnames(predictions)[i] <- paste("Model", mindex, sep = " ")
}
#-------------------------------#
# Print a Model Progress Report #
#-------------------------------#
# print a model index update
print(paste("Model", mindex, "of", n, "Completed.", sep = " "))
# update the model index
mindex <- mindex + 1
}
#---------------------------------------------------------------------------#
# Evaluate the Mode #
#---------------------------------------------------------------------------#
if(!is.null(valid_size)){
if(family == "gaussian"){
#-- (1) Evaulation Metrics using the Validation Set --#
# Create a data frame to hold the Model Evaluation Metrics
metrics <- as.data.frame(x = matrix(nrow = 1,
ncol = 7))
# Add approptiate column names to the metrics data frame
colnames(metrics) <- c("SSE", "SAE", "MSE", "MAE", "RMSE", "RMAE", "Rsq")
# Calculate the sum squared error
metrics[1,1] <- sum((validations$Err.)^2)
metrics[1,2] <- sum(abs(validations$Err.))
metrics[1,3] <- mean((validations$Err.)^2)
metrics[1,4] <- mean(abs(validations$Err.))
metrics[1,5] <- sqrt(mean((validations$Err.)^2))
metrics[1,6] <- sqrt(mean(abs(validations$Err.)))
metrics[1,7] <- 1 - (sum(validations$Err.^2) / sum((validations$Obs. - mean(validations$Pred.))^2))
# Update the lasso list with the relevant information
#lasso_list[[1]] <- validations
lasso_list[[2]] <- metrics
#-- (2) Visualisations using the validation set --#
if(plots == TRUE){
# Residual vs Fits Plot
p1 <- plot(x = validations$Pred.,
y = validations$Err.,
main = "Residuals vs Fits Plot",
xlab = "Fits",
ylab = "Residuals")
print(p1)
# Response vs Fits Plot
p2 <- plot(x = validations$Pred.,
y = validations$Obs.,
main = "Response vs Fits Plot",
xlab = "Fits",
ylab = "Response")
print(p2)
# Residual vs Response Plot
p3 <- plot(x = validations$Obs.,
y = validations$Err.,
main = "Residuals vs Response Plot",
xlab = "Response",
ylab = "Residuals")
print(p3)
}
} else if(family == "binomial"){
#-- (1) Evaulation Metrics using the Validation Set --#
# Assign Column Names
colnames(validations) <- c("Obs.", "Class.", "Err.")
# Concert the columns to factors
validations[,1] <- as.factor(validations[,1])
validations[,2] <- as.factor(validations[,2])
validations[,3] <- as.factor(validations[,3])
# add in the apropriate levels for the Err variable
levels(validations[,1]) <- levels(validations[,2])
levels(validations[,3]) <- c("FN", "FP", "TN", "TP")
# use a for loop with if else statements to fill in the classification error
for(i in 1:nrow(validations)){
if(validations[i, 1] == "1" & validations[i, 2] == "1"){
validations[i, 3] <- "TP"
} else if(validations[i, 1] == "0" & validations[i, 2] == "0"){
validations[i, 3] <- "TN"
} else if(validations[i, 1] == "1" & validations[i, 2] == "0"){
validations[i, 3] <- "FN"
} else if(validations[i, 1] == "0" & validations[i, 2] == "1"){
validations[i, 3] <- "FP"
}
}
# Basic Measures
STP <- length(validations[validations$Err. == "TP", 3])
STN <- length(validations[validations$Err. == "TN", 3])
SFP <- length(validations[validations$Err. == "FP", 3])
SFN <- length(validations[validations$Err. == "FN", 3])
# Calulate Performance Metrics
ACC <- (STP + STN) / (STP + STN + SFP + SFN)
BACC <- ((STP / (STP + SFP)) + (STN / (STN + SFN))) / 2
TPR <- STP /(STP + SFN)
TNR <- STN / (STN + SFP)
PPR <- STP / (STP + SFP)
NPR <- STN / (STN + SFN)
F1S <- (2 * STP) / (2*STP + SFP + SFN)
# Create a data frame to hold the Model Evaluation Metrics
metrics <- as.data.frame(x = matrix(nrow = 1,
ncol = 7))
# Add approptiate column names to the metrics data frame
colnames(metrics) <- c("ACC", "BACC", "TPR", "TNR", "PPR", "NPR", "F1S")
# Save Performance Metrics into the metrics data frame
metrics[1,1] <- ACC
metrics[1,2] <- BACC
metrics[1,3] <- TPR
metrics[1,4] <- TNR
metrics[1,5] <- PPR
metrics[1,6] <- NPR
metrics[1,7] <- F1S
# Update the lasso list with the relevant information
#lasso_list[[1]] <- validations
lasso_list[[2]] <- metrics
} else if(family == "poisson"){
#-- (1) Evaulation Metrics using the Validation Set --#
# Create a data frame to hold the Model Evaluation Metrics
metrics <- as.data.frame(x = matrix(nrow = 1,
ncol = 7))
# Add approptiate column names to the metrics data frame
colnames(metrics) <- c("SSE", "SAE", "MSE", "MAE", "RMSE", "RMAE", "Rsq")
# Calculate the sum squared error
metrics[1,1] <- sum((validations$Err.)^2)
metrics[1,2] <- sum(abs(validations$Err.))
metrics[1,3] <- mean((validations$Err.)^2)
metrics[1,4] <- mean(abs(validations$Err.))
metrics[1,5] <- sqrt(mean((validations$Err.)^2))
metrics[1,6] <- sqrt(mean(abs(validations$Err.)))
metrics[1,7] <- 1 - (sum(validations$Err.^2) / sum((validations$Obs. - mean(validations$Pred.))^2))
# Update the lasso list with the relevant information
#lasso_list[[1]] <- validations
lasso_list[[2]] <- metrics
} else if(family == "multinomial"){
}
}
#----------------------------------------------#
# Calculate Final Predictions for Test Set #
#----------------------------------------------#
if(!is.null(test) & family == "gaussian"){
# apply mean row wise
test_pred <- apply(X = predictions, MARGIN = 1, FUN = function(x) mean(x))
# save the test predictions to the lasso list
lasso_list[[3]] <- test_pred
} else if(!is.null(test) & family == "binomial"){
# find the mode row wise
test_pred <- apply(X = predictions, MARGIN = 1, FUN = function(x) Mode(x))
# save the test predictions to the lasso list
lasso_list[[3]] <- test_pred
} else if(!is.null(test) & family == "poisson"){
# apply mean row wise
test_pred <- apply(X = predictions, MARGIN = 1, FUN = function(x) mean(x))
# save the test predictions to the lasso list
lasso_list[[3]] <- test_pred
}
# return the lasso list
return(lasso_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.