R/ensemble_mars.R

#' @title Bagged Ensemble MARS Model.
#' 
#' @description This function creates a bagged ensemble multivariate adaptive regression splines (MARS) model given a dataset. 
#' This function effictively acts as a wrapper for the earth package.
#' The MARS model is a very versitile regression model that incorporates feature selection using shrinkage parametres and cross validation.
#' 
#' @param y_index A column index representing the response variable of the model.
#' 
#' @param train A dataset for the MARS model to be trained on. 
#' The order and names of train set should be the exact same as the test set.
#' 
#' @param valid_size A natural number indicating the number of observations to be randomly sampled from the training data for model validation.
#' 
#' @param test A dataset for the GLMNET model to predict for.
#' The order and names of test set should be the exact same as the train set.
#' 
#' @param family A character object indicating the type of response variable in the model.
#' Either one of; "gaussian", "binomial", "poisson".
#' Default is gaussian.
#' 
#' @param degree An optional integer specifying maximum interaction degree (default is 1).
#' Default is 1.
#' 
#' @param pmethod Pruning method. One of: "backward", "none", "exhaustive", "forward", "seqrep" or "cv".
#' Default is "backward".
#' 
#' @param nprune an optional integer specifying the maximum number of model terms.
#' Default is NULL.
#' 
#' @param nfold Number of cross-validation folds.
#' Default is 0.
#' 
#' @param n A natural number indicating the number of GLMNET models to be built.
#' 
#' @param r The number of rows to be bagged. 
#' Note r < nrow(train).
#' 
#' @param r_replace A logical object allow resampling when bagging rows.
#' Default is FALSE.
#' 
#' @param c The number of columns to be bagged
#' Note c < ncol(train)
#' 
#' @param c_replace A logical object allowing resampling when bagging columns
#' Default is FALSE.
#' 
#' @param type The type of prediction required.
#' Either one of; "link", "response", "earth", "class" or "terms".
#' Default is "link"
#' 
#' @param  plots A logical object indicating whether plots should be constructed for each bagged model.
#' 
#' @param file_name A character object indicating the file name when saving the data frame.
#' The default is NULL.
#' The name must include the .csv suffixs.
#'
#' @param seed Logical, indicating whether a random seed should be implemented.
#'
#' @param directory A character object specifying the directory where the data frame is to be saved as a .csv file.
#'
#' @return Outputs a list of information related to the ensemble GLMNET model.
#' The first object of the list is a data frame of the response observations, the corresponding predictions and the error associated with the prediction.
#' The second object of the list is a data frame of model performance metrics.
#' The third object of the list is a vector of predictions / classifications for the specified test set.
#' 
#' @import earth
#' 
#' @export
#' 
#' @seealso ensemble_mars, ensemble_mlr
#' 
#' @keywords mars, ensemble modelling, prediction, classification, evaluation
#' 
#' @examples
#' # Example 1
#' # set data
#' data <- iris[sample(1:150, 150, FALSE), ]
#' train <- data[1:100,]
#' test <- data[101:150, -1]
#' ensemble_mars(y_index = 1, train = train, test = test, valid_size = 50, family = "gaussian")
#' 
#' # Example 2
#' # Binomial classication example with irisData
#' data <- iris[sample(1:150, size = 150, replace = FALSE),]
#' # Dummy encode the Species
#' data <- derive_variables(dataset = data, type = "dummy", integer = TRUE, return_dataset = TRUE)
#' # Convert the response variable into a binary factor with two class
#' data$Species_setosa <- as.factor(data$Species_setosa)
#' # Extract the test data
#' test <- data[101:50,c(5,1,2,3,4,6,7)]
#' # move Species_setosa to the front of the data frame
#' # data <- data[,c(5,1,2,3,4,6,7)]
#' data <- data[,c(5,1,2,3,4,6,7)]
#' # fit a MARS model with no bagging
#' ensemble_mars(y_index = 1, train = data, test = test, valid_size = 50, family = "binomial", type = "class")
#' 
#' # Example 3  
#' # Possion Prediction with IrisData
#' counts = rpois(n = 150, lambda = 3)
#' data <- iris[sample(1:150, 150, FALSE), ]
#' data = cbind(counts, data)
#' train <- data[1:100,]
#' # test <- data[101:150,]
#' test <- data[101:150, -1]
#' ensemble_mars(y_index = 1, train = train, test = test, valid_size = 50, family = "poisson", type = "response")
#' 
#' 
ensemble_mars <- function(y_index, 
                          train, 
                          valid_size = NULL, 
                          test = NULL, 
                          pmethod =  c("backward", "none", "exhaustive", "forward", "seqrep", "cv"), 
                          family = c("gaussian", "binomial", "poisson"), 
                          type = c("link", "response", "earth", "class", "terms", "poisson"), 
                          degree = 1, 
                          nprune = NULL, 
                          nfold = 0, 
                          n = 10, 
                          r = NULL, 
                          r_replace = FALSE, 
                          c = NULL, 
                          c_replace = FALSE, 
                          plots = FALSE,
                          seed = TRUE)
{
  
  # set a random seed
  if (seed == TRUE){
    
    set.seed(1234)
    
  }
  
  #-- (1) Preliminaries --#
  
  # create a list to store all the relevant information regarding the model
  mars_list <- list()
  
  # Confirm correct choice for family
  family <- match.arg(family)
  
  # Confirm correct choice for family
  type <- match.arg(type)
  
  # Confirm correct choice for pmethod
  pmethod <- match.arg(pmethod)
  
  # Ensure the training data is a data frame
  train <- as.data.frame(train)
  
  #-- (2) Prepare Validation Data Frame --#
  
  if(!is.null(valid_size) & family == "gaussian"){
    
    # Create a data frame to hold the validations and observations
    validations <- as.data.frame(matrix(ncol = 3, 
                                        nrow = valid_size))
    
    # Add appropriate column names to the validations data frame
    colnames(validations) <- c("Obs.", "Pred.", "Err.")
    
  } else if(!is.null(valid_size) & family == "binomial"){
    
    # Create a data frame to hold the validations and observations
    validations <- as.data.frame(matrix(ncol = 3, 
                                        nrow = valid_size))
    
    # Add appropriate column names to the validations data frame
    colnames(validations) <- c("Obs.", "Class.", "Err.")
    
  } else if(!is.null(valid_size) & family == "poisson"){
    
    # Create a data frame to hold the validations and observations
    validations <- as.data.frame(matrix(ncol = 3, 
                                        nrow = valid_size))
    
    # Add appropriate column names to the validations data frame
    colnames(validations) <- c("Obs.", "Pred.", "Err.")
    
  }
  
  #-- (3) Prepare Test Data Frame --#
  
  # Create a data frame to hold the test predictions
  if(!is.null(test)){
    
    # Ensure the test data is a dataframe
    test <- as.data.frame(test)
    
    # Save the predictor variables from the testing data
    # x_test <- test[, -y_index] (update removed -y_index for test set)
    x_test <- test
    predictions <- as.data.frame(matrix(ncol = n, 
                                        nrow = nrow(test)))
    
  }
  
  # create m a model index
  mindex <-  1
  
  # Use a for loop to create the models and bag the data
  for( i in 1:n) {
    
    #-----------------------------------------------------------------------------#
    # Data Processing                                                             #
    #-----------------------------------------------------------------------------#
    
    #--(1) Process Training Data --#
    
    if(!is.null(valid_size)){
      
      # Randomly Subset the validation data from the Training data
      valid <- train[sample(x = 1:nrow(train), 
                            size = valid_size, 
                            replace = FALSE), ]
      
    }
    
    # Extract the observed response variable from the remaining training data
    y_train <- train[, y_index]
    
    # Save the predictor variables from the remaining training data
    x_train <- train[, -y_index] 
    
    #--(2) Process Validation Data --#
    
    if(!is.null(valid_size)){
      
      # Extract the observed responsevariable from the validation data
      y_valid <- valid[, y_index]
      
      # Save the predictor variables from the validation data
      x_valid <- valid[, -y_index]
    }
    
    #-----------------------------------------------------------------------#
    # Build the GLMNET models                                               #
    #-----------------------------------------------------------------------#
    
    #-------------------------------------------------------#
    # When r is NULL and c is NULL                          #
    #-------------------------------------------------------#
    
    # When r and c are both null, then a regular un bagged glmnet is fitted
    if(is.null(r) & is.null(c)){
      
      #-- (1) Bag Training Data --#
      
      # subset the training response variable with the bagged rows
      rand_y_train <- y_train
      
      # subset the training predictor variables with the bagged rows and columns
      rand_x_train <- x_train
      
      #-- (2) Bag Validation Data --#
      
      if(!is.null(valid_size)){
        
        # subset the validation predictor variables with the columns
        rand_x_valid <- x_valid
        
        # subset the validation response variable with the columns
        rand_y_valid <- y_valid
        
      }
      
      #-- (3) Bag Testing Data --#
      
      if(!is.null(test)){
        
        # subset the test predictor variables with the columns
        rand_x_test <- x_test
      }
      
      #-----------------------------------------------------------#
      # When r is not NULL and c is NULL                          #
      #-----------------------------------------------------------#
      
      # When r is not NULL and c is NULL, then bagging occurs on the rows
    } else if(!is.null(r) & is.null(c)){
      
      #-- (1) Bag Training Data --#
      
      # randomly sample the rows of the training data
      rindices <- sample(x = 1:nrow(x_train), size = r, replace = r_replace)
      
      # subset the training response variable with the bagged rows
      rand_y_train <- y_train[rindices]
      
      # subset the training predictor variables with the bagged rows and columns
      rand_x_train <- x_train[rindices, ]
      
      #-- (2) Bag Validation Data --#
     
      if(!is.null(valid_size)){
        
        # subset the validation predictor variables with the columns
        rand_x_valid <- x_valid
        
        # subset the validation response variable with the columns
        rand_y_valid <- y_valid
      }
      
      #-- (3) Bag Testing Data --#
      
      if(!is.null(test)){
        
        # subset the test predictor variables with the columns
        rand_x_test <- x_test[,]
        
      }
      
      #-----------------------------------------------------------#
      # When r is NULL and c is not NULL                          #
      #-----------------------------------------------------------#
      
      # When r is NULL and c is not NULL, then bagging occurs on the columns
    } else if(is.null(r) & !is.null(c)){
      
      #-- (1) Bag Training Data --#
      
      # randomly sample the columns of the training data
      cindices <- sample(x = 1:ncol(x_train), size = c, replace = c_replace)
      
      # subset the training response variable with the bagged rows
      rand_y_train <- y_train
      
      # subset the training predictor variables with the bagged rows and columns
      rand_x_train <- x_train[, cindices]
      
      
      #-- (2) Bag Validation Data --#
      
      if(!is.null(valid_size)){
        
        # subset the validation predictor variables with the columns
        rand_x_valid <- x_valid[, cindices]
        
        # subset the validation response variable with the columns
        rand_y_valid <- y_valid
        
      }
      
      #-- (3) Bag Testing Data --#
      
      if(!is.null(test)){
        
        # subset the test predictor variables with the columns
        rand_x_test <- x_test[,cindices]
        
      }
      #---------------------------------------------------------------#
      # When r is not NULL and c is not NULL                          #
      #---------------------------------------------------------------#
      
      # When r is not NULL and c is not NULL, then bagging occurs on both the rowsand columns
    } else if(!is.null(r) & !is.null(c)){
      
      #-- (1) Bag Training Data --#
      
      # randomly sample the rows of the training data
      rindices <- sample(x = 1:nrow(x_train), size = r, replace = r_replace)
      
      # randomly sample the columns of the training data
      cindices <- sample(x = 1:ncol(x_train), size = c, replace = c_replace)
      
      # subset the training response variable with the bagged rows
      rand_y_train <- y_train[rindices]
      
      # subset the training predictor variables with the bagged rows and columns
      rand_x_train <- x_train[rindices, cindices]
      
      #-- (2) Bag Validation Data --#
      
      if(!is.null(valid_size)){
        
        # subset the validation predictor variables with the columns
        rand_x_valid <- x_valid[, cindices]
        
        # subset the validation response variable with the columns
        rand_y_valid <- y_valid
        
      }
      
      #-- (3) Bag Testing Data --#
      
      if(!is.null(test)){
        
        # subset the test predictor variables with the columns
        rand_x_test <- x_test[, cindices]
        
      }
    }
    
    #-------------------------------------------------------------#
    # Fit the Mars Models                                         #
    #-------------------------------------------------------------#
    
    
    Y <- rand_y_train
    X <- rand_x_train
   
    # fit the MARS model
    MARSmodel <- earth(x = X, 
                       y = Y, 
                       glm = list(family = family),
                       degree = degree,
                       pmethod = pmethod,
                       nprune = nprune,
                       nfold = nfold)
    
    if(plots == TRUE){
      
      
    }
    
    #----------------------------------------------------#
    # Make validations on the valid set using the glmnet #
    #----------------------------------------------------#
    
    if(!is.null(valid_size)){
      
      mod_valid_pred <- predict(object = MARSmodel, 
                                newdata = rand_x_valid,
                                type = type)
      
      if(family == "gaussian"){ 
        
        # Update the Validation Data Frame
        row_update <- valid_size * (mindex - 1)
        validations[(1:valid_size) + row_update, 1] <- rand_y_valid
        validations[(1:valid_size) + row_update, 2] <- mod_valid_pred
        
        # calculate error
        error <- as.vector(rand_y_valid) - as.vector(mod_valid_pred)
        validations[(1:valid_size) + row_update, 3] <- error
        
      } else if(family == "binomial"){
        
        # Update the Validation Data Frame
        validations[1:nrow(valid) + (nrow(valid) * (mindex - 1)), 1] <- as.vector(rand_y_valid)
        validations[1:nrow(valid) + (nrow(valid) * (mindex - 1)), 2] <- as.vector(mod_valid_pred)
        
      } else if(family == "poisson"){
        
        # Update the Validation Data Frame
        row_update <- valid_size * (mindex - 1)
        validations[(1:valid_size) + row_update, 1] <- rand_y_valid
        validations[(1:valid_size) + row_update, 2] <- mod_valid_pred
        
        # calculate error
        error <- as.vector(rand_y_valid) - as.vector(mod_valid_pred)
        validations[(1:valid_size) + row_update, 3] <- error
        
      }
      
    }
    
    #---------------------------------------------------#
    # Make predictions on the test set using the glmnet #
    #---------------------------------------------------#
    
    if(!is.null(test)  & family == "gaussian"){
      
      # Make predictions using the test dataset
      mod_test_pred <- predict(object = MARSmodel, 
                               newdata = rand_x_test,
                               type = type)
      
      # Update the Predictions Data Frame
      predictions[,i] <- mod_test_pred
      
      # assign a column name to the data frame
      colnames(predictions)[i] <- paste("Model", mindex, sep = " ")
      
    } else if (!is.null(test)  & family == "binomial"){
      
      # Make predictions using the test dataset
      mod_test_pred <- predict(object = MARSmodel, 
                               newdata = rand_x_test,
                               type = type)
      
      # Update the Predictions Data Frame
      predictions[,i] <- mod_test_pred
      
      # assign a column name to the data frame
      colnames(predictions)[i] <- paste("Model", mindex, sep = " ")
      
    } else if (!is.null(test) & family == "poisson"){
      
      # Make predictions using the test dataset
      mod_test_pred <- predict(object = MARSmodel, 
                               newdata = rand_x_test,
                               type = type)
      
      # Update the Predictions Data Frame
      predictions[,i] <- mod_test_pred
      
      # assign a column name to the data frame
      colnames(predictions)[i] <- paste("Model", mindex, sep = " ")
      
    }
    
    #-------------------------------#
    # Print a Model Progress Report #
    #-------------------------------#
    
    # print a model index update
    print(paste("Model", mindex, "of", n, "Completed.", sep = " "))
    
    # update the model index
    mindex <- mindex + 1
  }
  
  #---------------------------------------------------------------------------#
  # Evaluate the Mode                                                         #
  #---------------------------------------------------------------------------#
  
  if(!is.null(valid_size)){
    
    if(family == "gaussian"){
      
      #-- (1) Evaulation Metrics using the Validation Set --#
      
      # Create a data frame to hold the Model Evaluation Metrics
      metrics <- as.data.frame(x = matrix(nrow = 1,
                                          ncol = 7))
      
      # Add approptiate column names to the metrics data frame
      colnames(metrics) <- c("SSE", "SAE", "MSE", "MAE", "RMSE", "RMAE", "Rsq")
      
      # Calculate the sum squared error
      metrics[1,1] <- sum((validations$Err.)^2)
      metrics[1,2] <- sum(abs(validations$Err.))
      metrics[1,3] <- mean((validations$Err.)^2)
      metrics[1,4] <- mean(abs(validations$Err.))
      metrics[1,5] <- sqrt(mean((validations$Err.)^2))
      metrics[1,6] <- sqrt(mean(abs(validations$Err.)))
      metrics[1,7] <- 1 - (sum(validations$Err.^2) / sum((validations$Obs. - mean(validations$Pred.))^2))
      
      # Update the lasso list with the relevant information
      #mars_list[[1]] <- validations
      mars_list[[2]] <- metrics
      
      #-- (2) Visualisations using the validation set --#
      
      if(plots == TRUE){
        
        # Residual vs Fits Plot
        p1 <- plot(x = validations$Pred., 
                   y = validations$Err., 
                   main = "Residuals vs Fits Plot", 
                   xlab = "Fits", 
                   ylab = "Residuals")
        
        print(p1)
        
        # Response vs Fits Plot
        p2 <- plot(x = validations$Pred., 
                   y = validations$Obs., 
                   main = "Response vs Fits Plot", 
                   xlab = "Fits", 
                   ylab = "Response")
        
        print(p2)
        
        # Residual vs Response Plot
        p3 <- plot(x = validations$Obs., 
                   y = validations$Err., 
                   main = "Residuals vs Response Plot", 
                   xlab = "Response", 
                   ylab = "Residuals")
        
        print(p3)
      }
      
    } else if(family == "binomial"){
      
      #-- (1) Evaulation Metrics using the Validation Set --#
      
      # Assign Column Names
      colnames(validations) <- c("Obs.", "Class.", "Err.")
      
      # Concert the columns to factors
      validations[,1] <- as.factor(validations[,1])
      validations[,2] <- as.factor(validations[,2])
      validations[,3] <- as.factor(validations[,3])
      
      # add in the apropriate levels for the Err variable
      levels(validations[,1]) <- levels(validations[,2])
      levels(validations[,3]) <- c("FN", "FP", "TN", "TP")
      
      # use a for loop with if else statements to fill in the classification error
      for(i in 1:nrow(validations)){
        if(validations[i, 1] == "1" & validations[i, 2] == "1"){
          validations[i, 3] <- "TP"
        } else if(validations[i, 1] == "0" & validations[i, 2] == "0"){
          validations[i, 3] <- "TN"
        } else if(validations[i, 1] == "1" & validations[i, 2] == "0"){
          validations[i, 3] <- "FN"
        } else if(validations[i, 1] == "0" & validations[i, 2] == "1"){
          validations[i, 3] <- "FP"
        }
      }
      
      # Basic Measures
      STP <- length(validations[validations$Err. == "TP", 3])
      STN <- length(validations[validations$Err. == "TN", 3])
      SFP <- length(validations[validations$Err. == "FP", 3])
      SFN <- length(validations[validations$Err. == "FN", 3])
      
      # Calulate Performance Metrics
      ACC <- (STP + STN) / (STP + STN + SFP + SFN)
      BACC <- ((STP / (STP + SFP)) + (STN / (STN + SFN))) / 2
      TPR <- STP /(STP + SFN)
      TNR <- STN / (STN + SFP)
      PPR <- STP / (STP + SFP)
      NPR <- STN / (STN + SFN)
      F1S <- (2 * STP) / (2*STP + SFP + SFN)
      
      # Create a data frame to hold the Model Evaluation Metrics
      metrics <- as.data.frame(x = matrix(nrow = 1,
                                          ncol = 7))
      
      # Add approptiate column names to the metrics data frame
      colnames(metrics) <- c("ACC", "BACC", "TPR", "TNR", "PPR", "NPR", "F1S")
      
      # Save Performance Metrics into the metrics data frame
      metrics[1,1] <- ACC
      metrics[1,2] <- BACC
      metrics[1,3] <- TPR
      metrics[1,4] <- TNR
      metrics[1,5] <- PPR
      metrics[1,6] <- NPR
      metrics[1,7] <- F1S
      
      # Update the lasso list with the relevant information
      #mars_list[[1]] <- validations
      mars_list[[2]] <- metrics
      
    } else if(family == "poisson"){
      
      #-- (1) Evaulation Metrics using the Validation Set --#
      
      # Create a data frame to hold the Model Evaluation Metrics
      metrics <- as.data.frame(x = matrix(nrow = 1,
                                          ncol = 7))
      
      # Add approptiate column names to the metrics data frame
      colnames(metrics) <- c("SSE", "SAE", "MSE", "MAE", "RMSE", "RMAE", "Rsq")
      
      # Calculate the sum squared error
      metrics[1,1] <- sum((validations$Err.)^2)
      metrics[1,2] <- sum(abs(validations$Err.))
      metrics[1,3] <- mean((validations$Err.)^2)
      metrics[1,4] <- mean(abs(validations$Err.))
      metrics[1,5] <- sqrt(mean((validations$Err.)^2))
      metrics[1,6] <- sqrt(mean(abs(validations$Err.)))
      metrics[1,7] <- 1 - (sum(validations$Err.^2) / sum((validations$Obs. - mean(validations$Pred.))^2))
      
      # Update the lasso list with the relevant information
      #mars_list[[1]] <- validations
      mars_list[[2]] <- metrics
      
      #-- (2) Visualisations using the validation set --#
      
    } else if(family == "multinomial"){
      
    }
  }
  
  #----------------------------------------------#
  # Calculate Final Predictions for Test Set     #
  #----------------------------------------------#
  
  if(!is.null(test) & family == "gaussian"){
    
    # apply mean row wise
    test_pred <- apply(X = predictions, MARGIN = 1, FUN = function(x) mean(x))
    
    # save the test predictions to the lasso list
    mars_list[[3]] <- test_pred
    
  } else if(!is.null(test) & family == "binomial"){
    
    # find the mode row wise
    test_pred <- apply(X = predictions, MARGIN = 1, FUN = function(x) Mode(x))
    
    # save the test predictions to the lasso list
    mars_list[[3]] <- test_pred
    
  } else if(!is.null(test) & family == "poisson"){
    
    # apply mean row wise
    test_pred <- apply(X = predictions, MARGIN = 1, FUN = function(x) mean(x))
    
    # save the test predictions to the lasso list
    mars_list[[3]] <- test_pred
    
  }
  
  # return the lasso list
  return(mars_list)
  
}
oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.