R/oss.seqVIF.R

Defines functions oss.seqVIF

Documented in oss.seqVIF

#' Perform Sequential Variable Inflation Factor Analysis
#'
#'Perform sequential Variable Inflation Factor analysis for a dataframe of predictor variables with dataframe columns representing predictor variable values
#'
#' @param cov_df data.frame* object where each column represent covariate values.
#' @param thresh numeric* object Perform sequential VIF until all covariates have VIF scores lower than this value
#' @param trace Boolean* object (default = FALSE). Should the results of each VIF be shown in the console?
#' @param show.R2.vals Boolean* object (default = FALSE) Should the VIF score be also described as an R squared value? (VIF = 1/1-R2)
#'
#' @return Returns a list of 4 objects: 1. Covariates_retained: First object is the column names of the covariates that are kept (i.e. have VIF scores lower than threshold).
#' 2. Covariates_removed: Second object is the column names of the covariates were removed (i.e., have VIF scores higher than threshold).
#' 3. VIF_removed: Third object is a summary of the VIF scores of each covariate that was removed. NA indicates scores fell below threshold and removal was not performed.
#' 4. VIF_all: Forth object is a complete report of each VIF score for each covariate for each time VIF was run.
#'
#' @export
#'
#' @examples
#' #Perform sequential VIF on an environmental raster stack
#' library(terra)
#' library(fmsb)
#'
#' #Generate autocorrelated raster layers from the Keene study area DEM
#' data(keene)
#' keene<- rast(keene)
#'
#' #Original DEM values
#' orig_pts <- terra::spatSample(x= keene, size=1000, na.rm=TRUE, method="random", as.df=FALSE)
#'
#' #Create values correlated with original DEM values
#' ras1 <- orig_pts + rnorm(orig_pts, mean=0, sd=5)
#' ras2 <- orig_pts + rnorm(orig_pts, mean=0, sd=5)
#' ras3 <- orig_pts + rnorm(orig_pts, mean=0, sd=5)
#' ras4 <- orig_pts + rnorm(orig_pts, mean=0, sd=10)
#' ras5 <- orig_pts + rnorm(orig_pts, mean=0, sd=10)
#'
#' df <- data.frame(orig_pts, ras1, ras2, ras3, ras4, ras5)
#'
#' #Create values with same mean as DEM values but not correlated
#' random_rasters <- NULL
#' for(i in 1:10){
#'   rand_ras <- rnorm(n=length(orig_pts), mean=mean(orig_pts), sd=10)
#'   random_rasters <- cbind(random_rasters, rand_ras)
#' }
#'
#' df <- cbind(df, random_rasters)
#'
#' #Run to perform sequential VIF analyses,
#' # removing the covariate with the highest VIF value,
#' # then repeating until threshold is hit
#'
#' vif_results <- oss.seqVIF(df, thresh=5, trace=FALSE, show.R2.vals=TRUE)

oss.seqVIF <- function(cov_df, thresh, trace=FALSE, show.R2.vals=FALSE){

  ###Load required library or throw error
  if(any(!'data.frame' %in% class(cov_df))){cov_df <- data.frame(cov_df)}

  ###Get initial vif value for each covariate and confirm that at least VIF is above threshold
  vif_init <- NULL
  for(covar in names(cov_df)){
    regressors <- names(cov_df)[names(cov_df) != covar]
    form <- stats::formula(paste(covar, '~', paste(regressors, collapse = '+')))
    vif_init <- rbind(vif_init, c(covar, fmsb::VIF(stats::lm(form, data = cov_df))))
  }
  max_vif <- max(as.numeric(vif_init[,2]), na.rm = TRUE)

  ###If no values are above threshold, use all covariates
  if(max_vif < thresh){
    message(paste("All variables have VIF <", thresh, ", max VIF = ", round(max_vif,2),
                  ". Returning table with VIF scores. Choose lower threshold to perform removal or keep all variables"))
    return(list(Covariates_retained=colnames(cov_df),
                VIF_all=vif_init))
    }

  ###If at least one value is above threshold, run sequential VIF
  else{

    in_dat <- cov_df #create copy to iterate over
    #Set empty vectors to store outputs
    names_kept_vec <- NULL; names_rem_vec <- NULL; vif_dfs <- NULL
    vif_rem_vals <- data.frame(Covariate = character(), VIF_at_removal = numeric()) #Setting as numeric doesn't really matter since it gets removed later

    #Backwards selection of explanatory variables, stops when all VIF values are below 'thresh'
    while(max_vif >= thresh){
    if(ncol(in_dat) <= 2){print("Removed all covariates but two, stopping"); break} #If not enough covariates to do linear model, stop - shouldn't this be 1?
      vif_vals <- NULL
      for(covar in names(in_dat)){
        regressors <- names(in_dat)[names(in_dat) != covar]
        form <- stats::formula(paste(covar, '~', paste(regressors, collapse = '+')))
        vif_vals <- rbind(vif_vals, c(covar, fmsb::VIF(stats::lm(form, data = in_dat))))
      }

      #Record max value
      max_vif <- max(as.numeric(vif_vals[,2]), na.rm = TRUE)
      #if(length(which(vif_vals[,2] == max_vif)) > 1){paste0("VIF scores tied, removing alphabetically")} #This should be incredibly rare/impossible
      max_row <- which(vif_vals[,2] == max_vif)[1]

      #We need this break so it doesn't remove the next variable below the threshold
      if(max_vif<thresh) break

      #Print output of each iteration
      if(trace){print(vif_vals); cat('\n'); cat('removed: ', vif_vals[max_row,1], max_vif,'\n\n')}

      #Record names you are removing
      names_rem_vec = append(names_rem_vec, vif_vals[max_row,1])
      #Record the VIF value that is being removed
      vif_rem_vals <- rbind(vif_rem_vals, vif_vals[max_row,])

      #Remove covariate from dataframe
      in_dat <- in_dat[,!names(in_dat) == vif_vals[max_row,1]]

      #Store VIF table
      vif_dfs <- c(vif_dfs, list(vif_vals))
    }

    #Record names you are keeping
    names_kept_vec <- names(in_dat)

    #Update columns of vif_rem_vals with covariates that were not removed during VIF because either:
    #they were below threshold or because linear model breaks down with 2 covariates
    colnames(vif_rem_vals) <- c("Covariate", "VIF_score_at_removal")
    vif_rem_vals[,2] <- as.numeric(vif_rem_vals[,2])

    below_rows = data.frame(Covariate = as.character(vif_dfs[[length(vif_dfs)]][,1]),
                            VIF_score_at_removal = as.numeric(vif_dfs[[length(vif_dfs)]][,2])
                            )
    below_rows = below_rows[order(-below_rows$VIF_score_at_removal),][-1,] #remove first entry since it's already been removed
    below_rows[,2] = NA

    vif_rem_vals <- rbind(vif_rem_vals, below_rows)

    #If you want R2 and pearson r values. Not sure how to best interpret these
    if(show.R2.vals){vif_rem_vals$R2 <- 1 - 1/(vif_rem_vals$VIF_score_at_removal)
    vif_rem_vals$r <- sqrt(vif_rem_vals$R2)}

    #This list is ordered from first VIF to last VIF
    #(i.e., first variables to be removed and first VIF tables run are at the start)
    return(list(Covariates_retained=names_kept_vec,
                Covariates_removed=names_rem_vec,
                VIF_removed=vif_rem_vals,
                VIF_all=vif_dfs))

  }
}
newdale/onsoilsurvey documentation built on Jan. 5, 2024, 1:35 a.m.