R/filter_vif.R

Defines functions filter_vif

Documented in filter_vif

#' @title Filters out equations with high multicollinearity
#'
#' @description This function takes a dataframe with several models and
#' calculates the maximum Variance Inflation Factor (VIF) for a given
#' model. And either filters out the ones with high collinearity or it
#' flags them accordingly
#'
#' @param all_forms A data frame generated by \code{\link{make_models}}
#' @param env_data A dataset with the variables described in all_froms
#' @param ncores An integer specifying the number of cores to use for parallel processing
#' @param filter logical, if TRUE it filters out the models with a maximum VIF of high or higher, if FALSE it generates a new column called collinearity, wich will
#' @param verbose logical, defaults TRUE, sends messages about processing times
#' @return A data.frame with the models, fitering out the ones with high collinearity or flagginf them.
#' @importFrom parallel makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom dplyr bind_rows filter mutate
#' @importFrom stringr str_replace_all
#' @importFrom stats rnorm lm as.formula complete.cases
#'
#' @examples
#'\donttest{
#' library(vegan)
#' data(dune)
#' data(dune.env)
#' AllModels <- make_models(vars = c("A1", "Moisture", "Manure"))
#'
#' filter_vif(all_forms = AllModels,
#'            env_data = dune.env)
#'}
#' @export

filter_vif <- function(all_forms,
                       env_data,
                       ncores = 2,
                       filter = TRUE,
                       verbose = TRUE){
  max_vif <- x <- NULL
  meta_data <- all_forms
  meta_data$max_vif <- NA
  if(!filter){
    meta_data$collinearity <- NA
  }

  # Check for missing values
  missing_rows <- !complete.cases(env_data)

  if (any(missing_rows)) {
    if(verbose){
      # Print message about missing rows and columns
      message(sprintf("Removing %d rows with missing values\n", sum(missing_rows)))
      message("Columns with missing values: ")
      message(names(env_data)[colSums(is.na(env_data)) > 0], sep = ", ")
    }
  }

  # Filter out missing rows
  new_env_data <- env_data[complete.cases(env_data), ]

  cl <- parallel::makeCluster(ncores)
  doParallel::registerDoParallel(cl)

  Fs <- foreach(x = 1:nrow(meta_data), .packages = c("dplyr", "AICcPermanova", "stringr"), .combine = bind_rows) %dopar% {

    Response = new_env_data
    Response$y <- rnorm(n = nrow(Response))
    gc()

    Temp <- meta_data[x,]


    Temp$max_vif <- tryCatch(
      expr = VIF(lm(as.formula(stringr::str_replace_all(Temp$form[1], "Distance ", "y")), data = Response)),
      error = function(e) NA
    )
    Temp
  }
  parallel::stopCluster(cl)

  if(filter){
    Fs <- Fs |>
      dplyr::filter(max_vif < 5)
  }

  if(!filter){
    Fs <- Fs |>
      dplyr::mutate(collinearity = ifelse(max_vif < 5, "low", "high"))
  }

  return(Fs)
}

Try the AICcPermanova package in your browser

Any scripts or data that you put into this service are public.

AICcPermanova documentation built on April 11, 2023, 6:01 p.m.