R/fit_models.R

Defines functions VIF fit_models

Documented in fit_models VIF

#' @title Fit PERMANOVA models and arrange by AICc
#'
#' @description This function fits PERMANOVA models for all combinations of variables in a given dataset, and arranges the models by Akaike Information Criterion (AICc) score. The function also calculates the maximum variance inflation factor (max_vif) for each model.
#'
#' @param all_forms A data frame generated by \code{\link{make_models}}
#' @param veg_data A dataset with vegetation presence absense or abundance data
#' @param env_data A dataset with the variables described in all_froms
#' @param ncores An integer specifying the number of cores to use for parallel processing
#' @param log logical if true, a log file will be generated
#' @param verbose logical, defaults TRUE, sends messages about processing times
#' @param logfile the text file that will be generated as a log
#' @param multiple after how many loops to write a log file
#' @param method method for distance from \code{\link{vegdist}}
#' @param strata a block variable similar to the use in \code{\link{adonis2}}
#'
#' @return A data.frame with fitted models arranged by AICc, including the formula used, the number of
#'         explanatory variables, R2, adjusted R2, and the AICc and max VIF.
#'
#'
#' @importFrom parallel makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach %dopar%
#' @importFrom vegan vegdist adonis2
#' @importFrom dplyr bind_rows bind_cols select filter arrange
#' @importFrom broom tidy
#' @importFrom tidyr pivot_longer
#' @importFrom stringr str_replace_all
#' @importFrom stats rnorm lm as.formula complete.cases
#' @examples
#'
#' \donttest{
#' library(vegan)
#' data(dune)
#' data(dune.env)
#'
#' AllModels <- make_models(vars = c("A1", "Moisture", "Manure"))
#'
#' fit_models(all_forms = AllModels,
#'            veg_data = dune,
#'            env_data = dune.env)
#' }
#'
#' @references
#' Anderson, M. J. (2001). A new method for non-parametric multivariate analysis of variance. Austral Ecology, 26(1), 32-46.
#' https://doi.org/10.1111/j.1442-9993.2001.01070.pp.x

#'
#' @export

fit_models <- function(all_forms,
                       veg_data,
                       env_data,
                       method = "bray",
                       ncores = 2,
                       log = TRUE,
                       logfile = "log.txt",
                       multiple = 100,
                       strata = NULL,
                       verbose = FALSE){
  AICc <- R2 <- term <- x <- NULL
  if(log){
    if(file.exists(logfile)){
      file.remove(logfile)
    }
  }

  meta_data <- all_forms
  if(!("max_vif" %in% colnames(meta_data))){
    meta_data$max_vif <- NA
  }
  vegetation_data = veg_data

  # Check for missing values
  missing_rows <- !complete.cases(env_data)

  if (any(missing_rows)) {
    if(verbose){
      # Print message about missing rows and columns
      message(sprintf("Removing %d rows with missing values\n", sum(missing_rows)))
      message("Columns with missing values: ")
      message(names(env_data)[colSums(is.na(env_data)) > 0], sep = ", ")
    }
  }

  # Filter out missing rows
  new_env_data <- env_data[complete.cases(env_data), ]
  vegetation_data <- vegetation_data[complete.cases(env_data), ]


  cl <- parallel::makeCluster(ncores)
  doParallel::registerDoParallel(cl)

  Distance <- vegan::vegdist(vegetation_data, method = method)

  Fs <- foreach(x = 1:nrow(meta_data), .packages = c("vegan", "dplyr", "AICcPermanova", "tidyr", "broom"), .combine = bind_rows, .export = c("Distance")) %dopar% {

      Response = new_env_data
      Response$y <- rnorm(n = nrow(Response))
      gc()

      Temp <- meta_data[x,]

      if(is.null(strata)){
        Model <- try(vegan::adonis2(as.formula(Temp$form[1]), data = Response, by = "margin"))
      }

      if(!is.null(strata)){
        # Convert strata variable to factor
        strata_factor <- factor(Response[[strata]])
        Model <- try(with(Response, vegan::adonis2(as.formula(Temp$form[1]), data = Response, by = "margin", strata = strata_factor)), silent = TRUE)
      }



      Temp <- tryCatch(
        expr = cbind(Temp, AICcPermanova::AICc_permanova2(Model)),
        error = function(e) NA
      )


    if(is.na(Temp$max_vif)){
      Temp$max_vif <- tryCatch(
        expr = VIF(lm(as.formula(stringr::str_replace_all(Temp$form[1], "Distance ", "y")), data = Response)),
        error = function(e) NA
      )

    }

      Rs <- tryCatch(
        {
          tidy_model <- broom::tidy(Model)
          if (inherits(tidy_model, "try-error")) {
            stop("Error occurred in broom::tidy(Model)")
          }
          tidy_model |>
            dplyr::filter(!(term %in% c("Residual", "Total"))) |>
            dplyr::select(term, R2) |>
            tidyr::pivot_wider(names_from = term, values_from = R2)
        },
        error = function(e) {
          message("Error: ", conditionMessage(e))
          NULL
        }
      )


      if(log){
        if((x %% multiple) == 0){
          sink(logfile, append = TRUE)
          cat(paste("finished", x, "number of models", Sys.time(), "of",  nrow(meta_data)))
          cat("\n")
          sink()
        }

      }

      Temp <- bind_cols(Temp, Rs)
      Temp
    }
  parallel::stopCluster(cl)
  Fs <- Fs |>
    dplyr::arrange(AICc)
  return(Fs)
}

#' Get Maximum Variance Inflation Factor (VIF) from a Model
#'
#' This function calculates the maximum Variance Inflation Factor (VIF) for a given model.
#' The VIF is a measure of collinearity among predictor variables within a regression model.
#' It quantifies how much the variance of an estimated regression coefficient is increased due to collinearity.
#' A VIF of 1 indicates no collinearity, while values above 1 indicate increasing levels of collinearity.
#' A VIF of 5 or greater is often considered high, indicating a strong presence of collinearity.
#'
#' @param model A regression model, such as those created by lm, glm, or other similar functions.
#'
#' @return The maximum VIF value.
#'
#' @references
#' - Belsley, D. A., Kuh, E., & Welsch, R. E. (1980). Regression Diagnostics: Identifying Influential Data and Sources of Collinearity. John Wiley & Sons.
#' - Kutner, M. H., Nachtsheim, C. J., Neter, J., & Li, W. (2004). Applied Linear Statistical Models. McGraw-Hill/Irwin.
#' - O'Brien, R. M. (2007). A caution regarding rules of thumb for variance inflation factors. Quality & Quantity, 41(5), 673-690.
#'
#' @importFrom car vif
#'
#' @keywords collinearity
#'
#' @export

VIF <- function(model) {
      tryCatch({
         vif <- car::vif(model)
         max(vif)
       }, error = function(e) {
         if (grepl("aliased coefficients", e$message)) {
           20000
         } else if (grepl("model contains fewer than 2 terms", e$message)) {
           0
         } else {
           stop(e)
         }
       })
     }

Try the AICcPermanova package in your browser

Any scripts or data that you put into this service are public.

AICcPermanova documentation built on April 11, 2023, 6:01 p.m.