R/remove_collinearity.R

Defines functions vif.step vif.cor vif vif2 maxCor detect_collinearity sample_spatial_obj

#' @name sample_spatial_obj
#' @param predictors, a raster, either from raster or terra format
#' @param method, The correlation method to be used:"vif.cor", "vif.step", "pearson", "spearman"
#' or "kendall". "vif.cor" and "vif.step" use the Variance Inflation factor and the pearson correlation, more details
#' here https://www.rdocumentation.org/packages/usdm/versions/1.1-18/topics/vif. If your variables are skewed or have outliers 
#' (e.g. when working with precipitation variables) you should favour the Spearman or Kendall methods.
#' @param method_cor_vif, the correlation method to be used with "vif.cor" method. "pearson", "spearman"
#' or "kendall".
#' @param mask, a 
#' @param sample, boolean value. If TRUE, sample a number of points equal to nb_points before evaluating collinearity
#' @param nb_points, a numeric value. Only used if sample.points = TRUE. The number of sampled points from the raster.
#' @param cutoff_cor, a numeric value corresponding to the maximum threshold of linear correlation (for "vif.cor", "pearson", "spearman").
#' @param cutoff_vif, a numeric value corresponding to the maximum threshold of VIF (only used for method "vif.step").
#' @param export, boolean value. If TRUE, the list of selected variables and the correlation matrix will be saved.
#' @param title_export, a string. If export is TRUE, title of the correlation matrix.
#' @param path, a string. If export is TRUE, path to save the corrrelation matrix and list of uncorrelated variables.
#' @return a raster stack of variables not intercorrelated
#' @import terra raster virtualspecies

sample_spatial_obj <- function(obj_to_sample, nb_points = 5000) {
  names_layers <- names(obj_to_sample)

  if(inherits(obj_to_sample, "RasterStack")) {
    
    if (raster::ncell(obj_to_sample) > nb_points) {
      env_df <- raster::sampleRandom(obj_to_sample, size = nb_points, na.rm = TRUE)
      
    } else {
      env_df <- raster::getValues(obj_to_sample) 
    }

  } else if (inherits(obj_to_sample, "SpatRaster")) {
    if (terra::ncell(obj_to_sample) > nb_points) {
      env_df <- terra::spatSample(obj_to_sample, size = nb_points, na.rm = TRUE,
                                  method="random", replace=FALSE) %>% as.matrix()
      
    } else {
      env_df <- terra::values(obj_to_sample, matrix = T)
      
    }
    
  } else if (inherits(obj_to_sample, "data.frame")) {
    if (nrow(env_df) > nb_points) {
      obj_to_sample <- obj_to_sample[sample(1:nrow(obj_to_sample), nb_points), ]
    
  } else {
    env_df <- obj_to_sample
  }
  } else if (inherits(obj_to_sample, "cube")) {
    env_df <- stacatalogue::extract_gdal_cube(obj_to_sample, n_sample = nb_points, simplify = T)
  }
    env_df <- setNames(data.frame(env_df), names_layers)
   message(paste0(nrow(env_df), " points randomly selected (excluding NA's)."))
  return(env_df)
}

detect_collinearity <- function(env_df,
                                method = "vif.cor",
                                method_cor_vif = NULL,
                                cutoff_cor = 0.7,
                                cutoff_vif = 10,
                                export = FALSE,
                                title_export = "",
                                path = NULL) {
  

  #remove NA's
  comp <-  complete.cases(env_df)
  env_df <- env_df[comp, ]
  
  if (method == "vif.cor") {
    
    excluded <- vif.cor(env_df, th = cutoff_cor,
                        method = method_cor_vif,
                        maxobservations = nrow(env_df))
    retained <- setdiff(names(env_df), excluded)
  }
  
  if (method == "vif.step") {
    
    excluded <- vif.step(env_df, th = cutoff_vif, maxobservations = nrow(env_df))
    retained <- setdiff(names(env_df), excluded)
  }
  
  
  if(method %in% c("pearson", "spearman", "kendall")) {

        # Correlation based on Pearson
    cor.matrix <- 1 - abs(stats::cor(env_df, method = method))
    
    # Transforming the correlation matrix into an ascendent hierarchical classification
    dist.matrix <- stats::as.dist(cor.matrix)
    ahc <- stats::hclust(dist.matrix, method = "complete")
    groups <- stats::cutree(ahc, h = 1 - cutoff_cor)
    if(length(groups) == max(groups)){ 
      retained <- names(env_df)
    } else { 
      retained <- NULL
      for (i in 1:max(groups))
      {
        retained <- c(retained, sample(names(groups[groups == i]), 1))
      }
    } 
    excluded <- setdiff(names(env_df), retained)

  }

  nb_excluded <- length(excluded)
  
  # If some variables excluded
  if (length(nb_excluded) > 0) {
    
    if (method %in% c("vif.cor", "vif.step" )) {
      message(sprintf("%s variables excluded with VIF threshold = %s",
                      nb_excluded, cutoff_vif))
    } else {
      message(sprintf("%s variables excluded with method %s and cutoff_cor = %s", 
                      nb_excluded, method, cutoff_cor))
      
    }
    
    # If no variables excluded
  } else {  
    if (method %in% c("vif.cor", "vif.step" )) {
      message(sprintf("No variables excluded with VIF threshold = %s",
                      cutoff_vif))
    } else {
      message(sprintf("No variables excluded with method %s and cutoff_cor = %s", 
                      method, cutoff_cor))
      
    }
  }
  
  
  if (export) {
    if (is.null(path)) {
      stop("You must provide a path to export the correlation matrix.")
    }
    if (method == "vif.cor") {
      method <- method_cor_vif
    } else if (method == "vif.step") {
      method <- "pearson"
    }
    cm <- cor(env_df, method = method, use =  "complete.obs")
    
    png(file = paste(path, 'cm_plot.png', sep = "/"))
    corrplot::corrplot(cm, tl.col = "black",
             title = title_export)
    dev.off()
    message("Correlation plot saved.")
    
    write.table(retained, file = paste(path, "retained_predictor.csv", sep = "/"), row.names = F, col.names = F, sep = ";")
    message("List of uncorrelated variables saved.")
  }
  return(retained)
  
}


# from usdm package (https://github.com/cran/usdm/blob/master/R/vif.R)
maxCor <- function(k) {
  k <- abs(k)
  n <- nrow(k)
  for (i in 1:n) k[i:n, i] <- NA
  w <- which.max(k)
  c(rownames(k)[((w %/% nrow(k)) + 1)], colnames(k)[w %% nrow(k)])
}

# from usdm package (https://github.com/cran/usdm/blob/master/R/vif.R)
vif2 <- function(y, w) {
  z <- rep(NA, length(w))
  names(z) <- colnames(y)[w]
  for (i in 1:length(w)) {
    z[i] <- 1 / (1 - summary(lm(as.formula(paste(colnames(y)[w[i]], "~.", sep = "")), data = y))$r.squared)
  }
  return(z)
}

# from usdm package (https://github.com/cran/usdm/blob/master/R/vif.R)
vif <- function(y) {
  z <- rep(NA, ncol(y))
  names(z) <- colnames(y)
  for (i in 1:ncol(y)) {
    z[i] <- 1 / (1 - summary(lm(y[, i] ~ ., data = y[-i]))$r.squared)
  }
  return(z)
}

# adapted and corrected from usdm package (https://github.com/cran/usdm/blob/master/R/vif.R)

vif.cor <- function(x, th = 0.9, method = "pearson", maxobservations = 5000) {
  
  if (is.null(method) || !method %in% c("pearson", "kendall", "spearman")) method <- "pearson"
  x <- as.data.frame(x)
  
  LOOP <- TRUE
  if (nrow(x) > maxobservations) x <- x[sample(1:nrow(x), maxobservations), ]
  x <- na.omit(x)
  exc <- c()
  while (LOOP) {
    xcor <- abs(cor(x, method = method))
    mx <- maxCor(xcor)
    if (xcor[mx[1], mx[2]] >= th) {
      w1 <- which(colnames(xcor) == mx[1])
      w2 <- which(rownames(xcor) == mx[2])
      v <- vif2(x, c(w1, w2))
      ex <- mx[which.max(v[mx])]
      exc <- c(exc, ex)
      x <- as.data.frame(x[, -which(colnames(x) == ex)])
      if (ncol(x) == 1) {
        LOOP <- FALSE
      }
    } else {
      LOOP <- FALSE
    }
  }
  exc
}

# adapted and corrected from usdm package (https://github.com/cran/usdm/blob/master/R/vif.R)
vif.step <- function(x, th = 10, maxobservations = 5000) {
  LOOP <- TRUE
  x <- as.data.frame(x)
  x <- na.omit(x)
  if (nrow(x) > maxobservations) x <- x[sample(1:nrow(x), maxobservations), ]
  exc <- c()
  while (LOOP) {
    v <- vif(x)
    if (v[which.max(v)] >= th) {
      ex <- names(v[which.max(v)])
      exc <- c(exc, ex)
      x <- x[, -which(colnames(x) == ex)]
      if (ncol(x) == 1) {
        LOOP <- FALSE
      }
    } else {
      LOOP <- FALSE
    }
  }
  
  exc
}
ReseauBiodiversiteQuebec/sdm-pipeline documentation built on June 23, 2022, 9:10 p.m.