## R/SDM_GEN_MAXENT_FUNCTIONS.R


#########################################################################################################################
###################################  FUNCTIONS FOR RUNNING SDM ANALYSIS ---- ############################################
#########################################################################################################################






#' @title Run SDM analyses for 'species with data' format.
#' 
#' @description This function takes a data frame of all taxa records,
#' and runs a specialised maxent analysis for each taxon.
#' It uses the dismo package and the rmaxent package (https://github.com/johnbaums/rmaxent).
#' @param taxa_list          Character vector - the taxa to run maxent models for
#' @param taxa_level         Character string - the taxonomic level to run maxent models for
#' @param sdm_df             SpatialPointsDataFrame. Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param sdm_predictors     Character vector - names of the environmental predictor variables to include
#' @param maxent_dir         Character string - The file path used for saving the maxent output
#' @param bs_dir             Character string - The file path used for saving the backwards selection maxent output
#' @param backwards_sel      Logical - Run backwards selection using the maxent models (T/F)?
#' @param template_raster    RasterLayer - Empty raster with analysis extent (global), res (1km) and projection (Mollweide, ESRI 54009)
#' @param cor_thr            Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr            Numeric - The min allowable percent variable contribution
#' @param k_thr              Numeric - The min number of variables to be kept in the model
#' @param min_n              Numeric - The min number of records for running maxent
#' @param max_bg_size        Numeric - The max number of background points to keep
#' @param background_buffer_width Numeric - The max distance (km) from occurrence points within which background points are selected
#' @param feat_save           Logical - Save features of the occ and bg data (T/F)?
#' @param features           Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates         Numeric - The number of replicates to use
#' @param responsecurves     Logical - Save response curves of the maxent models (T/F)?
#' @param poly_path          Character string - file path to feature polygon layer
#' @param epsg               Numeric - EPSG code of the coord ref system, translated into WKT format
#' @export run_sdm_analysis_no_crop
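#' @examples
#' ## A minimal usage sketch (not run). The input objects, paths and predictor
#' ## names below are hypothetical placeholders, not files shipped with this code.
#' \dontrun{
#' run_sdm_analysis_no_crop(taxa_list               = c('Acacia dealbata'),
#'                          taxa_level              = 'searchTaxon',
#'                          sdm_df                  = sdm_data_spdf,
#'                          sdm_predictors          = c('Annual_mean_temp', 'Annual_precip'),
#'                          maxent_dir              = 'output/maxent/full_models',
#'                          bs_dir                  = 'output/maxent/back_sel_models',
#'                          backwards_sel           = TRUE,
#'                          template_raster         = template_raster_1km_mol,
#'                          cor_thr                 = 0.8,
#'                          pct_thr                 = 5,
#'                          k_thr                   = 4,
#'                          min_n                   = 20,
#'                          max_bg_size             = 70000,
#'                          background_buffer_width = 200000,
#'                          features                = 'lpq',
#'                          feat_save               = TRUE,
#'                          replicates              = 5,
#'                          responsecurves          = TRUE,
#'                          poly_path               = 'data/world_polygon.gpkg',
#'                          epsg                    = 4326)
#' }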
run_sdm_analysis_no_crop = function(taxa_list,
                                    taxa_level,
                                    sdm_df,
                                    sdm_predictors,
                                    maxent_dir,
                                    bs_dir,
                                    backwards_sel,
                                    template_raster,
                                    cor_thr,
                                    pct_thr,
                                    k_thr,
                                    min_n,
                                    max_bg_size,
                                    background_buffer_width,
                                    features,
                                    feat_save,
                                    replicates,
                                    responsecurves,
                                    poly_path,
                                    epsg) {
  
  
  ## Convert to SF object for selection - inefficient
  message('Preparing spatial data for SDMs')
  
  poly <- st_read(poly_path) %>% 
    st_transform(., st_crs(epsg))
  
  ## Loop over all the taxa
  ## taxa <- sort(taxa_list)[2]
  lapply(taxa_list, function(taxa){
    
    ## Skip the taxa if the directory already exists, before the loop
    # outdir <- maxent_dir
    
    ## Could also check the folder size, so folders with no contents aren't skipped eg
    ## && sum(file.info(dir_name)$size) < 1000 (EG 1MB)
    dir_name = file.path(maxent_dir, gsub(' ', '_', taxa))
    
    if(!dir.exists(dir_name)) {
      message('Maxent not yet run for ', taxa, ' - running')
      
      ## Create the directory for the taxa in progress,
      ## so other parallel runs don't run the same taxa
      dir.create(dir_name)
      file.create(file.path(dir_name, "in_progress.txt"))
      
      ## Print the taxa being processed to screen
      if(taxa %in% sdm_df[[taxa_level]]) {
        message('Doing ', taxa)
        
        ## Subset the records to only the taxa being processed - in the searched taxa or returned
        occurrence <- sdm_df %>% .[.[[taxa_level]] %in% taxa, ]
        message('Using ', nrow(occurrence), ' occ records from ', unique(occurrence$SOURCE))
        
        ## Now get the background points. These can come from any taxa, other than the modelled taxa.
        ## However, they should be limited to the same SOURCE as the occ data
        background <- sdm_df %>% .[!.[[taxa_level]] %in% taxa, ]
        message('Using ', nrow(background), ' background records from ', unique(background$SOURCE))
        gc()
        
        ## Finally fit the models using FIT_MAXENT_TARG_BG. Also use tryCatch to skip any exceptions
        tryCatch(
          fit_maxent_targ_bg_back_sel_no_crop(occ                     = occurrence,
                                              bg                      = background,
                                              sdm_predictors          = sdm_predictors,
                                              taxa                    = taxa,
                                              outdir                  = maxent_dir,
                                              bsdir                   = bs_dir,
                                              backwards_sel           = backwards_sel,
                                              cor_thr                 = cor_thr,
                                              pct_thr                 = pct_thr,
                                              k_thr                   = k_thr,
                                              
                                              template_raster         = template_raster,
                                              min_n                   = min_n,
                                              max_bg_size             = max_bg_size,
                                              background_buffer_width = background_buffer_width,
                                              features                = features,
                                              feat_save               = feat_save,
                                              replicates              = replicates,
                                              responsecurves          = responsecurves,
                                              poly                    = poly),
          
          
          ## Save error message
          error = function(cond) {
            
            ## How to put the message into the file?
            file.create(file.path(dir_name, "sdm_failed.txt"))
            message(taxa, ' failed')
            cat(cond$message, file = file.path(dir_name, "sdm_failed.txt"))
            warning(taxa, ': ', cond$message)
            
          })
        
      } else {
        message(taxa, ' skipped - no data.') ## This branch catches taxa with no records in sdm_df
      }
      
      ## now add a file to the dir to denote that it has completed
      file.create(file.path(dir_name, "completed.txt"))
      
    } else {
      message(taxa, ' sdm already exists, skipped') ## Skip taxa whose output folder already exists
    }
    
  })
}




#' @title Maxent Backwards selection for 'species with data' format, without biogeographic cropping. 
#' @description 
#' This function takes a data frame of all taxa records,
#' and runs a specialised maxent analysis for each taxon.
#' It uses the rmaxent package https://github.com/johnbaums/rmaxent
#' It assumes that the input df is that returned by the prepare_sdm_table function
#' 
#' @param occ                SpatialPointsDataFrame - Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param bg                 SpatialPointsDataFrame - Spdf of candidate background points
#' @param sdm_predictors     Character vector - names of the environmental predictor variables to include
#' @param taxa               Character string - the taxon being modelled
#' @param outdir             Character string - The file path used for saving the maxent output
#' @param bsdir              Character string - The file path used for saving the backwards selection maxent output
#' @param backwards_sel      Logical - Run backwards selection using the maxent models (T/F)?
#' @param template_raster    RasterLayer - Empty raster with analysis extent (global), res (1km) and projection (Mollweide, ESRI 54009)
#' @param cor_thr            Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr            Numeric - The min allowable percent variable contribution
#' @param k_thr              Numeric - The min number of variables to be kept in the model
#' @param min_n              Numeric - The min number of records for running maxent
#' @param max_bg_size        Numeric - The max number of background points to keep
#' @param background_buffer_width Numeric - The max distance (km) from occurrence points within which background points are selected
#' @param shapefiles         Logical - Save shapefiles of the occ and bg data (T/F)?
#' @param feat_save          Logical - Save geopackages of the occ and bg data (T/F)?
#' @param features           Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates         Numeric - The number of replicates to use
#' @param responsecurves     Logical - Save response curves of the maxent models (T/F)?
#' @param poly               sf object - polygon layer used when plotting the occurrence and background data
#' @export fit_maxent_targ_bg_back_sel_no_crop
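#' @examples
#' ## Normally called from run_sdm_analysis_no_crop(); a direct-call sketch
#' ## (not run), with hypothetical inputs:
#' \dontrun{
#' fit_maxent_targ_bg_back_sel_no_crop(occ = occ_spdf, bg = bg_spdf,
#'                                     sdm_predictors = c('Annual_mean_temp', 'Annual_precip'),
#'                                     taxa = 'Acacia dealbata',
#'                                     outdir = 'output/maxent/full_models',
#'                                     bsdir = 'output/maxent/back_sel_models',
#'                                     backwards_sel = TRUE,
#'                                     cor_thr = 0.8, pct_thr = 5, k_thr = 4,
#'                                     template_raster = template_raster_1km_mol,
#'                                     min_n = 20, max_bg_size = 70000,
#'                                     background_buffer_width = 200000,
#'                                     shapefiles = FALSE,
#'                                     features = 'lpq', feat_save = TRUE,
#'                                     replicates = 5, responsecurves = TRUE,
#'                                     poly = world_poly_sf)
#' }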
fit_maxent_targ_bg_back_sel_no_crop <- function(occ,
                                                bg,
                                                sdm_predictors,
                                                taxa,
                                                outdir,
                                                bsdir,
                                                cor_thr,
                                                pct_thr,
                                                k_thr,
                                                backwards_sel,
                                                template_raster,
                                                min_n,
                                                max_bg_size,
                                                background_buffer_width,
                                                shapefiles,
                                                features,
                                                feat_save,
                                                replicates,
                                                responsecurves,
                                                poly) {
  
  ## First, stop if the output directory does not exist
  ## (matching the check in the other fitting functions)
  if(!file.exists(outdir)) stop('outdir does not exist :(', call. = FALSE)
  
  outdir_sp <- file.path(outdir, gsub(' ', '_', taxa))
  bsdir_sp  <- file.path(bsdir,  gsub(' ', '_', taxa))
  
  ## If the taxon folder doesn't exist yet, create it, then split out the features
  if(!file.exists(outdir_sp)) dir.create(outdir_sp)
  features <- unlist(strsplit(features, ''))
  
  ## Make sure user features are allowed: don't run the model if the
  ## features have been incorrectly specified in the main argument
  ## l: linear
  ## p: product
  ## q: quadratic
  ## h: hinge       ## disabled for this analysis
  ## t: threshold   ## disabled for this analysis
  if(length(setdiff(features, c('l', 'p', 'q', 'h', 't'))) > 0)
    stop("features must be a vector of one or more of 'l', 'p', 'q', 'h', and 't'.")
  
  ## Create a buffer of background_buffer_width around the occurrence points,
  ## used to constrain where background points can be drawn from
  message('get distinct occurrence records')
  buffer <- aggregate(gBuffer(occ, width = background_buffer_width, byid = TRUE))
  
  ## Get unique cell numbers for taxa occurrences
  template_raster_spat <- terra::rast(template_raster)
  occ_sf               <- occ %>% st_as_sf()
  raster_extent        <- raster::extent(template_raster)
  cell_size            <- xres(template_raster)
  
  
  # xcoord_snapped_to_cell = xmin + floor ((Xcoord - xmin) / cellsize) * cellsize + cellsize/2
  # ycoord_snapped_to_cell = ymin + floor ((Ycoord - ymin) / cellsize) * cellsize + cellsize/2
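  ## A quick worked check of the snapping formula (hypothetical numbers):
  ## with xmin = 0 and cellsize = 1000, a point at X = 1234 snaps to
  ## 0 + floor((1234 - 0) / 1000) * 1000 + 1000/2 = 1500, the centre of its cell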
  
  occX      <- raster_extent[1] + floor((occ_sf$X - raster_extent[1]) / cell_size ) * cell_size  + cell_size /2
  occY      <- raster_extent[2] + floor((occ_sf$Y - raster_extent[2]) / cell_size ) * cell_size  + cell_size /2
  occ_sf$X  <- occX
  occ_sf$Y  <- occY
  occ_cells <- occ_sf %>% dplyr::distinct(., X, Y, .keep_all = TRUE)
  
  message(nrow(occ_cells), ' occurrence records (unique cells).')
  ## Skip taxa that have fewer than the minimum number of records (min_n, e.g. 20)
  if(nrow(occ_cells) > min_n) {
    
    message('get distinct background records')
    bg_buffer   <- bg[buffer, ] %>% st_as_sf()
    
    ## If there are very many candidate background points, thin them to
    ## roughly 100,000 by taking a regular sequence of rows
    if(nrow(bg_buffer) > 100000) {
      sample_row <- nrow(bg_buffer)/100000
      bg_buffer  <- bg_buffer[seq(1, nrow(bg_buffer), by = sample_row), ]
    }
  
    bgX         <- raster_extent[1] + floor((bg_buffer$X - raster_extent[1]) / cell_size ) * cell_size  + cell_size /2
    bgY         <- raster_extent[2] + floor((bg_buffer$Y - raster_extent[2]) / cell_size ) * cell_size  + cell_size /2
    bg_buffer$X <- bgX
    bg_buffer$Y <- bgY
    bg_cells    <- bg_buffer %>% dplyr::distinct(X, Y, .keep_all = TRUE)

    ## The background records were already subset to the buffered polygon above
    message(taxa, ' creating background cells')

    ## Unique cells have already been selected; in this version the background
    ## cells are NOT intersected with Koppen zones
    message(taxa, ' Do not intersect background cells with Koppen zones')
    message('country poly is a ', class(poly))
    
    ## Now save an image of the background points
    ## This is useful to quality control the models
    save_name = gsub(' ', '_', taxa)
    
    ## Then save the occurrence points
    png(sprintf('%s/%s/%s_%s.png', outdir, save_name, save_name, "buffer_occ"),
        16, 10, units = 'in', res = 300)
    
    raster::plot(st_geometry(poly), 
                 main   = paste0('Occurrence SDM records for ', taxa))
    
    raster::plot(buffer,                 add = TRUE, col = "red")
    raster::plot(st_geometry(occ_cells), add = TRUE, col = "blue")
    dev.off()
    gc()
    
    ## Reduce background sample, if it's larger than max_bg_size
    if (nrow(bg_cells) > max_bg_size) {
      
      message(nrow(bg_cells), ' target taxa background records for ', taxa,
              ', reduced to random ', max_bg_size, ' using random points from :: ', unique(bg_cells$SOURCE))
      bg.samp <- bg_cells[sample(nrow(bg_cells), max_bg_size), ]
      
    } else {
      ## If there are fewer bg points than max_bg_size, just keep all of them
      message(nrow(bg_cells), ' target taxa background records for ', taxa,
              ', using all points from :: ', unique(bg_cells$SOURCE))
      bg.samp <- bg_cells
    }
    
    ## Now save the buffer, the occ and bg points as shapefiles
    if(feat_save) {
      
      suppressWarnings({
        message(taxa, ' writing occ and bg geopackages')
        
        st_write(buffer %>% st_as_sf(), 
                 dsn    = paste0(outdir_sp, '/', save_name, '_maxent_points.gpkg'), 
                 layer  = paste0(save_name, '_buffer'), 
                 quiet  = TRUE, 
                 append = FALSE)
        
        st_write(bg.samp %>% st_as_sf(),   
                 dsn    = paste0(outdir_sp, '/', save_name, '_maxent_points.gpkg'), 
                 layer  = paste0(save_name, '_samples'), 
                 quiet  = TRUE, 
                 append = FALSE)
        
        st_write(occ_cells,       
                 dsn    = paste0(outdir_sp, '/', save_name, '_maxent_points.gpkg'), 
                 layer  = paste0(save_name, '_occurrence'), 
                 quiet  = TRUE, 
                 append = FALSE)
      })
    }
    
    ## Also save the background and occurrence points as .rds files.
    saveRDS(bg.samp,   file.path(outdir_sp, paste0(save_name, '_bg.rds')))
    saveRDS(occ_cells, file.path(outdir_sp, paste0(save_name, '_occ.rds')))
    
    ## SWD = taxa with data. Now sample the environmental
    ## variables used in the model at all the occ and bg points
    swd_occ    <- occ_cells[, sdm_predictors]
    swd_occ_sp <- as_Spatial(swd_occ)
    saveRDS(swd_occ, file.path(outdir_sp, paste0(save_name,'_occ_swd.rds')))
    
    swd_bg     <- bg.samp[, sdm_predictors]
    saveRDS(swd_bg, file.path(outdir_sp, paste0(save_name, '_bg_swd.rds')))
    gc()
    
    ## Now combine the occurrence and background data
    swd <- rbind(swd_occ, swd_bg)  ## stack rows: occurrences first, then background
    saveRDS(swd, file.path(outdir_sp, 'swd.rds'))
    pa  <- rep(1:0, c(nrow(swd_occ), nrow(swd_bg)))  ## 1 = presence, 0 = background
    
    ## Now, set the features to be used by maxent ::
    ## Linear, product and quadratic
    off <- setdiff(c('l', 'p', 'q', 't', 'h'), features)
    
    ## This sets threshold and hinge features to "off"
    if(length(off) > 0) {
      off <- c(l = 'linear=false',    p = 'product=false', q = 'quadratic=false',
               t = 'threshold=false', h = 'hinge=false')[off]
    }
    
    off <- unname(off)
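    ## e.g. with features = c('l', 'p', 'q'), off becomes
    ## c('threshold=false', 'hinge=false'), so only LPQ features remain enabled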
    gc()
    
    if (backwards_sel) {
      
      ## Coerce the "taxa with data" (SWD) files to regular data.frames
      ## This is needed to use the simplify function
      swd_occ_df     <- as.data.frame(swd_occ_sp)
      swd_occ_df$lon <- NULL
      swd_occ_df$lat <- NULL
      swd_bg_df      <- as.data.frame(swd_bg)
      swd_bg_df$lon  <- NULL
      swd_bg_df$lat  <- NULL
      
      ## Need to create a taxa column here.
      swd_occ_df$searchTaxon <- taxa
      swd_bg_df$searchTaxon  <- taxa
      
      ## Make sure the colnames match
      common_cols <- intersect(names(swd_occ_df), names(swd_bg_df))
      swd_occ_df  <- dplyr::select(swd_occ_df, all_of(common_cols))
      swd_bg_df   <- dplyr::select(swd_bg_df,  all_of(common_cols))
      
      ## Run simplify rmaxent::simplify
      
      # Given a candidate set of predictor variables, this function identifies
      # a subset that meets specified multicollinearity criteria. Subsequently,
      # backward stepwise variable selection (VIF) is used to iteratively drop
      # the variable that contributes least to the model, until the contribution
      # of each variable meets a specified minimum, or until a predetermined
      # minimum number of predictors remains. It returns a model object for the
      # full model, rather than a list of models as does the previous function
      
      ## Using a modified version of rmaxent::simplify, so that the name of the
      ## maxent model object "maxent_fitted.rds" is the same in both models.
      ## This is needed to run the mapping step over either the full or BS folder
      m <- local_simplify(
        
        swd_occ_df,
        swd_bg_df,
        path            = bsdir,
        taxa_column     = "searchTaxon",
        replicates      = replicates,  ## 5 as above
        response_curves = TRUE,
        logistic_format = TRUE,
        cor_thr         = cor_thr,
        pct_thr         = pct_thr,
        k_thr           = k_thr,
        features        = features,    ## LPQ as above
        quiet           = FALSE)
      
      ## Read the model back in, because the saved list object is tricky to index
      bs.model <- readRDS(sprintf('%s/%s/full/maxent_fitted.rds', bsdir,  save_name))
      
      ## Sanity check: the presence data should have one row per occurrence record
      identical(length(bs.model@presence$Annual_mean_temp), nrow(occ))
      
      ## Save the predictor correlation chart for the training data set
      
      png(sprintf('%s/%s/full/%s_%s.png', bsdir,
                  save_name, save_name, "bs_predictor_correlation"),
          3236, 2000, units = 'px', res = 300)
      
      ## set margins
      par(mar   = c(3, 3, 5, 3),
          oma   = c(1.5, 1.5, 1.5, 1.5))
      
      ## Add detail to the response plot
      chart.Correlation(bs.model@presence,
                        histogram = TRUE, pch = 19,
                        title = paste0('Reduced variable correlations for ', save_name))
      
      dev.off()
      gc()
      
    } else {
      message("Don't run backwards selection")
    }
    
  } else {
    message('Fewer than the minimum number of unique occurrence records (min_n) ',
            'for taxon ', taxa,
            ' - model not fitted for this taxon')
  }
}






#' @title Run SDM analyses for 'species with data' format, with bio-geographic cropping.
#' 
#' @description This function takes a data frame of all taxa records,
#' and runs a specialised maxent analysis for each taxon.
#' It uses the dismo package and the rmaxent package (https://github.com/johnbaums/rmaxent).
#' @param taxa_list          Character vector - the taxa to run maxent models for
#' @param taxa_level         Character string - the taxonomic level to run maxent models for
#' @param sdm_df             SpatialPointsDataFrame. Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param sdm_predictors     Character vector - names of the environmental predictor variables to include
#' @param maxent_dir         Character string - The file path used for saving the maxent output
#' @param bs_dir             Character string - The file path used for saving the backwards selection maxent output
#' @param backwards_sel      Logical - Run backwards selection using the maxent models (T/F)?
#' @param template_raster    RasterLayer - Empty raster with analysis extent (global), res (1km) and projection (Mollweide, ESRI 54009)
#' @param cor_thr            Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr            Numeric - The min allowable percent variable contribution
#' @param k_thr              Numeric - The min number of variables to be kept in the model
#' @param min_n              Numeric - The min number of records for running maxent
#' @param max_bg_size        Numeric - The max number of background points to keep
#' @param background_buffer_width Numeric - The max distance (km) from occurrence points within which background points are selected
#' @param shapefiles         Logical - Save shapefiles of the occ and bg data (T/F)?
#' @param features           Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates         Numeric - The number of replicates to use
#' @param responsecurves     Logical - Save response curves of the maxent models (T/F)?
#' @param crop_Koppen        Logical - Should we use a koppen zone raster to select background points?
#' @param Koppen_raster      RasterLayer of global koppen zones, in Mollweide (ESRI 54009) projection
#' @param Koppen_zones       Dataframe of global koppen zones, with columns : GRIDCODE, Koppen
#' @param country_shp        SpatialPolygonsDataFrame (read from a .rds file) of the country, used for mapping maxent points
#' @export
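#' @examples
#' ## A minimal usage sketch (not run). The input objects, paths and predictor
#' ## names below are hypothetical placeholders.
#' \dontrun{
#' run_sdm_analysis_crop(taxa_list               = c('Acacia dealbata'),
#'                       taxa_level              = 'searchTaxon',
#'                       sdm_df                  = sdm_data_spdf,
#'                       sdm_predictors          = c('Annual_mean_temp', 'Annual_precip'),
#'                       maxent_dir              = 'output/maxent/full_models',
#'                       bs_dir                  = 'output/maxent/back_sel_models',
#'                       backwards_sel           = TRUE,
#'                       template_raster         = template_raster_1km_mol,
#'                       cor_thr                 = 0.8,
#'                       pct_thr                 = 5,
#'                       k_thr                   = 4,
#'                       min_n                   = 20,
#'                       max_bg_size             = 70000,
#'                       background_buffer_width = 200000,
#'                       shapefiles              = TRUE,
#'                       features                = 'lpq',
#'                       replicates              = 5,
#'                       responsecurves          = TRUE,
#'                       crop_Koppen             = TRUE,
#'                       Koppen_raster           = koppen_zones_mol,
#'                       Koppen_zones            = koppen_zones_df,
#'                       country_shp             = country_spdf)
#' }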
run_sdm_analysis_crop = function(taxa_list,
                                 taxa_level,
                                 sdm_df,
                                 sdm_predictors,
                                 maxent_dir,
                                 bs_dir,
                                 backwards_sel,
                                 template_raster,
                                 cor_thr,
                                 pct_thr,
                                 k_thr,
                                 min_n,
                                 max_bg_size,
                                 background_buffer_width,
                                 shapefiles,
                                 features,
                                 replicates,
                                 responsecurves,
                                 crop_Koppen,
                                 Koppen_raster,
                                 Koppen_zones,
                                 country_shp) {
  
  
  ## Convert to SF object for selection - inefficient
  message('Preparing spatial data for SDMs')
  
  ## Loop over all the taxa
  ## taxa <- taxa_list[1]
  lapply(taxa_list, function(taxa){
    
    ## Skip the taxa if the directory already exists, before the loop
    # outdir <- maxent_dir
    
    ## Could also check the folder size, so folders with no contents aren't skipped eg
    ## && sum(file.info(dir_name)$size) < 1000 (EG 1MB)
    dir_name = file.path(maxent_dir, gsub(' ', '_', taxa))
    
    if(!dir.exists(dir_name)) {
      message('Maxent not yet run for ', taxa, ' - running')
      
      ## Create the directory for the taxa in progress,
      ## so other parallel runs don't run the same taxa
      dir.create(dir_name)
      file.create(file.path(dir_name, "in_progress.txt"))
      
      ## Print the taxa being processed to screen
      if(taxa %in% sdm_df[[taxa_level]]) {
        message('Doing ', taxa)
        
        ## Subset the records to only the taxa being processed - in the searched taxa or returned
        occurrence <- sdm_df %>% .[.[[taxa_level]] %in% taxa, ]
        message('Using ', nrow(occurrence), ' occ records from ', unique(occurrence$SOURCE))
        
        ## Now get the background points. These can come from any taxa, other than the modelled taxa.
        ## However, they should be limited to the same SOURCE as the occ data
        background <- sdm_df %>% .[!.[[taxa_level]] %in% taxa, ]
        message('Using ', nrow(background), ' background records from ', unique(background$SOURCE))
        
        if(!crop_Koppen) {
          Koppen_raster = FALSE
          Koppen_zones  = FALSE
          
        }
        
        ## Finally fit the models using FIT_MAXENT_TARG_BG. Also use tryCatch to skip any exceptions
        tryCatch(
          fit_maxent_targ_bg_back_sel_crop(occ                     = occurrence,
                                           bg                      = background,
                                           sdm_predictors          = sdm_predictors,
                                           taxa                    = taxa,
                                           outdir                  = maxent_dir,
                                           bsdir                   = bs_dir,
                                           backwards_sel           = backwards_sel,
                                           cor_thr                 = cor_thr,
                                           pct_thr                 = pct_thr,
                                           k_thr                   = k_thr,
                                           
                                           crop_Koppen             = crop_Koppen,
                                           template_raster         = template_raster,
                                           min_n                   = min_n,
                                           max_bg_size             = max_bg_size,
                                           Koppen_raster           = Koppen_raster,
                                           Koppen_zones            = Koppen_zones,
                                           background_buffer_width = background_buffer_width,
                                           shapefiles              = shapefiles,
                                           features                = features,
                                           replicates              = replicates,
                                           responsecurves          = responsecurves,
                                           country_shp             = country_shp),
          
          
          ## Save error message
          error = function(cond) {
            
            ## How to put the message into the file?
            file.create(file.path(dir_name, "sdm_failed.txt"))
            message(taxa, ' failed')
            cat(cond$message, file = file.path(dir_name, "sdm_failed.txt"))
            warning(taxa, ': ', cond$message)
            
          })
        
      } else {
        message(taxa, ' skipped - no data.') ## This branch catches taxa with no records in sdm_df
      }
      
      ## now add a file to the dir to denote that it has completed
      file.create(file.path(dir_name, "completed.txt"))
      
    } else {
      message(taxa, ' sdm already exists, skipped') ## Skip taxa whose output folder already exists
    }
    
  })
  
}







#' @title Maxent Backwards selection for 'species with data' format, with cropping. 
#' @description 
#' This function takes a data frame of all taxa records,
#' and runs a specialised maxent analysis for each taxon.
#' It uses the rmaxent package https://github.com/johnbaums/rmaxent
#' It assumes that the input df is that returned by the prepare_sdm_table function
#' 
#' @param occ                SpatialPointsDataFrame - Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param bg                 SpatialPointsDataFrame - Spdf of candidate background points
#' @param sdm_predictors     Character vector - names of the environmental predictor variables to include
#' @param taxa               Character string - the taxon being modelled
#' @param outdir             Character string - The file path used for saving the maxent output
#' @param bsdir              Character string - The file path used for saving the backwards selection maxent output
#' @param backwards_sel      Logical - Run backwards selection using the maxent models (T/F)?
#' @param template_raster    RasterLayer - Empty raster with analysis extent (global), res (1km) and projection (Mollweide, ESRI 54009)
#' @param cor_thr            Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr            Numeric - The min allowable percent variable contribution
#' @param k_thr              Numeric - The min number of variables to be kept in the model
#' @param min_n              Numeric - The min number of records for running maxent
#' @param max_bg_size        Numeric - The max number of background points to keep
#' @param background_buffer_width Numeric - The max distance (km) from occurrence points within which background points are selected
#' @param crop_Koppen        Logical - Should we use a koppen zone raster to select background points?
#' @param Koppen_raster      RasterLayer of global koppen zones, in Mollweide (ESRI 54009) projection
#' @param Koppen_zones       Dataframe of global koppen zones, with columns : GRIDCODE, Koppen
#' @param shapefiles         Logical - Save shapefiles of the occ and bg data (T/F)?
#' @param features           Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates         Numeric - The number of replicates to use
#' @param responsecurves     Logical - Save response curves of the maxent models (T/F)?
#' @param country_shp        SpatialPolygonsDataFrame (read from a .rds file) of the country, used for mapping maxent points
#' @export fit_maxent_targ_bg_back_sel_crop
fit_maxent_targ_bg_back_sel_crop <- function(occ,
                                             bg,
                                             sdm_predictors,
                                             taxa,
                                             outdir,
                                             bsdir,
                                             cor_thr,
                                             pct_thr,
                                             k_thr,
                                             backwards_sel,
                                             template_raster,
                                             min_n,
                                             max_bg_size,
                                             background_buffer_width,
                                             crop_Koppen,
                                             Koppen_raster,
                                             Koppen_zones,
                                             shapefiles,
                                             features,
                                             replicates,
                                             responsecurves,
                                             country_shp) {
  
  
  ## If we aren't cropping the raster to the Koppen extent, don't use that argument
  
  ## First, stop if the output directory does not exist
  if(!file.exists(outdir)) stop('outdir does not exist :(', call. = FALSE)
  outdir_sp <- file.path(outdir, gsub(' ', '_', taxa))
  bsdir_sp  <- file.path(bsdir,  gsub(' ', '_', taxa))
  
  if(crop_Koppen) {
    if(!missing(Koppen_raster)) {
      if(!is(Koppen_raster, 'RasterLayer'))
        stop('Koppen must be a RasterLayer, and should be in the same coordinate system as template_raster')
    }
  }
  
  ## If the file doesn't exist, split out the features
  if(!file.exists(outdir_sp)) dir.create(outdir_sp)
  features <- unlist(strsplit(features, ''))
  
  ## Make sure user features are allowed: don't run the model if the
  ## features have been incorrectly specified in the main argument
  ## l: linear
  ## p: product
  ## q: quadratic
  ## h: hinge       ## disabled for this analysis
  ## t: threshold   ## disabled for this analysis
  if(length(setdiff(features, c('l', 'p', 'q', 'h', 't'))) > 0)
    stop("features must be a vector of one or more of 'l', 'p', 'q', 'h', and 't'.")
  
  ## Create a buffer of background_buffer_width around the occurrence points,
  ## used to constrain where background points can be drawn from
  buffer <- aggregate(gBuffer(occ, width = background_buffer_width, byid = TRUE))
  
  ## Get unique cell numbers for taxa occurrences
  template_raster_spat <- terra::rast(template_raster)
  occ_mat              <- cbind(lon = occ$lon, lat = occ$lat) 
  cells                <- terra::cellFromXY(template_raster_spat, occ_mat)
  
  ## Clean out duplicate cells and NAs (including points outside extent of predictor data)
  ## Note this will get rid of a lot of duplicate records not filtered out by GBIF columns, etc.
  not_dupes <- which(!duplicated(cells) & !is.na(cells))
  occ       <- occ[not_dupes, ]
  cells     <- cells[not_dupes]
  message(nrow(occ), ' occurrence records (unique cells).')
  
  ## Skip taxa that have fewer than the minimum number of records (min_n, e.g. 20)
  if(nrow(occ) > min_n) {
    
    ## Subset the background records to the buffered polygon
    message(taxa, ' creating background cells')
    system.time(o <- over(bg, buffer))
    bg        <- bg[which(!is.na(o)), ]
    bg_mat    <- cbind(lon = bg$lon, lat = bg$lat) 
    bg_cells  <- terra::cellFromXY(template_raster_spat, bg_mat)
    
    ## Clean out duplicates and NAs (including points outside extent of predictor data)
    bg_not_dupes       <- which(!duplicated(bg_cells) & !is.na(bg_cells))
    bg_unique          <- bg[bg_not_dupes, ]
    bg_cells_unique    <- bg_cells[bg_not_dupes]
    bg_mat_unique      <- cbind(lon = bg_unique$lon, lat = bg_unique$lat) 
    
    ## Find which of these cells fall within the Koppen-Geiger zones that the taxon occupies
    ## Crop the Koppen raster to the extent of the occurrences, and snap it.
    if(crop_Koppen) {
      
      message(taxa, ' intersecting background cells with Koppen zones')
      Koppen_crop <- raster::crop(Koppen_raster, occ, snap = 'out')
      
      ## Only extract and match those cells that overlap between the ::
      ## 1). cropped koppen zone,
      ## 2). occurrences and
      ## 3). background points
      zones               <- raster::extract(Koppen_crop, occ)
      cells_in_zones_crop <- raster::Which(Koppen_crop %in% zones, cells = TRUE)
      Koppen_crop_rast    <- terra::rast(Koppen_crop)
      Koppen_rast         <- terra::rast(Koppen_raster)
      cells_in_zones      <- terra::cellFromXY(Koppen_rast, terra::xyFromCell(Koppen_crop_rast, cells_in_zones_crop))
      
      bg_cells_zones      <- intersect(bg_cells_unique, cells_in_zones)  ## this is 0 for 5km
      i                   <- terra::cellFromXY(template_raster_spat, bg_mat_unique)
      bg_crop             <- bg_unique[which(i %in% bg_cells_zones), ]
      
      ## For some taxa, we have the problem that the proportion of ALA/INV data is
      ## very different in the occurrence vs the bg records.
      ## This should be caused by the 200km / koppen restriction, etc.
      
      ## Now save an image of the background points
      ## This is useful to quality control the models
      save_name = gsub(' ', '_', taxa)
      
      aus.mol = country_shp  %>%
        spTransform(projection(buffer))
      
      aus.kop = raster::crop(Koppen_crop, aus.mol)
      occ.mol <- occ %>%
        spTransform(projection(buffer))
      
      ## Then save the occurrence points
      png(sprintf('%s/%s/%s_%s.png', outdir, save_name, save_name, "buffer_occ"),
          16, 10, units = 'in', res = 300)
      
      plot(Koppen_crop, legend = FALSE,
           main = paste0('Occurrence SDM records for ', taxa))
      
      plot(aus.mol, add = TRUE)
      plot(buffer,  add = TRUE, col = "red")
      plot(occ.mol, add = TRUE, col = "blue")
      
      dev.off()
      
      
    } else {
      message(taxa, ' Do not intersect background cells with Koppen zones')
      i                   <- terra::cellFromXY(template_raster_spat, bg_mat_unique)
      bg_crop             <- bg_unique[which(i %in% bg_cells_unique), ]
      
      ## Now save an image of the background points
      ## This is useful to quality control the models
      save_name = gsub(' ', '_', taxa)
      
      aus.mol = country_shp  %>%
        spTransform(projection(buffer))
      
      occ.mol <- occ %>%
        spTransform(projection(buffer))
      
      ## Then save the occurrence points
      png(sprintf('%s/%s/%s_%s.png', outdir, save_name, save_name, "buffer_occ"),
          16, 10, units = 'in', res = 300)
      
      plot(aus.mol, legend = FALSE,
           main = paste0('Occurrence SDM records for ', taxa))
      plot(buffer,  add = TRUE, col = "red")
      plot(occ.mol, add = TRUE, col = "blue")
      
      dev.off()
      
    }
    
    ## Reduce background sample, if it's larger than max_bg_size
    if (nrow(bg_crop) > max_bg_size) {
      
      message(nrow(bg_crop), ' target taxa background records for ', taxa,
              ', reduced to random ', max_bg_size, ' using random points from :: ', unique(bg_crop$SOURCE))
      bg.samp <- bg_crop[sample(nrow(bg_crop), max_bg_size), ]
      
    } else {
      ## If there are fewer bg points than max_bg_size, just keep all of them
      message(nrow(bg_crop), ' target taxa background records for ', taxa,
              ' using all points from :: ', unique(bg_crop$SOURCE))
      bg.samp <- bg_crop
    }
    
    ## Now save the buffer, the occ and bg points as shapefiles
    if(shapefiles) {
      
      suppressWarnings({
        
        message(taxa, ' writing occ and bg shapefiles')
        writeOGR(SpatialPolygonsDataFrame(buffer, data.frame(ID = seq_len(length(buffer)))),
                 outdir_sp, paste0(save_name, '_bg_buffer'),       'ESRI Shapefile', overwrite_layer = TRUE)
        writeOGR(bg.samp,   outdir_sp, paste0(save_name, '_bg'),   'ESRI Shapefile', overwrite_layer = TRUE)
        writeOGR(occ,       outdir_sp, paste0(save_name, '_occ'),  'ESRI Shapefile', overwrite_layer = TRUE)
        
      })
      
    }
    
    ## Also save the background and occurrence points as .rds files
    saveRDS(bg.samp,  file.path(outdir_sp, paste0(save_name, '_bg.rds')))
    saveRDS(occ,      file.path(outdir_sp, paste0(save_name, '_occ.rds')))
    
    ## SWD = taxa with data. Now sample the environmental
    ## variables used in the model at all the occ and bg points
    swd_occ <- occ[, sdm_predictors]
    saveRDS(swd_occ, file.path(outdir_sp, paste0(save_name,'_occ_swd.rds')))
    
    swd_bg <- bg.samp[, sdm_predictors]
    saveRDS(swd_bg, file.path(outdir_sp, paste0(save_name, '_bg_swd.rds')))
    
    ## Save the SWD tables as shapefiles
    if(shapefiles) {
      writeOGR(swd_occ, outdir_sp,  paste0(save_name, '_occ_swd'), 'ESRI Shapefile', overwrite_layer = TRUE)
      writeOGR(swd_bg,  outdir_sp,  paste0(save_name, '_bg_swd'),  'ESRI Shapefile', overwrite_layer = TRUE)
    }
    
    ## Now combine the occurrence and background data
    swd <- as.data.frame(rbind(swd_occ@data, swd_bg@data))
    saveRDS(swd, file.path(outdir_sp, 'swd.rds'))
    pa  <- rep(1:0, c(nrow(swd_occ), nrow(swd_bg)))
    
    ## Now, set the features to be used by maxent ::
    ## Linear, product and quadratic
    off <- setdiff(c('l', 'p', 'q', 't', 'h'), features)
    
    ## This sets threshold and hinge features to "off"
    if(length(off) > 0) {
      
      off <- c(l = 'linear=false',    p = 'product=false', q = 'quadratic=false',
               t = 'threshold=false', h = 'hinge=false')[off]
      
    }
    
    off <- unname(off)
    
    ## Run the cross-validation replicates. Default me_xval to NULL so the
    ## saveRDS call below still works when replicates <= 1
    me_xval <- NULL
    
    if(replicates > 1) {
      
      ## Placeholder for extra maxent arguments for the replicate runs
      rep_args <- NULL
      
      ## Run MAXENT for cross validation data splits of swd : so 5 replicates, 0-4
      ## first argument is the predictors, the second is the occurrence data
      message(taxa, ' running xval maxent')
      me_xval <- dismo::maxent(swd, pa, path = file.path(outdir_sp, 'xval'),
                               args = c(paste0('replicates=', replicates),
                                        'responsecurves=true',
                                        'outputformat=logistic',
                                        off, paste(names(rep_args), rep_args, sep = '=')))
    }
    
    ## Run the full maxent model - using all the data in swd
    ## This uses DISMO to output standard files, but the names can't be altered
    full_args <- NULL
    message(taxa, ' running full maxent')
    me_full <- maxent(swd, pa, path = file.path(outdir_sp, 'full'),
                      args = c(off, paste(names(full_args), full_args, sep = '='),
                               'responsecurves=true',
                               'outputformat=logistic'))
    
    ## Save the full model. Replicate this line in the backwards selection algorithm.
    ## The Koppen gridcodes are stored inside the saved list
    if(crop_Koppen) {
      saveRDS(list(me_xval = me_xval, me_full = me_full, swd = swd, pa = pa,
                   koppen_gridcode = as.character(Koppen_zones$Koppen[match(unique(zones), Koppen_zones$GRIDCODE)])),
              file.path(outdir_sp, 'full', 'maxent_fitted.rds'))
      
    } else {
      saveRDS(list(me_xval = me_xval, me_full = me_full, swd = swd, pa = pa),
              file.path(outdir_sp, 'full', 'maxent_fitted.rds'))
    }
    
    if (backwards_sel) {
      
      ## Coerce the "taxa with data" (SWD) files to regular data.frames
      ## This is needed to use the simplify function
      swd_occ     <- as.data.frame(swd_occ)
      swd_occ$lon <- NULL
      swd_occ$lat <- NULL
      swd_bg      <- as.data.frame(swd_bg)
      swd_bg$lon  <- NULL
      swd_bg$lat  <- NULL
      
      ## Need to create a taxa column here
      swd_occ$searchTaxon <- taxa
      swd_bg$searchTaxon  <- taxa
      
      ## Run simplify rmaxent::simplify
      
      # Given a candidate set of predictor variables, this function identifies
      # a subset that meets specified multicollinearity criteria. Subsequently,
      # backward stepwise variable selection (VIF) is used to iteratively drop
      # the variable that contributes least to the model, until the contribution
      # of each variable meets a specified minimum, or until a predetermined
      # minimum number of predictors remains. It returns a model object for the
      # full model, rather than a list of models as does the previous function
      
      ## Using a modified version of rmaxent::simplify, so that the name of the
      ## maxent model object "maxent_fitted.rds" is the same in both models.
      ## This is needed to run the mapping step over either the full or BS folder
      m <- local_simplify(
        
        swd_occ,
        swd_bg,
        path            = bsdir,
        taxa_column     = "searchTaxon",
        replicates      = replicates,  ## 5 as above
        response_curves = TRUE,
        logistic_format = TRUE,
        cor_thr         = cor_thr,
        pct_thr         = pct_thr,
        k_thr           = k_thr,
        features        = features,    ## LPQ as above
        quiet           = FALSE)
      
      ## Save the bg, occ and swd files into the backwards selection folder too
      saveRDS(bg.samp,  file.path(bsdir_sp, paste0(save_name, '_bg.rds')))
      saveRDS(occ,      file.path(bsdir_sp, paste0(save_name, '_occ.rds')))
      saveRDS(swd,      file.path(bsdir_sp, paste0('swd.rds')))
      
      ## Read the model back in, because the saved list object is tricky to index
      bs.model <- readRDS(sprintf('%s/%s/full/maxent_fitted.rds', bsdir,  save_name))
      
      ## Sanity check: the presence data should have one row per occurrence record
      identical(length(bs.model@presence$Annual_mean_temp), nrow(occ))
      
      ## Save the predictor correlation chart for the training data set
      
      png(sprintf('%s/%s/full/%s_%s.png', bsdir,
                  save_name, save_name, "bs_predictor_correlation"),
          3236, 2000, units = 'px', res = 300)
      
      ## set margins
      par(mar   = c(3, 3, 5, 3),
          oma   = c(1.5, 1.5, 1.5, 1.5))
      
      ## Add detail to the response plot
      chart.Correlation(bs.model@presence,
                        histogram = TRUE, pch = 19,
                        title = paste0('Reduced variable correlations for ', save_name))
      
      dev.off()
      gc()
      
    } else {
      message("Don't run backwards selection")
    }
    
  } else {
    message('Fewer than the minimum number of unique occurrence records (min_n) ',
            'for taxon ', taxa,
            ' - model not fitted for this taxon')
  }
}




#' @title Maxent full models for 'species with data' format, without biogeographic cropping. 
#' @description 
#' This function takes a data frame of all taxa records,
#' and runs a specialised maxent analysis for each taxon.
#' It uses the rmaxent package https://github.com/johnbaums/rmaxent
#' It assumes that the input df is that returned by the prepare_sdm_table function
#' 
#' @param occ                SpatialPointsDataFrame - Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param bg                 SpatialPointsDataFrame - Spdf of candidate background points
#' @param sdm_predictors     Character vector - names of the environmental predictor variables to include
#' @param taxa               Character string - the taxon being modelled
#' @param outdir             Character string - The file path used for saving the maxent output
#' @param bsdir              Character string - The file path used for saving the backwards selection maxent output
#' @param template_raster    RasterLayer - Empty raster with analysis extent (global), res (1km) and projection (Mollweide, ESRI 54009)
#' @param cor_thr            Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr            Numeric - The min allowable percent variable contribution
#' @param k_thr              Numeric - The min number of variables to be kept in the model
#' @param min_n              Numeric - The min number of records for running maxent
#' @param max_bg_size        Numeric - The max number of background points to keep
#' @param background_buffer_width Numeric - The max distance (km) from occurrence points within which background points are selected
#' @param shapefiles         Logical - Save shapefiles of the occ and bg data (T/F)?
#' @param features           Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates         Numeric - The number of replicates to use
#' @param responsecurves     Logical - Save response curves of the maxent models (T/F)?
#' @param poly               sf object - polygon layer used when plotting the occurrence and background data
#' @export fit_maxent_targ_bg_full_no_crop
fit_maxent_targ_bg_full_no_crop <- function(occ,
                                            bg,
                                            sdm_predictors,
                                            taxa,
                                            outdir,
                                            bsdir,
                                            cor_thr,
                                            pct_thr,
                                            k_thr,
                                            template_raster,
                                            min_n,
                                            max_bg_size,
                                            background_buffer_width,
                                            shapefiles,
                                            features,
                                            replicates,
                                            responsecurves,
                                            poly) {
  
  ## First, stop if the output directory does not exist
  if(!file.exists(outdir)) stop('outdir does not exist :(', call. = FALSE)
  outdir_sp <- file.path(outdir, gsub(' ', '_', taxa))
  bsdir_sp  <- file.path(bsdir,  gsub(' ', '_', taxa))
  
  ## If the file doesn't exist, split out the features
  if(!file.exists(outdir_sp)) dir.create(outdir_sp)
  features <- unlist(strsplit(features, ''))
  
  ## Make sure user features are allowed: don't run the model if the
  ## features have been incorrectly specified in the main argument
  ## l: linear
  ## p: product
  ## q: quadratic
  ## h: hinge       ## disabled for this analysis
  ## t: threshold   ## disabled for this analysis
  if(length(setdiff(features, c('l', 'p', 'q', 'h', 't'))) > 0)
    stop("features must be a vector of one or more of 'l', 'p', 'q', 'h', and 't'.")
  
  ## Create a buffer of background_buffer_width around the occurrence points,
  ## used to constrain where background points can be drawn from
  buffer <- aggregate(gBuffer(occ, width = background_buffer_width, byid = TRUE))
  
  ## Get unique cell numbers for taxa occurrences
  template_raster_spat <- terra::rast(template_raster)
  occ_mat              <- cbind(lon = occ$lon, lat = occ$lat) 
  cells                <- terra::cellFromXY(template_raster_spat, occ_mat)
  
  ## Clean out duplicate cells and NAs (including points outside extent of predictor data)
  ## Note this will get rid of a lot of duplicate records not filtered out by GBIF columns, etc.
  not_dupes <- which(!duplicated(cells) & !is.na(cells))
  occ       <- occ[not_dupes, ]
  cells     <- cells[not_dupes]
  message(nrow(occ), ' occurrence records (unique cells).')
  
  ## Skip taxa that have fewer than the minimum number of records (min_n, e.g. 20)
  if(nrow(occ) > min_n) {
    
    ## Subset the background records to the buffered polygon
    message(taxa, ' creating background cells')
    system.time(o <- over(bg, buffer))
    bg        <- bg[which(!is.na(o)), ]
    bg_mat    <- cbind(lon = bg$lon, lat = bg$lat) 
    bg_cells  <- terra::cellFromXY(template_raster_spat, bg_mat)
    
    ## Clean out duplicates and NAs (including points outside extent of predictor data)
    bg_not_dupes       <- which(!duplicated(bg_cells) & !is.na(bg_cells))
    bg_unique          <- bg[bg_not_dupes, ]
    bg_cells_unique    <- bg_cells[bg_not_dupes]
    bg_mat_unique      <- cbind(lon = bg_unique$lon, lat = bg_unique$lat) 
    
    ## In this version, the background cells are NOT intersected with Koppen zones
    message(taxa, ' Do not intersect background cells with Koppen zones')
    message('country poly is a ', class(poly))
    i                   <- terra::cellFromXY(template_raster_spat, bg_mat_unique)
    bg_crop             <- bg_unique[which(i %in% bg_cells_unique), ]
    
    ## Now save an image of the background points
    ## This is useful to quality control the models
    save_name = gsub(' ', '_', taxa)
    
    ## Then save the occurrence points
    png(sprintf('%s/%s/%s_%s.png', outdir, save_name, save_name, "buffer_occ"),
        16, 10, units = 'in', res = 300)
    
    raster::plot(st_geometry(poly), 
                 legend = FALSE,
                 main   = paste0('Occurrence SDM records for ', taxa))
    
    raster::plot(buffer,  add = TRUE, col = "red")
    raster::plot(occ, add = TRUE, col = "blue")
    
    dev.off()
    
    gc()
    
    ## Reduce background sample, if it's larger than max_bg_size
    if (nrow(bg_crop) > max_bg_size) {
      
      message(nrow(bg_crop), ' target taxa background records for ', taxa,
              ', reduced to random ', max_bg_size, ' using random points from :: ', unique(bg_crop$SOURCE))
      bg.samp <- bg_crop[sample(nrow(bg_crop), max_bg_size), ]
      
    } else {
      ## If there are fewer bg points than max_bg_size, just keep all of them
      message(nrow(bg_crop), ' target taxa background records for ', taxa,
              ' using all points from :: ', unique(bg_crop$SOURCE))
      bg.samp <- bg_crop
    }
    
    ## Now save the buffer, the occ and bg points as shapefiles
    if(shapefiles) {
      
      suppressWarnings({
        
        message(taxa, ' writing occ and bg shapefiles')
        writeOGR(SpatialPolygonsDataFrame(buffer, data.frame(ID = seq_len(length(buffer)))),
                 outdir_sp, paste0(save_name, '_bg_buffer'),       'ESRI Shapefile', overwrite_layer = TRUE)
        writeOGR(bg.samp,   outdir_sp, paste0(save_name, '_bg'),   'ESRI Shapefile', overwrite_layer = TRUE)
        writeOGR(occ,       outdir_sp, paste0(save_name, '_occ'),  'ESRI Shapefile', overwrite_layer = TRUE)
        
      })
      
    }
    
    ## Also save the background and occurrence points as .rds files
    saveRDS(bg.samp,  file.path(outdir_sp, paste0(save_name, '_bg.rds')))
    saveRDS(occ,      file.path(outdir_sp, paste0(save_name, '_occ.rds')))
    
    ## SWD = taxa with data. Now sample the environmental
    ## variables used in the model at all the occ and bg points
    swd_occ <- occ[, sdm_predictors]
    saveRDS(swd_occ, file.path(outdir_sp, paste0(save_name,'_occ_swd.rds')))
    
    swd_bg <- bg.samp[, sdm_predictors]
    saveRDS(swd_bg, file.path(outdir_sp, paste0(save_name, '_bg_swd.rds')))
    
    ## Save the SWD tables as shapefiles
    if(shapefiles) {
      writeOGR(swd_occ, outdir_sp,  paste0(save_name, '_occ_swd'), 'ESRI Shapefile', overwrite_layer = TRUE)
      writeOGR(swd_bg,  outdir_sp,  paste0(save_name, '_bg_swd'),  'ESRI Shapefile', overwrite_layer = TRUE)
    }
    
    gc()
    
    ## Now combine the occurrence and background data
    swd <- as.data.frame(rbind(swd_occ@data, swd_bg@data))
    saveRDS(swd, file.path(outdir_sp, 'swd.rds'))
    pa  <- rep(1:0, c(nrow(swd_occ), nrow(swd_bg)))
    
    ## Now, set the features to be used by maxent ::
    ## Linear, product and quadratic
    off <- setdiff(c('l', 'p', 'q', 't', 'h'), features)
    
    ## This sets threshold and hinge features to "off"
    if(length(off) > 0) {
      
      off <- c(l = 'linear=false',    p = 'product=false', q = 'quadratic=false',
               t = 'threshold=false', h = 'hinge=false')[off]
      
    }
    
    off <- unname(off)
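    
    ## e.g. with features = c('l', 'p', 'q'), off becomes
    ## c('threshold=false', 'hinge=false'), so maxent fits linear,
    ## product and quadratic features only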
    
    ## Run the replicates. Initialise me_xval first, so the saveRDS call
    ## below works even when replicates == 1
    me_xval <- NULL
    if(replicates > 1) {
      
      ## rep_args holds any extra maxent arguments for the cross-validation
      ## runs; none are needed here, so it stays NULL
      rep_args <- NULL
      
      ## Run MAXENT for cross-validation data splits of swd: e.g. 5 replicates, 0-4.
      ## The first argument is the predictors, the second is the occurrence data
      message(taxa, ' running xval maxent')
      me_xval <- dismo::maxent(swd, pa, path = file.path(outdir_sp, 'xval'),
                               args = c(paste0('replicates=', replicates),
                                        'responsecurves=true',
                                        'outputformat=logistic',
                                        off, paste(names(rep_args), rep_args, sep = '=')))
    }
    
    gc()
    
    ## Run the full maxent model - using all the data in swd
    ## This uses DISMO to output standard files, but the names can't be altered
    full_args <- NULL
    message(taxa, ' running full maxent')
    me_full <- dismo::maxent(swd, pa, path = file.path(outdir_sp, 'full'),
                      args = c(off, paste(names(full_args), full_args, sep = '='),
                               'responsecurves=true',
                               'outputformat=logistic'))
    
    ## Save the full model. Replicate this line in the backwards selection algorithm
    ## Remove Koppen from the end
    saveRDS(list(me_xval = me_xval, me_full = me_full, swd = swd, pa = pa),
            file.path(outdir_sp, 'full', 'maxent_fitted.rds'))
    
  } else {
    message('Fewer occurrence records than the number of cross-validation ',
            'replicates for taxa ', taxa,
            '. Model not fitted for this taxa.')
  }
}





#' @title Simplify maxent model
#' @description This is a local version of rmaxent::simplify (https://github.com/johnbaums/rmaxent)
#' Given a candidate set of predictor variables, this function identifies
#' a subset that meets specified multicollinearity criteria. Subsequently,
#' backward stepwise variable selection is used to iteratively drop the variable
#' that contributes least to the model, until the contribution of each variable
#' meets a specified minimum, or until a predetermined minimum number of predictors remains.
#' 
#' It assumes the input data frames are those returned by the fit_maxent_targ_bg_back_sel functions
#' @param occ                SpatialPointsDataFrame - Spdf of all taxa records returned by the 'prepare_sdm_table' function
#' @param bg                 SpatialPointsDataFrame - Spdf of candidate background points
#' @param path               Character string - The directory used for saving the maxent output
#' @param taxa_column        Character string - Name of the column in occ and bg holding the taxa name
#' @param cor_thr            Numeric - The max allowable pairwise correlation between predictor variables
#' @param pct_thr            Numeric - The min allowable percent variable contribution
#' @param k_thr              Numeric - The min number of variables to be kept in the model
#' @param features           Character string - Which features should be used? (e.g. linear, product, quadratic 'lpq')
#' @param replicates         Numeric - The number of replicates to use
#' @param type               Character string - The variable contribution metric used when dropping variables ("PI" or "PC")
#' @param logistic_format    Logical - Should maxentResults.csv report logistic value thresholds?
#' @param response_curves    Logical - Save response curves of the maxent models (T/F)?
#' @param quiet              Logical - Suppress progress messages (T/F)?
#' @export local_simplify
local_simplify = function (occ, bg, path, taxa_column = "taxa", response_curves = TRUE,
                           logistic_format = TRUE,  type = "PI", cor_thr, pct_thr, k_thr,
                           features = "lpq",  replicates = 1, quiet = TRUE)
  
{
  if (!taxa_column %in% colnames(occ))
    stop(taxa_column, " is not a column of `occ`", call. = FALSE)
  if (!taxa_column %in% colnames(bg))
    stop(taxa_column, " is not a column of `bg`", call. = FALSE)
  
  if (missing(path)) {
    save <- FALSE
    path <- tempdir()
  } else {
    save <- TRUE
  }
  features <- unlist(strsplit(gsub("\\s", "", features), ""))
  if (length(setdiff(features, c("l", "p", "q", "h", "t"))) > 0)
    stop("features must be a vector of one or more of 'l', 'p', 'q', 'h', and 't'.")
  off <- setdiff(c("l", "p", "q", "t", "h"), features)
  if (length(off) > 0) {
    off <- c(l = "linear=FALSE", p = "product=FALSE", q = "quadratic=FALSE",
             t = "threshold=FALSE", h = "hinge=FALSE")[off]
    
  }
  off <- unname(off)
  occ_by_taxa <- split(occ, occ[[taxa_column]])
  bg_by_taxa <- split(bg, bg[[taxa_column]])
  if (!identical(sort(names(occ_by_taxa)), sort(names(bg_by_taxa)))) {
    stop("The same set of taxa names must exist in occ and bg")
  }
  
  type <- switch(type, PI = "permutation.importance", PC = "contribution",
                 stop("type must be either \"PI\" or \"PC\".", call. = FALSE))
  args <- off
  
  if (replicates > 1)
    args <- c(args, paste0("replicates=", replicates))
  if (isTRUE(response_curves))
    args <- c(args, "responsecurves=TRUE")
  if (isTRUE(logistic_format))
    args <- c(args, "outputformat=logistic")
  
  f <- function(taxa) {
    
    if (!quiet)
      message("\n\nDoing ", taxa)
    name_ <- gsub(" ", "_", taxa)
    swd <- rbind(occ_by_taxa[[taxa]], bg_by_taxa[[taxa]])
    swd <- swd[, -match(taxa_column, names(swd))]
    
    if (ncol(swd) < k_thr)
      stop("Initial number of variables < k_thr", call. = FALSE)
    pa <- rep(1:0, c(nrow(occ_by_taxa[[taxa]]), nrow(bg_by_taxa[[taxa]])))
    vc <- usdm::vifcor(swd, maxobservations = nrow(swd),
                       th = cor_thr)
    
    vif <- methods::slot(vc, "results")
    k <- nrow(vif)
    exclude <- methods::slot(vc, "excluded")
    
    if (!isTRUE(quiet) & length(exclude) > 0) {
      message("Dropped due to collinearity: ", paste0(exclude,
                                                      collapse = ", "))
    }
    
    if (k < k_thr)
      stop(sprintf("Number of uncorrelated variables (%s) < k_thr (%s). %s",
                   k, k_thr, "Reduce k_thr, increase cor_thr, or find alternative predictors."),
           call. = FALSE)
    
    swd_uncor <- swd[, as.character(vif$Variables)]
    d <- file.path(path, name_, if (replicates > 1) "xval" else "full")
    
    m <- dismo::maxent(swd_uncor, pa, args = args, path = d)
    if (isTRUE(save))
      saveRDS(m, file.path(d, "maxent_fitted.rds"))
    
    pct <- m@results[grep(type, rownames(m@results)), ,
                     drop = FALSE]
    pct <- pct[, ncol(pct)]
    pct <- sort(pct)
    
    names(pct) <- sub(paste0("\\.", type), "", names(pct))
    
    if (min(pct) >= pct_thr || length(pct) == k_thr) {
      
      if (replicates > 1) {
        d <- file.path(path, name_, "full")
        m <- dismo::maxent(swd_uncor, pa, args = grep("replicates",
                                                      args, value = TRUE,
                                                      invert = TRUE), path = d)
      }
      
      if (isTRUE(save)) {
        saveRDS(m, file.path(d, "maxent_fitted.rds"))
      }
      return(m)
    }
    
    while (min(pct) < pct_thr && length(pct) > k_thr) {
      ## pct is sorted ascending, so pct[1] is the smallest contribution;
      ## among tied variables, drop the one with the highest VIF
      candidates <- vif[vif$Variables %in% names(pct)[pct == pct[1]], ]
      drop <- as.character(candidates$Variables[which.max(candidates$VIF)])
      message("Dropping ", drop)
      
      swd_uncor <- swd_uncor[, -match(drop, colnames(swd_uncor))]
      if (!quiet)
        message(sprintf("%s variables: %s", ncol(swd_uncor),
                        paste0(colnames(swd_uncor), collapse = ", ")))
      m <- dismo::maxent(swd_uncor, pa, args = args, path = d)
      pct <- m@results[grep(type, rownames(m@results)),
                       , drop = FALSE]
      
      pct <- pct[, ncol(pct)]
      pct <- sort(pct)
      names(pct) <- sub(paste0("\\.", type), "", names(pct))
    }
    
    if (replicates > 1) {
      d <- file.path(path, name_, "full")
      m <- dismo::maxent(swd_uncor, pa, args = grep("replicates",
                                                    args, value = TRUE,
                                                    invert = TRUE), path = d)
    }
    
    gc()
    
    if (isTRUE(save)) {
      saveRDS(m, file.path(d, "maxent_fitted.rds"))
    }
    return(m)
  }
  lapply(names(occ_by_taxa), f)
}
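

## Example (illustrative only): `occ_swd` and `bg_swd` below are hypothetical
## data frames of sampled predictor values with a 'taxa' column, as produced
## by the fitting functions above. A minimal sketch of a call:
##
## simplified <- local_simplify(occ        = occ_swd,
##                              bg         = bg_swd,
##                              path       = 'output/maxent/back_sel_models',
##                              cor_thr    = 0.8,
##                              pct_thr    = 5,
##                              k_thr      = 4,
##                              features   = 'lpq',
##                              replicates = 5,
##                              quiet      = FALSE)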




#' @title Variable importance
#' @description This function extracts the percent contribution and permutation
#' importance of each predictor from a fitted maxent model.
#' @param mod   MaxEnt object - fitted maxent model, e.g. as returned by dismo::maxent
#' @export var_importance
var_importance <- function(mod) {
  res <- mod@results
  ## Rows of the maxent results matrix ending in '.contribution' and
  ## '.permutation.importance' hold the per-variable metrics
  pc <- res[grepl('contribution', rownames(res)), ]
  pi <- res[grepl('permutation', rownames(res)), ]
  varnames <- sub('\\.contribution$', '', names(pc))
  df <- data.frame(variable               = varnames,
                   percent.contribution   = pc,
                   permutation.importance = pi,
                   row.names              = NULL)
  return(df)
}
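

## Example (illustrative only): `me_full` is assumed to be a model fitted by
## dismo::maxent, e.g. as saved in 'full/maxent_fitted.rds' above.
##
## m.vars <- var_importance(me_full)
## head(m.vars[order(m.vars$percent.contribution, decreasing = TRUE), ])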





#' @title Compile SDM results from 'species with data' format.
#' @description This function extracts the SDM results from the folders.
#' @param taxa_list         Character string - the taxa to compile maxent results for
#' @param results_dir       Character string - The file path used for saving the maxent output
#' @param save_data         Logical or character - do you want to save the data frame?
#' @param data_path         Character string - The file path used for saving the data frame
#' @param sdm_path          Character string - The file path where the maxent models are stored.
#' @param save_run          Character string - run name to append to the data frame, useful for multiple runs.
#' @export compile_sdm_results
compile_sdm_results = function(taxa_list,
                               results_dir,
                               save_data,
                               data_path,
                               sdm_path,
                               save_run) {
  
  ## The code that adds niche info is now in './R/9_COLLATE_MAXENT_RESULTS.R'
  message('Creating summary stats for ', length(taxa_list),
          ' taxa in the set ', "'", save_run, "'")
  
  ##
  processed = list.dirs(sdm_path, full.names = FALSE, recursive = FALSE)
  
  ## First, make a list of all the taxa with models, then restrict them
  ## to just the models on the taxa_list list
  map_spp_list  = gsub(" ", "_", taxa_list)
  map_spp_list  = processed[processed %in% map_spp_list ]
  map_spp_patt  = paste0(map_spp_list, collapse = "|")
  
  message ("map_spp_list head:")
  message (paste(head(map_spp_list),   collapse = ","))
  
  ## Build the expected file paths directly, rather than listing every
  ## completed maxent file on disk - listing takes a long time
  message('Compile SDM results for taxa in ', results_dir)
  maxent.tables = file.path(results_dir, map_spp_list, "full/maxent_fitted.rds")
  
  ## How many taxa have been modelled?
  message(paste("maxent.tables has this many entries:", length(maxent.tables)))
  message(paste(head (maxent.tables), collapse = ","))
  sdm.exists = file.exists(maxent.tables)
  
  ## Only keep the intersection between the modelled taxa and the taxa list
  message(paste(head(sdm.exists), collapse=","))
  maxent.tables = maxent.tables[sdm.exists]
  
  ## Report what data the table has
  message (paste ("maxent.tables has this many entries:", length(maxent.tables)))
  maxent.tables = stringr::str_subset(maxent.tables, map_spp_patt)
  message (paste ("maxent.tables has this many entries:", length(maxent.tables)))
  message (paste (head(maxent.tables), collapse = ","))
  
  ## Now create a table of the results
  ## x = maxent.tables[1]
  MAXENT.RESULTS <- maxent.tables %>%
    
    ## Pipe the list into lapply
    lapply(function(x) {
      
      ## Taxa without fitted models were already filtered out above
      x = gsub(paste0(results_dir, "/"), "", x)
      message(x)
      
      ## load the backwards selected model
      if (grepl("back", results_dir)) {
        m = readRDS(paste0(results_dir, '/',  x))
        
      } else {
        ## Get the background records from any source
        m = readRDS(paste0(results_dir, '/',  x))$me_full
      }
      
      ## Get the number of Variables
      number.var  = length(m@lambdas) - 4   ## (the last 4 slots of the lambdas file are not variables)
      mxt.records = nrow(m@presence)
      
      ## Get variable importance
      m.vars    = var_importance(m)
      var.pcont = m.vars[order(m.vars[["percent.contribution"]], decreasing = TRUE), ][["variable"]][1:4]
      pcont     = m.vars[order(m.vars[["percent.contribution"]], decreasing = TRUE), ][["percent.contribution"]][1:4]
      Var_pcont = paste(var.pcont, pcont, sep = ' = ')
      
      var.pimp  = m.vars[order(m.vars[["permutation.importance"]], decreasing = TRUE), ][["variable"]][1]
      pimp      = m.vars[order(m.vars[["permutation.importance"]], decreasing = TRUE), ][["permutation.importance"]][1]
      
      ## TODO: create a list of the variables which together explain at least 75% of variation
      
      ## Get maxent results columns to be used for model checking
      ## Including the omission rate here
      Training_AUC             = m@results["Training.AUC",]
      Number_background_points = m@results["X.Background.points",]
      Logistic_threshold       = m@results["X10.percentile.training.presence.Logistic.threshold",]
      Omission_rate            = m@results["X10.percentile.training.presence.training.omission",]
      
      ## Now rename the maxent columns that we will use in the results table
      d = data.frame(searchTaxon              = x,
                     Maxent_records           = mxt.records,
                     Number_var               = number.var,
                     
                     Var_pcont_1              = Var_pcont[1],
                     Var_pcont_2              = Var_pcont[2],
                     Var_pcont_3              = Var_pcont[3],
                     Var_pcont_4              = Var_pcont[4],
                     
                     Var_pimp                 = var.pimp[[1]],
                     Perm_imp                 = pimp[[1]],
                     Training_AUC,
                     Number_background_points,
                     Logistic_threshold,
                     Omission_rate)
      
      ## Remove path gunk, and taxa
      d$Species     = NULL
      d$searchTaxon = gsub("/full/maxent_fitted.rds", "", d$searchTaxon)
      row.names(d)  <- NULL
      return(d)
      
    }) %>%
    
    ## Finally, bind all the rows together
    bind_rows
  
  ## Now create a list of the '10th percentile training presence Logistic threshold'.
  ## This is used in step 8 to threshold
  ## the maps to just areas above the threshold.
  message ("MAXENT.RESULTS columns")
  message (paste (colnames (MAXENT.RESULTS)))
  message (paste (nrow (MAXENT.RESULTS)))
  summary(MAXENT.RESULTS["Logistic_threshold"])
  percent.10.log <- MAXENT.RESULTS[["Logistic_threshold"]]
  
  ## Create a list of the omission files - again, don't do this for all the files, just the intersection
  omission.tables = file.path(results_dir, map_spp_list, "full/species_omission.csv")
  message (head (omission.tables))
  
  ## Only process the existing files
  om.exists       = file.exists(omission.tables)
  omission.tables = omission.tables[om.exists]
  message(head(omission.tables))
  
  ## Get the maximum TSS value using the omission data : use _training_ omission data only
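  ## TSS = sensitivity + specificity - 1. At each candidate threshold, the
  ## training omission rate approximates (1 - sensitivity) and the fractional
  ## predicted area approximates (1 - specificity), so the maximum TSS is
  ## 1 - min(omission + fractional area)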
  Max_tss <- sapply(omission.tables, function(file) {
    
    ## For each taxa, read in the training omission data
    d <- read.csv(file)
    i <- which.min(d$Training.omission + d$Fractional.area)
    
    c(Max_tss = 1 - min(d$Training.omission + d$Fractional.area),
      thr     = d$Corresponding.logistic.value[i])
    
  })
  
  ## Transpose so taxa are rows (sapply returns a 2 x n matrix), then bind
  ## the TSS results onto the maxent results for the taxa analysed
  Max_tss        = as.data.frame(t(Max_tss))
  MAXENT.RESULTS = cbind(MAXENT.RESULTS, Max_tss)
  summary(MAXENT.RESULTS$Max_tss)
  summary(MAXENT.RESULTS$Omission_rate)
  
  ## This is a summary of maxent output for current conditions
  ## All taxa should have AUC > 0.7
  dim(MAXENT.RESULTS)
  head(MAXENT.RESULTS, 20)[1:5]
  
  ## Now check the match between the taxa list, and the results list.
  length(intersect(map_spp_list, MAXENT.RESULTS$searchTaxon))
  MAXENT.RESULTS  =  MAXENT.RESULTS[MAXENT.RESULTS$searchTaxon %in% map_spp_list , ]
  map_spp         = unique(MAXENT.RESULTS$searchTaxon)
  length(map_spp);setdiff(sort(map_spp_list), sort(map_spp))
  
  ## Then make a list of all the directories containing the individual GCM rasters.
  ## This is used for combining the rasters
  SDM.RESULTS.DIR <- sprintf('%s/%s/full/', results_dir, map_spp)
  length(SDM.RESULTS.DIR)
  
  ## Change the taxa column
  MAXENT.RESULTS$searchTaxon = gsub("_", " ", MAXENT.RESULTS$searchTaxon)
  MAXENT.RESULTS$results_dir = SDM.RESULTS.DIR
  
  ##
  if(save_data == TRUE) {
    
    ## If saving, save
    saveRDS(MAXENT.RESULTS,   paste0(data_path, 'MAXENT_RESULTS_', save_run, '.rds'))
    write_csv(MAXENT.RESULTS, paste0(data_path, 'MAXENT_RESULTS_', save_run, '.csv'))
    return(MAXENT.RESULTS)
    
  } else {
    ## Otherwise return the table without writing to disk
    message('Skipping file saving - not many taxa analysed')
    return(MAXENT.RESULTS)
  }
  
}
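

## Example (illustrative only): the paths and run name below are hypothetical.
## A minimal sketch of compiling results for a set of taxa:
##
## MAXENT.RESULTS <- compile_sdm_results(taxa_list   = target_taxa,
##                                       results_dir = 'output/maxent/full_models',
##                                       save_data   = TRUE,
##                                       data_path   = 'output/results/',
##                                       sdm_path    = 'output/maxent/full_models',
##                                       save_run    = 'TEST_RUN')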





#########################################################################################################################
####################################################  TBC  ##############################################################
#########################################################################################################################