R/blocking.R

#' Use distance (buffer) around records to separate train and test folds
#'
#' This function generates spatially separated train and test folds by considering buffers of specified distance around each observation point.
#' This approach is a form of \emph{leave-one-out} cross-validation. Each fold is generated by excluding nearby observations around each testing
#' point within the specified distance (ideally the range of spatial autocorrelation, see \code{\link{spatialAutoRange}}). In this method the test set never
#' directly abuts a training presence or absence. For more information see the details section.
#'
#' When working with  presence-background (presence and pseudo-absence) data (specified by \code{spDataType} argument), only presence records  are used
#' for specifying the folds. Consider a target presence point. The buffer is defined around this target point, using the specified range (\code{theRange}).
#' The testing fold comprises the target presence point and all background points within the buffer (this is the default. If \code{addBG = FALSE} the bacground
#' points are ignored). Any non-target presence points inside the buffer are  excluded. All points (presence and background) outside of buffer
#' are used for training set. The methods cycles through all the \emph{presence} data, so the number of folds is equal to the number of presence points in the dataset.
#'
#' For presence-absence data, folds are created based on all records, both presences and absences. As above, a target observation (presence or absence) forms a
#' test point, all presence and absence points other than the target point within the buffer are ignored, and the training set comprises all presences and
#' absences outside the buffer. Apart from the folds, the number of \emph{training-presence}, \emph{training-absence}, \emph{testing-presence} and \emph{testing-absence}
#' records is stored and returned in the \code{records} table. If \code{species = NULL} (no column with 0s and 1s are defined), the procedure is like presence-absence data.
#'
#'
#' @param speciesData A SpatialPointsDataFrame, SpatialPoints or sf object containing species data.
#' @param species Character. Indicating the name of the field in which species presence/absence data (0s and 1s) are stored. If \code{speceis = NULL}
#' the presence and absence data will be treated the same and only training and testing records will be counted.
#' @param theRange Numeric value of the specified range by which the training and testing datasets are separated (See \code{\link{spatialAutoRange}}).
#' This distance should be in \strong{\emph{metres}}. The range can  be explored by \code{spatialAutoRange}.
#' @param spDataType Character input indicating the type of data. It can take two values, \strong{PA} for \emph{presence-absence} data and \strong{PB} for
#' \emph{presence-background} data, when \code{species} is not \code{NULL}. See the details section for more information on these two approaches.
#' @param addBG Logical. Add background points to the test set when \code{spDataType = "PB"}.
#' @param progress Logical. If TRUE a progress bar will be shown.
#'
#' @seealso \code{\link{spatialAutoRange}} for selecting buffer distance; \code{\link{spatialBlock}} and \code{\link{envBlock}} for
#' alternative blocking strategies; \code{\link{foldExplorer}} for visualisation of the generated folds.
#'
#' @return An object of class S3. A list of objects including:
#'     \itemize{
#'     \item{folds - a list containing the folds. Each fold has two vectors with the training (first) and testing (second) indices}
#'     \item{k - number of the folds}
#'     \item{range - the distance band to separated trainig and testing folds)}
#'     \item{species - the name of the species (column), if provided}
#'     \item{dataType - species data type}
#'     \item{records - a table with the number of points in each category of training and testing}
#'     }
#'
#' @export
#'
#' @examples
#' \dontrun{
#'
#' # import presence-absence species data
#' PA <- read.csv(system.file("extdata", "PA.csv", package = "blockCV"))
#' # coordinate reference system
#' Zone55s <- "+proj=utm +zone=55 +south +ellps=GRS80 +units=m +no_defs"
#' # make a SpatialPointsDataFrame object from data.frame
#' pa_data <- sp::SpatialPointsDataFrame(PA[,c("x", "y")], PA, proj4string=CRS(Zone55s))
#'
#' # buffering with presence-absence data
#' bf1 <- buffering(speciesData= pa_data,
#'                  species= "Species", # to count the number of presences and absences
#'                  theRange= 68000,
#'                  spDataType = "PA",
#'                  progress = T)
#'
#'
#' # import presence-background species data
#' PB <- read.csv(system.file("extdata", "PB.csv", package = "blockCV"))
#' # make a SpatialPointsDataFrame object from data.frame
#' pb_data <- sp::SpatialPointsDataFrame(PB[,c("x", "y")], PB, proj4string=CRS(Zone55s))
#'
#' # buffering with presence-background data
#' bf2 <- buffering(speciesData= pb_data,
#'                  species= "Species",
#'                  theRange= 68000,
#'                  spDataType = "PB",
#'                  addBG = TRUE, # add background data to testing folds
#'                  progress = T)
#'
#' # buffering with no species attribute
#' bf3 <- buffering(speciesData= pa_data,
#'                  theRange= 63300)
#'
#' }
buffering <- function(speciesData, species=NULL, theRange, spDataType="PA", addBG=TRUE, progress=TRUE){
  if((methods::is(speciesData, "SpatialPoints") || methods::is(speciesData, "sf"))==FALSE){stop("speciesData should be SpatialPointsDataFrame, SpatialPoints or sf object")}
  if(methods::is(speciesData, "sf")){
    sfobj <- speciesData
  } else{
    sfobj <- sf::st_as_sf(speciesData)
  }
  speciesData$ID <- 1:length(speciesData)
  if(is.null(sf::st_crs(sfobj))){
    stop("The coordinate reference system of species data should be defined")
  } else if(sp::is.projected(speciesData)){ # this is due to a recent change (Jan 2018) in the sf package that doesn't recognize the projected crs units. It will be fixed soon.
    sf::st_crs(sfobj) <- NA
  }
  dmatrix <- sf::st_distance(sfobj)
  distuni <- dmatrix[1,1] # take the unit to avoid using units package
  distuni[1] <- theRange
  foldList <- list()
  if(!is.null(species)){
    presences <- speciesData[speciesData@data[,species]==1,] # creating a layer of presence data
    if(spDataType=="PB"){
      prI <- which(speciesData@data[,species] == 1) # presence indices to loop through
      n <- length(prI)
      trainTestTable <- base::data.frame(trainPr=rep(0, n), trainAb=0, testPr=0, testAb=0)
      if(progress==TRUE){
        pb <- progress::progress_bar$new(format = " Progress [:bar] :percent in :elapsed",
                                         total=n, clear=FALSE, width=75) # add progress bar
      }
      j <- 0
      for(i in prI){ # loop through presences
        j <- j + 1
        trainSet <- which(dmatrix[i, ] > distuni)
        if(addBG==TRUE){
          test <- which(dmatrix[i, ] <= distuni)
          inside <- speciesData[test, ]
          testSet <- inside[inside@data[,species]==0,]$ID # creating IDs of bg data
          testSet[length(testSet) + 1] <- i
        } else{
          testSet <- i
        }
        foldList[[j]] <- assign(paste0("fold", j), list(trainSet, testSet))
        lnPrsences <- length(presences)
        lnBackground <- length(speciesData) - lnPrsences
        trainPoints <- speciesData[trainSet, ]
        trainTestTable$trainPr[j] <- length(trainPoints[trainPoints@data[,species]==1,])
        trainTestTable$trainAb[j] <- length(trainPoints[trainPoints@data[,species]!=1,])
        trainTestTable$testPr[j] <- 1
        if(addBG==TRUE){
          trainTestTable$testAb[j] <- lnBackground - trainTestTable$trainAb[j]
        } else{
          trainTestTable$testAb[j] <- 0
        }
        if(progress==TRUE){
          pb$tick() # update progress bar
        }
      }
    } else if(spDataType=="PA"){
      n <- length(speciesData)
      trainTestTable <- base::data.frame(trainPr=rep(0, n), trainAb=0, testPr=0, testAb=0)
      if(progress==TRUE){
        pb <- progress::progress_bar$new(format = " Progress [:bar] :percent in :elapsed",
                                         total=n, clear=FALSE, width=75) # add progress bar
      }
      for(i in 1:n){
        trainSet <- which(dmatrix[i, ] > distuni)
        testSet <- i
        foldList[[i]] <- assign(paste0("fold", i), list(trainSet, testSet))
        lnPrsences <- length(presences)
        # lnAbsence <- length(speciesData) - lnPrsences
        trainPoints <- speciesData[trainSet, ]
        trainTestTable$trainPr[i] <- length(trainPoints[trainPoints@data[,species]==1,])
        trainTestTable$trainAb[i] <- length(trainPoints[trainPoints@data[,species]!=1,])
        if(speciesData@data[testSet, species] == 1){
          trainTestTable$testPr[i] <- 1
          trainTestTable$testAb[i] <- 0
        } else{
          trainTestTable$testPr[i] <- 0
          trainTestTable$testAb[i] <- 1
        }
        if(progress==TRUE){
          pb$tick() # update progress bar
        }
      }
    }
  } else{ # data with no species column
    n <- length(speciesData)
    trainTestTable <- base::data.frame(train=rep(0, n), test=0)
    if(progress==TRUE){
      pb <- progress::progress_bar$new(format = " Progress [:bar] :percent in :elapsed",
                                       total=n, clear=FALSE, width=75) # add progress bar
    }
    for(i in 1:n){
      trainSet <- which(dmatrix[i, ] > distuni)
      testSet <- i
      foldList[[i]] <- assign(paste0("fold", i), list(trainSet, testSet))
      trainTestTable$train[i] <- length(trainSet)
      trainTestTable$test[i] <- length(testSet)
      if(progress==TRUE){
        pb$tick() # update progress bar
      }
    }
  }
  theList <- list(folds=foldList, k=n, species=species, range=theRange, dataType=spDataType, records=trainTestTable)
  class(theList) <- c("BufferedBlock")
  return(theList)
}


#' @export
print.BufferedBlock <- function(x, ...){
  print(class(x))
}

#' @export
summary.BufferedBlock <- function(object, ...){
  print("Number of recoreds in each category")
  print(object$records)
}



systematicNum <- function(layer, num=5){
  n <- nrow(layer)
  if(n %% num == 0){
    a <- n/num
    c <- rep(1:num, a)
  } else {
    a <- floor(n/num)
    b <- n %% num
    c <- c(rep(1:num, a), 1:b)
  }
  return(c)
}

#' Use spatial blocks to separate train and test folds
#'
#' This function creates spatially separated folds based on a pre-specified distance. It assigns blocks to the training and
#' testing folds randomly, systematically or in a checkerboard pattern. The distance (\code{theRange}) should be in \strong{metres},
#' regardless of the unit of the reference system of the input data (for more information see the details section). By default,
#' the function creates blocks according to the extent and shape of the study area, assuming that the user has considered the
#' landscape for the given species and case study.
#' Alternatively, blocks can be created based on species spatial data. This is especially useful when the
#' species data is not evenly dispersed in the whole region. Blocks can also be offset so the origin is not at the outer
#' corner of the rasters. Instead of providing a distance, the blocks can also be created by specifying a number of rows and
#' columns and divide the study area into vertical or horizontal bins, as presented in Wenger & Olden (2012) and Bahn & McGill (2012).
#' Finally, the blocks can be specified by a user-defined spatial polygon layer.
#'
#'
#' To keep the consistency, all the functions use \strong{metres} as their unit. In this function, when the input map
#' has geographic coordinate system (decimal degrees), the block size is calculated based on deviding \code{theRange} by
#' 111325 (the standard distance of a degree in metres, on the Equator) to change the unit to degree. This value is optional
#' and can be changed by user via \code{degMetre} argument.
#'
#' The \code{xOffset} and \code{yOffset} can be used to change the spatial position of the blocks. It can also be used to
#' assess the sensitivity of analysis results to shifting in the blocking arrangements. These options are available when \code{theRange}
#' is defined. By default the region is located in the middle of the blocks and by setting the offsets, the blocks will shift.
#'
#' Roberts et. al. (2017) suggest that blocks should be substantially bigger than the range of spatial
#' autocorrelation (in model residual) to obtain realistic error estimates, while a buffer with the size of
#' the spatial autocorrelation range would result in a good estimation of error. This is because of the so-called
#' edge effect (O'Sullivan & Unwin, 2014), whereby points located on the edges of the blocks of opposite sets are
#' not separated spatially. Blocking with a buffering strategy overcomes this issue (see \code{\link{buffering}}).
#'
#'
#' @inheritParams buffering
#' @param blocks A SpatialPolygons* or sf object to be used as the blocks. This can be a user defined polygon and it must cover all
#' the species points.
#' @param theRange Numeric value of the specified range by which blocks are created and training/testing data are separated.
#' This distance should be in \strong{metres}. The range could be explored by \code{spatialAutoRange()} and \code{rangeExplorer()} functions.
#' @param k Integer value. The number of desired folds for cross-validation. The default is \code{k = 5}.
#' @param selection Type of assignment of blocks into  folds. Can be \strong{random} (default), \strong{systematic} or \strong{checkerboard} pattern (not working with user-defined blocks).
#' @param iteration Integer value. The number of attempts to create folds that fulfil the set requirement for minimum number
#' of points in each category (\emph{training-presence}, \emph{training-absence}, \emph{testing-presence}
#' and \emph{testing-absence}), as specified by \code{numLimit} value.
#' @param numLimit Integer value. The minimum number of points in each category of data (see above - \code{iterration}).
#' If \code{numLimit = NULL}, the most evenly dispersed number of records is chosen (given the number of iteration).
#' @param maskBySpecies Logical. If raster layer is provided and \code{maskBySpecies = TRUE}, the blocks will be
#' created based on the raster extent, but only those blocks covering species data is kept. The default is \code{TRUE}.
#' @param degMetre Integer. The conversion rate of metres to degree. See the details section for more information.
#' @param rasterLayer RasterLayer for visualisation. If provided, this will be used to specify the blocks covering the area.
#' @param border SpatialPolygons* or sf object to clip the block based on a border. This might increase the computation time.
#' @param showBlocks Logical. If TRUE the final blocks with fold numbers will be plotted. A raster layer could be specified
#' in \code{rasterlayer} argument to be as background.
#' @param biomod2Format Logical. Creates a matrix of folds that can be directly used in the \pkg{biomod2} package as
#' a \emph{DataSplitTable} for cross-validation.
#' @param progress Logical. If TRUE shows a progress bar when \code{numLimit = NULL} in random fold selection.
#' @param rows Integer value by which the area is divided into latitudinal bins.
#' @param cols Integer value by which the area is divided into longitudinal bins.
#' @param xOffset Numeric value between \strong{0} and \strong{1} for shifting the blocks horizontally.
#' The value is the proportion of block size.
#' @param yOffset Numeric value between \strong{0} and \strong{1} for shifting the blocks vertically. The value is the proportion of block size.
#'
#' @seealso \code{\link{spatialAutoRange}} and \code{\link{rangeExplorer}} for selecting block size; \code{\link{buffering}}
#' and \code{\link{envBlock}} for alternative blocking strategies; \code{\link{foldExplorer}} for visualisation of the generated folds.
#' @seealso For \emph{DataSplitTable} see \code{\link[biomod2]{BIOMOD_cv}} in \pkg{biomod2} package
#'
#' @references Bahn, V., & McGill, B. J. (2012). Testing the predictive performance of distribution models. Oikos, 122(3), 321–331.
#'
#' O’Sullivan, D., Unwin, D.J., 2010. Geographic Information Analysis, 2nd ed. John Wiley & Sons.
#'
#' Roberts et al., 2017. Cross-validation strategies for data with temporal, spatial, hierarchical,
#' or phylogenetic structure. Ecography. 40: 913-929.
#'
#' Wenger, S.J., Olden, J.D., 2012. Assessing transferability of ecological models: an underappreciated aspect of statistical
#' validation. Methods Ecol. Evol. 3, 260–267.
#'
#' @return An object of class S3. A list of objects including:
#'    \itemize{
#'     \item{folds - a list containing the folds. Each fold has two vectors with the training (first) and testing (second) indices}
#'     \item{foldID - a vector of values indicating the number of the fold for each observation (each number corresponds to the same point in species data)}
#'     \item{biomodTable - a matrix with the folds to be used in \pkg{biomod2} package}
#'     \item{k - number of the folds}
#'     \item{blocks - SpatialPolygon of the blocks}
#'     \item{range - the distance band of separating trainig and testing folds, if provided}
#'     \item{species - the name of the species (column), if provided}
#'     \item{plots - ggplot object}
#'     \item{records - a table with the number of points in each category of training and testing}
#'     }
#' @export
#'
#' @examples
#' \dontrun{
#'
#' # load package data
#' awt <- raster::brick(system.file("extdata", "awt.grd", package = "blockCV"))
#' # import presence-absence species data
#' PA <- read.csv(system.file("extdata", "PA.csv", package = "blockCV"))
#' # make a SpatialPointsDataFrame object from data.frame
#' pa_data <- sp::SpatialPointsDataFrame(PA[,c("x", "y")], PA, proj4string=raster::crs(awt))
#'
#' # spatial blocking by specified range and random assignment
#' sb1 <- spatialBlock(speciesData = pa_data,
#'                     species = "Species",
#'                     theRange = 68000,
#'                     k = 5,
#'                     selection = 'random',
#'                     iteration = 250,
#'                     numLimit = NULL,
#'                     biomod2Format = TRUE,
#'                     xOffset = 0.3, # shift the blocks horizontally
#'                     yOffset = 0)
#'
#' # spatial blocking by row/column and systematic fold assignment
#' sb2 <- spatialBlock(speciesData = pa_data,
#'                     species = "Species",
#'                     rasterLayer = awt,
#'                     rows = 5,
#'                     cols = 8,
#'                     k = 5,
#'                     selection = 'systematic',
#'                     maskBySpecies = TRUE,
#'                     biomod2Format = TRUE)
#'
#' }
spatialBlock <- function(speciesData, species=NULL, blocks=NULL, rasterLayer=NULL, theRange=NULL, rows=NULL, cols=NULL, k=5,
                         selection='random', iteration=250, numLimit=NULL, maskBySpecies=TRUE, degMetre=111325, border=NULL,
                         showBlocks=TRUE, biomod2Format=TRUE, xOffset=0, yOffset=0, progress=TRUE){
  if(selection != 'systematic' && selection != 'random' && selection != 'checkerboard'){stop("The selection argument must be 'systematic', 'random' or 'checkerboard'")}
  chpattern <- FALSE
  if(selection == 'checkerboard'){
    chpattern <- TRUE
    if(k != 2){
      k <- 2
      message("k has been set to 2 because of checkerboard fold selection")
    }
  }
  if(methods::is(speciesData, "sf")){
    speciesData <- sf::as_Spatial(speciesData)
  }
  if(methods::is(speciesData, "SpatialPoints") && !methods::is(speciesData, "SpatialPointsDataFrame")){
    speciesData <- sp::SpatialPointsDataFrame(speciesData, data.frame(ID=1:length(speciesData)))
  }
  if(methods::is(border, "sf")){
    border <- sf::as_Spatial(border)
  }
  if(is.null(blocks)){
    if(is.null(rasterLayer)){
      net <- rasterNet(speciesData, resolution=theRange, xbin=cols, ybin=rows, degree=degMetre, xOffset=xOffset, yOffset=yOffset, checkerboard=chpattern)
      if(is.null(border)){
        subBlocks <- raster::intersect(net, speciesData)
      } else{
        subBlocks <- crop(net, border)
      }
    } else{
      if(is.null(border)){
        if(maskBySpecies==FALSE){
          subBlocks <- rasterNet(rasterLayer[[1]], mask=TRUE, resolution=theRange, xbin=cols, ybin=rows, degree=degMetre, xOffset=xOffset, yOffset=yOffset, checkerboard=chpattern)
        } else{
          net <- rasterNet(rasterLayer[[1]], resolution=theRange, xbin=cols, ybin=rows, degree=degMetre, xOffset=xOffset, yOffset=yOffset, checkerboard=chpattern)
          subBlocks <- raster::intersect(net, speciesData)
        }
      } else{ # having a border
        net <- rasterNet(rasterLayer[[1]], resolution=theRange, xbin=cols, ybin=rows, degree=degMetre, xOffset=xOffset, yOffset=yOffset, checkerboard=chpattern)
        subBlocks <- crop(net, border)
      }
    }
  } else{
    if(methods::is(blocks, "SpatialPolygonsDataFrame")){
      subBlocks <- raster::intersect(blocks, speciesData)
    } else if(methods::is(blocks, "SpatialPolygons")){
      df <- data.frame(ID=1:length(blocks))
      blocks <- sp::SpatialPolygonsDataFrame(blocks, df, match.ID=FALSE)
      subBlocks <- raster::intersect(blocks, speciesData)
      if(!is.null(species)){
        species <- NULL
        message("The species argument has been set to NULL since there is no table associated with SpatialPoints object")
      }
    } else if(methods::is(blocks, "sf")){
      blocks <- sf::as_Spatial(blocks)
      subBlocks <- raster::intersect(blocks, speciesData)
    } else{
      stop("The input blocks should be a SpatialPolygons* object")
    }
    if(selection == 'checkerboard'){
      selection <- 'systematic'
      message("'checkerboard' fold selection does not work with user defined blocks. 'systematic' will be used instead.")
    }
  }
  # given a wrong iteration, ensure the process will be run at least once
  if(iteration < 1 || is.numeric(iteration)==FALSE){
    iteration=1
    message("The interation has been set to 1! \n", "Iteration must be a numeric value, equal or higher than one.")
  } else if(is.numeric(iteration) && iteration >= 10000){
    message("Due to the large number of iterations, the process might take a while")
  }
  if(progress==TRUE && is.null(numLimit)){
    pb <- progress::progress_bar$new(format = " Progress [:bar] :percent in :elapsed",
                                     total=iteration, clear=FALSE, width=75) # add progress bar
  }
  maxNumRecord <- 0
  maxSD <- Inf
  for(i in 1:iteration){
    nrowBlocks <- length(subBlocks)
    if(k > nrowBlocks){
      stop("'k' is bigger than the number of the blocks")
    } else if(k < 2){stop("'k' must be 2 or higher")}
    if(selection=='systematic'){
      subBlocks$folds <- systematicNum(subBlocks, k)
    } else if(selection=='checkerboard'){
      subBlocks$folds <- subBlocks$layer
    } else if(selection=='random'){
      # create random folds
      subBlocks$folds <- 0
      nrowBlocks <- length(subBlocks)
      if(k==2){
        subBlocks$folds[sample(nrowBlocks, nrowBlocks/2, replace=FALSE)] <- 1
        unfold <- which(subBlocks$folds==0)
        subBlocks$folds[unfold] <- 2
      } else {
        num <- floor(nrowBlocks / k)
        for(kk in 1:num){ # while
          unfold <- which(subBlocks$folds==0)
          subBlocks$folds[sample(unfold, k, replace=FALSE)]=1:k
        }
        if(nrowBlocks %% k != 0){
          rest <- nrowBlocks %% k
          unfold <- which(subBlocks$folds==0)
          subBlocks$folds[unfold]=1:rest
        }
      }
    }
    if(!is.null(species)){
      presences <- speciesData[speciesData@data[,species]==1,] # creating a layer of presence data
      trainTestTable <- base::data.frame(trainPr=rep(0, k), trainAb=0, testPr=0, testAb=0)
    } else{
      trainTestTable <- base::data.frame(train=rep(0, k), test=0)
    }
    foldList <- list()
    foldNum <- rep(NA, nrow(speciesData))
    biomodTable <- data.frame(RUN1=rep(TRUE, length(speciesData)))
    for(p in 1:k){
      sp.over <- sp::over(speciesData, subBlocks[subBlocks$folds==p, ]) # overlay layers to specify the inside & oudside points
      trainSet <- which(is.na(sp.over[,1])) # exclude all the data from the bufer area
      testSet <- which(!is.na(sp.over[,1]))
      foldNum[testSet] <- p
      foldList[[p]] <- assign(paste0("fold", p), list(trainSet, testSet))
      if(!is.null(species)){
        lnPrsences <- length(presences)
        lnAbsences <- length(speciesData) - lnPrsences
        trainPoints <- speciesData[which(is.na(sp.over[,1])),]
        trainTestTable$trainPr[p] <- length(trainPoints[trainPoints@data[,species]==1,])
        trainTestTable$trainAb[p] <- length(trainPoints[trainPoints@data[,species]!=1,])
        trainTestTable$testPr[p] <- lnPrsences - trainTestTable$trainPr[p]
        trainTestTable$testAb[p] <- lnAbsences - trainTestTable$trainAb[p]
      } else{
        trainTestTable$train[p] <- length(trainSet)
        trainTestTable$test[p] <- length(testSet)
      }
      if(biomod2Format==TRUE){ # creating a biomod2 DataSplitTable for validation
        colm <- paste0("RUN", p)
        biomodTable[,colm] <- FALSE
        biomodTable[,colm][trainSet] <- TRUE # biomodTable[trainSet,colm] <- TRUE # #### Change this!
      }
    }
    if(selection=='systematic' || selection == 'checkerboard'){
      # if(iteration > 1){message('Iteration has been set to 1 due to systematic selection')}
      break
    } else{ # do the rest if the selection is random
      if(is.numeric(numLimit) && numLimit >= 0){
        if(any(trainTestTable < numLimit)==FALSE){ # exit the loop if meet the limit number
          break
        }
      } else if(is.null(numLimit)){ # find the highest minimum number in the table and store relevant objects
        if(min(trainTestTable) >= maxNumRecord && stats::sd(unlist(trainTestTable)) < maxSD){
          trainTestTable2 <- trainTestTable
          maxNumRecord <- min(trainTestTable2)
          maxSD <- stats::sd(unlist(trainTestTable))
          subBlocks2 <- subBlocks
          foldList2 <- foldList
          foldNum2 <- foldNum
          biomodTable2 <- biomodTable
          iter <- i
        }
      } else (stop("numLimit argument should be a numeric value equal or hagher than 0 or be NULL"))
      if(progress==TRUE && is.null(numLimit)){
        pb$tick() # update progress bar
      }
    }
  }
  if(is.null(numLimit) && selection=='random'){ # return the optimum bloks, table etc.
    subBlocks <- subBlocks2
    trainTestTable <- trainTestTable2
    foldList <- foldList2
    foldNum <- foldNum2
    biomodTable <- biomodTable2
    print(paste0("The best fold was in iteration ", iter, ":"))
    # print(trainTestTable)
  }
  if(!is.null(numLimit) && any(trainTestTable < numLimit)==TRUE){ # show a message if criteria is not met
    if(numLimit==0){
      message(paste("There are one or more folds with", numLimit, "records"))
    } else{
      message(paste("There are one or more folds with less than", numLimit, "records"))
    }
  }
  print(trainTestTable)
  centroids <- as.data.frame(sp::coordinates(subBlocks)); names(centroids) <- c("Easting", "Northing") # take the centroids
  subBlocks@data <- cbind(subBlocks@data, centroids) # add the centeroids to blocks
  polyObj <- ggplot2::fortify(subBlocks)
  if(is.na(sp::proj4string(speciesData))){
    mapext <- raster::extent(speciesData)[1:4]
    if(mapext >= -180 && mapext <= 180){
      xaxes <- "Longitude"
      yaxes <- "Latitude"
    } else {
      xaxes <- "Easting"
      yaxes <- "Northing"
    }
  } else{
    if(sp::is.projected(speciesData)){
      xaxes <- "Easting"
      yaxes <- "Northing"
    } else{
      xaxes <- "Longitude"
      yaxes <- "Latitude"
    }
  }
  if(is.null(rasterLayer)){
    p2 <- ggplot() +
      geom_polygon(aes(x = long, y = lat, group=group),
                   data = polyObj, color ="red",
                   fill ="orangered4",
                   alpha = 0.04,
                   size = 0.2) +
      geom_text(aes(label=folds, x=Easting, y=Northing), data=subBlocks@data) +
      xlab(xaxes) + ylab(yaxes) +
      coord_fixed() +
      ggtitle('Spatial blocks', subtitle=paste("The", selection, 'fold assignment by', i, 'iteration'))
  } else{
    if(methods::is(rasterLayer, 'Raster')){
      samp <- raster::sampleRegular(rasterLayer[[1]], 5e+05, asRaster=TRUE)
      map.df <- raster::as.data.frame(samp, xy=TRUE, centroids=TRUE, na.rm=TRUE)
      colnames(map.df) <- c("Easting", "Northing", "MAP")
      mid <- mean(map.df$MAP)
      p2 <- ggplot(data=map.df, aes(y=Northing, x=Easting)) +
        geom_raster(aes(fill=MAP)) +
        coord_fixed() +
        scale_fill_gradient2(low="darkred", mid="yellow", high="darkgreen", midpoint=mid) +
        guides(fill=FALSE) +
        ggtitle('Spatial blocks', subtitle=paste("The", selection, 'fold assignment by', i, 'iteration')) +
        geom_polygon(aes(x = long, y = lat, group=group),
                     data = polyObj, color ="red",
                     fill ="orangered4",
                     alpha = 0.04,
                     size = 0.2) +
        xlab(xaxes) + ylab(yaxes) +
        geom_text(aes(label=folds, x=Easting, y=Northing), data=subBlocks@data) # lable blocks
    }
  }
  if(showBlocks==TRUE){
    plot(p2)
  }
  # save the objects
  if(biomod2Format==TRUE){
    biomodTable <- as.matrix(biomodTable)
    theList <- list(folds=foldList, foldID=foldNum, biomodTable=biomodTable, k=k, blocks=subBlocks, species=species,
                    range=theRange, plots=p2, records=trainTestTable)
  } else{
    theList <- list(folds=foldList, foldID=foldNum, biomodTable=NULL, k=k, blocks=subBlocks, species=species,
                    range=theRange, plots=p2, records=trainTestTable)
  }
  class(theList) <- c("SpatialBlock")
  return(theList)
}


#' @export
print.SpatialBlock <- function(x, ...){
  print(class(x))
}

#' @export
plot.SpatialBlock <- function(x, y, ...){
  plot(x$plots)
  message("Please use foldExplorer function to plot each fold interactively")
}

#' @export
summary.SpatialBlock <- function(object, ...){
  print("Number of recoreds in each category")
  print(object$records)
}
adamlilith/blockCV documentation built on May 25, 2019, 12:41 a.m.