sdmflow: Flexible Workflows for Species Distribution Modeling

Documented in crossv_cluster crossv_grid crossv_loo crossv_predstrat

#' Generate non-random cross-validated test-training pairs
#'
#' @param data A data frame
#' @param n Number of test-training pairs to generate (an integer).
#' Either single number of cluster groups, or vector with number of columns,
#' rows of sampling grid.
#' @param cluster Name of the clustering variable.
#' @param response Name of the response variable.
#' @param id Name of variable that gives each model a unique integer id.
#'
#' @return A data frame with n / prod(n) rows and columns test and train.
#' test and train are list-columns containing \code{\link{resample}} objects.
#' @export

crossv_cluster <- function(data, n = 5, cluster, id = ".id"){

  if (!is.numeric(n) || length(n) != 1) {
    stop("`n` must be a single integer.", call. = FALSE)
  }

  # cluster-levels in random order
  levels <- data %>% dplyr::pull(cluster) %>% unique() %>% sample()
  idx <- seq_len(nrow(data))

  # split into evenly-sized chunks
  groups <- split(levels, factor(sort(rank(levels)%%n)))

  fold_idx <- purrr::map(groups, function(x){which(data %>% dplyr::pull(cluster) %in% x)})

  fold <- function(test) {
    list(train = modelr::resample(data, setdiff(idx, test)), test = modelr::resample(data, test))
  }
  cols <- purrr::transpose(purrr::map(fold_idx, fold))
  cols[[id]] <- as.character(seq_len(n))
  tibble::as_tibble(cols)

}

#' @rdname crossv_cluster

crossv_loo <- function(data, cluster, response = "occ", id = "holdout", buffer = NULL){

  # cluster-levels in sorted order
  levels <- data %>% dplyr::pull(cluster) %>% unique() %>% sort()
  idx <- seq_len(nrow(data))

  # split
  groups <- as.list(levels)

  fold_idx <- purrr::map(groups, function(x){which(data %>% dplyr::pull(cluster) %in% x)})

  if(is.null(buffer)) {
    fold <- function(test) {
      list(train = modelr::resample(data, setdiff(idx, test)), test = modelr::resample(data, test))
    }
  } else {
    fold <- function(test) {
      level <- data %>% dplyr::slice(test) %>% dplyr::pull(cluster) %>% unique()
      filter_levels_pos <- c(match(level, levels) -seq_len(buffer), seq_len(buffer) + match(level, levels))
      filter_levels_pos <- filter_levels_pos[filter_levels_pos > 0]
      filter_levels <- levels[filter_levels_pos]
      filter_pos <- which(data %>% dplyr::pull(cluster) %in% filter_levels)
      list(train = modelr::resample(data, setdiff(idx, c(test, filter_pos))), test = modelr::resample(data, test))
    }
  }

  cols <- purrr::transpose(purrr::map(fold_idx, fold))
  cols[[id]] <- as.character(levels)
  out <- tibble::as_tibble(cols)

}


#' @rdname crossv_cluster

crossv_grid <- function(data, n = c(3,3), response = "occ", id = ".id"){

  grid <- sf::st_make_grid(data, n=n)

  fold_idx <- sf::st_intersects(grid, data)

  # remove grid cells that don't contain all response levels
  contains_both <- purrr::map_lgl(fold_idx, function(x){
    data %>% dplyr::slice(x) %>% dplyr::pull(response) %>% unique() %>% length() ==
      data %>% dplyr::slice(x) %>% dplyr::pull(response) %>% nlevels()
  })

  idx <- seq_len(nrow(data))

  if(length(which(contains_both)) < prod(n)) {
    message("Removing some of the grid cells because they do not contain all response levels.")
    fold_idx <- fold_idx[contains_both]
  }

  data <- st_set_geometry(data, NULL)

  fold <- function(test) {
    list(train = modelr::resample(data, setdiff(idx, test)), test = modelr::resample(data, test))
  }
  cols <- purrr::transpose(purrr::map(fold_idx, fold))
  cols[[id]] <- as.character(seq_along(fold_idx))
  tibble::as_tibble(cols)

}

#' @rdname crossv_cluster

crossv_predstrat <- function(data, n = 5, id = ".id", response = "occ", id_col = "id"){

  if (!is.numeric(n) || length(n) != 1) {
    stop("`n` must be a single integer.", call. = FALSE)
  }

  factor_cols <- data %>%
    dplyr::select_if(is.factor) %>%
    dplyr::select(-one_of(response, id_col)) %>%
    as.list()

  factor_names <- names(factor_cols)

  if(purrr::is_empty(factor_cols)){
    return(modelr::crossv_kfold(data, k=n, id = id))
  }

  # remove rows with levels fewer than n observations
  rare_levels <- purrr::map(factor_cols, function(x) {
    table(x)[table(x)< n] %>% names()
  })

  remove_rows <- purrr::map2(factor_cols, rare_levels, function(x,y){
    which(x %in% y)
  }) %>% unique() %>% unlist


  if (!purrr::is_empty(remove_rows)) {
    data <- data %>%
      dplyr::slice(-remove_rows) %>%
      dplyr::mutate_at(vars(one_of(factor_names)), droplevels)
  }

  # set aside 1 obs of every factor level for each resample

  save <- purrr::map(factor_names, function(x){
    s <- data %>%
      mutate(rownumber = 1:nrow(.)) %>%
      group_by_at(.vars = x) %>%
      sample_n(n) %>%
      mutate(resample = 1:n) %>%
      ungroup()
  })

  aside <- do.call(rbind, save) %>%
    dplyr::distinct() %>%
    split(.$resample) %>%
    purrr::map(pull, rownumber)

  # create folds for rest of data
  rest <- data %>%
    dplyr::slice(-unlist(aside))

  n_rest <- nrow(rest)
  folds <- sample(rep(1:n, length.out = n_rest))
  idx <- seq_len(n_rest)
  fold_idx <- split(idx, folds)

  # combine
  fold_idx <- map2(aside, fold_idx, c)

  fold <- function(test) {
    list(train = modelr::resample(data, setdiff(idx, test)), test = modelr::resample(data, test))
  }
  cols <- purrr::transpose(purrr::map(fold_idx, fold))

  cols[[id]] <- as.character(seq_len(n))
  tibble::as_tibble(cols)

}


#' @rdname crossv_cluster

crossv_temporal_blocking <- function(data, cluster, blocksize, time = "year", id = ".id") {


  mean_time <- data %>%
    group_split(!!!rlang::syms(cluster)) %>%
    map(function(x) {tibble(id = x$id[[1]], mean_time = mean(x %>% pull(time)))})

  mean_time <- do.call(bind_rows, mean_time) %>% arrange(mean_time)

  sorted <- data %>%
    right_join(mean_time, by = "id")

  levels <- sorted %>% pull(cluster) %>% unique()
  groups <- split(levels, ceiling(seq_along(levels)/blocksize))

  fold_idx <- purrr::map(groups, function(x){which(data %>% dplyr::pull(cluster) %in% x)})

  combinations <- combn(fold_idx, 2, simplify = FALSE)

  train_idx <- purrr::map(combinations, 1)
  test_idx <- purrr::map(combinations, 2)

  fold <- function(train, test) {
    list(train = modelr::resample(data, train), test = modelr::resample(data, test))
  }

  cols <- purrr::transpose(purrr::map2(train_idx, test_idx, fold))
  cols[[id]] <- as.character(seq_along(combinations))
  tibble::as_tibble(cols)

}


#' @rdname crossv_cluster

crossv_year_window<- function(data, windowsize) {

  years <- data %>% pull(year) %>% unique()

  if(windowsize > 0 ){
    years_used <- years %>% head(-windowsize) %>% tail(-windowsize)
  } else {
    years_used <- years
  }

  map_dfr(years_used, function(middle_year){

   year_sequence <- seq(middle_year-windowsize, middle_year+windowsize)

   test_years <- setdiff(years, year_sequence)

   train_idx <- which(data$year %in% year_sequence)

   map_dfr(test_years, function(x){

     test_idx <- which(data$year == x)

     tibble(train = list(modelr::resample(data, train_idx)),
            test = list(modelr::resample(data, test_idx)),
            middle_year = middle_year,
            test_year = x,
            windowsize = windowsize,
            training_size = length(train_idx),
            testing_size = length(test_idx)) %>%
       mutate(tdiff = min(abs(year_sequence-x)))
   })


 })

}