R/helpers_prepare_data.R

Defines functions prepare_formulas_glm calibration_grid_glm prepare_formulas_glmnetmx calibration_grid_glmnetmx calibration_grid part_data handle_missing_data generate_background_variables

#### HELPERS FOR PREPARE DATA ####
# Helper function to generate background variables
generate_background_variables <- function(env, nbg, bias_file = NULL,
                                          bias_effect = "direct",
                                          seed = 1) {


  if (!is.null(bias_file)) {

    #Check extension and resolution of bias_file
    same_ext <- terra::ext(bias_file) == terra::ext(env[[1]])
    same_res <- all(terra::res(bias_file) == terra::res(env[[1]]))
    same_ncell <- terra::ncell(bias_file) == terra::ncell(env[[1]])
    if (!same_ext | !same_res | !same_ncell) {
      stop("'bias_file' and 'raster_variables' must have the same 'extension', 'resolution', and 'number of cells'.")
    }

    names(bias_file) <- "bias_value"
    bias_file <- c(bias_file, env[[1]])
    cell_samp <- terra::as.data.frame(bias_file, na.rm = TRUE, cells = TRUE)
    bias_value <- cell_samp$bias_value
    cell_samp <- cell_samp$cell
    if (bias_effect == "inverse") {
      bias_value <- max(bias_value) - bias_value
    }
  } else {
    cell_samp <- terra::as.data.frame(env[[1]], na.rm = TRUE,
                                      cells = TRUE)[, "cell"]
    bias_value <- NULL
  }

  if (length(cell_samp) > nbg) {
    set.seed(seed)
    cell_samp <- sample(cell_samp, size = nbg, replace = FALSE,
                        prob = bias_value)
  } else {
    message("'n_background' >= initial number of points, using all points.")
  }

  bg_var <- terra::extract(x = env, y = cell_samp, xy = TRUE)
  bg_var$pr_bg <- 0

  return(bg_var)
}

# Helper function to handle missing data
handle_missing_data <- function(occ_bg, weights) {
  na_rows <- which(!complete.cases(occ_bg))

  if (length(na_rows) > 0) {
    occ_bg <- occ_bg[-na_rows,]

    if (!is.null(weights)) {
      weights <- weights[-na_rows]
    }
    warning(length(na_rows), " rows were excluded from database because NAs were found.")
  }
  return(occ_bg)
}


### Helper function to partition data
part_data <- function(data,
                      pr_bg = "pr_bg",
                      train_proportion = 0.7,
                      n_partitions = 5,
                      partition_method = "subsample",
                      seed = 1) {
  if(!inherits(data, "data.frame")){
    stop("'data' must be data.frame")
  }

  if(!(pr_bg %in% names(data))){
    stop("'pr_bg' must be a column in data")
  }

  if(!(partition_method %in% c("kfolds", "subsample", "bootstrap"))){
    stop("Invalid 'partition_method'. Available options include 'kfolds', 'subsample', and 'bootstrap'")
  }

  if(!(n_partitions %% 1 == 0) || n_partitions <= 0){
    stop("'n_partitions' must be a positive numeric integer (e.g., 1, 2, 3...)")
  }

  if(partition_method %in% c("bootstrap", "subsample")){
    if(train_proportion > 1 || train_proportion <= 0 ||
       is.na(train_proportion) || is.null(train_proportion)){
      stop("'train_proportion' must be a positive numeric between 0 and 1")
    }
  }

  #Get data
  d <- data[pr_bg]
  #Split presence and absence
  pre <- which(d[, pr_bg] == 1)
  aus <- which(d[, pr_bg] == 0)

  if (partition_method == "kfolds") {

    set.seed(seed)
    foldp <- sample(cut(seq(1, length(pre)), breaks = n_partitions,
                        labels = FALSE))
    set.seed(seed)
    folda <- sample(cut(seq(1, length(aus)), breaks = n_partitions,
                        labels = FALSE))
    names(foldp) <- pre
    names(folda) <- aus
    all <- c(foldp, folda)
    rep_data <- list()
    for (i in 1:n_partitions) {
      rep_data[[paste0("Partition_", i)]] <- as.numeric(names(all[all ==
                                                          i]))
    }

    # #Join data
    # d$folds <- NA
    # d$folds[which(d[,pr_bg] == 1)] <- foldp
    # d$folds[which(d[,pr_bg] == 0)] <- folda
    #
    # rep_data_p <- lapply(unique(foldp), function(f) {
    #   which(d$pr_bg == 1 & d$folds != f) })
    # rep_data_a <- lapply(unique(folda), function(f) {
    #   which(d$pr_bg == 0 & d$folds != f) })
    # rep_data <- lapply(1:length(rep_data_p), function(f){
    #   c(rep_data_p[[f]], rep_data_a[[f]])
    # })

  }

  if (partition_method == "subsample" | partition_method == "bootstrap") {
    if (partition_method == "subsample") {
      replacement <- FALSE
    } else if (partition_method == "bootstrap") {
      replacement <- TRUE
    }


    #Create list of partitions
    rep_data <- lapply(1:n_partitions, function(i) {
      set.seed(seed * i)
      foldp <- sample(pre,
                      size = floor(train_proportion * length(pre)),
                      replace = replacement)
      foldp <- setdiff(pre, foldp)

      set.seed(seed * i)
      folda <- sample(aus,
                      size = floor(train_proportion * length(aus)),
                      replace = replacement)
      folda <- setdiff(aus, folda)
      foldpa <- c(foldp, folda)
      return(foldpa)
    })
    names(rep_data) <- paste0("Partition_", 1:n_partitions)
  }
  return(rep_data)
}

#### Helper functions to create formula grid ####
calibration_grid <- function(occ_bg,
                             min_number = 2,
                             min_continuous = NULL,
                             categorical_var = NULL,
                             features = c("l", "q", "lq", "lqp", "p"),
                             algorithm = c("glm", "maxnet"),
                             r_multiplier = c(0.1, 1, 2, 3, 5)) {

  # Validate the algorithm input
  algorithm <- match.arg(algorithm)


  if (algorithm == "glm") {
    # Call the GLM-specific function
    cal_grid_data <- calibration_grid_glm(occ_bg = occ_bg,
                                          min_number = min_number,
                                          min_continuous = min_continuous,
                                          categorical_var = categorical_var,
                                          features = features)
  } else if (algorithm == "maxnet") {
    # Call the maxnet-specific function
    cal_grid_data <- calibration_grid_glmnetmx(occ_bg = occ_bg,
                                               min_number = min_number,
                                               min_continuous = min_continuous,
                                               categorical_var = categorical_var,
                                               features = features,
                                               r_multiplier = r_multiplier)
  }

  return(cal_grid_data)
}

# Calibration Grid Generation using maxnet
calibration_grid_glmnetmx <- function(occ_bg,
                                      categorical_var = NULL,
                                      features = c("l", "q", "lq", "lqp", "p"),
                                      min_number = 2,
                                      min_continuous = NULL,
                                      r_multiplier = c(0.1, 1, 2, 3, 5)) {

  #Get variable names
  var_names <- colnames(occ_bg[, -1, drop = FALSE])

  # Get variable combinations
  var_comb <- enmpa::aux_var_comb(var_names = var_names, minvar = min_number)

  # Remove combinations according to minimum number of continuous variables
  if (!is.null(min_continuous) & !is.null(categorical_var)) {
    n_cont <- sapply(var_comb, function(x) length(x[x != categorical_var]))
    var_comb <- var_comb[n_cont >= min_continuous]
  }

  # Split features
  formula_x <- list()
  for(f_x in features) {
    if (grepl("p", f_x) & !is.null(categorical_var)) {
      var_comb_new <- var_comb[
        sapply(var_comb, function(x) {sum(!x %in% categorical_var)}) >= 2
      ]
    } else {
      var_comb_new <- var_comb
    }
    for(vc in var_comb_new) {
      f_l <- prepare_formulas_glmnetmx(independent = vc, type = f_x,
                                       categorical_var = categorical_var)
      names(f_l) <- f_x
      formula_x <- c(formula_x, f_l)
      }
  }

  # Create a data frame with formulas and their corresponding features
  formula_d <- data.frame(
    Formulas = vapply(formula_x, function(f) {paste(f, "-1")}, character(1)),
    Features = names(formula_x),
    stringsAsFactors = FALSE
  )

  # Expand the grid by combining formulas and regularization parameters
  f_grid <- expand.grid(Formulas = formula_d$Formulas, R_multiplier = r_multiplier,
                        stringsAsFactors = FALSE)

  f_grid <- merge(f_grid, formula_d, by = "Formulas", sort = FALSE)
  f_grid$ID <- seq_len(nrow(f_grid))
  f_grid <- f_grid[, c("ID", "Formulas", "R_multiplier", "Features")]
  f_grid$Formulas <- as.character(f_grid$Formulas)

  return(f_grid)
}

# Prepare Formulas for maxnet
prepare_formulas_glmnetmx <- function(independent, type = "lqpht",
                                      categorical_var = NULL,
                                      minvar = 1, maxvar = NULL) {

  if (is.character(type)) {
    if (!all(unlist(strsplit(type, "")) %in% c("l", "p", "q", "h", "t"))) {
      stop("'type' must be: 'l', 'p', 'q', 'h', 't', or a combination of those.")
    }
  }  else {
    stop("'type' must be of class character.")
  }

  # Check if categorical variable is in independent variables. If not, set null
  if (!is.null(categorical_var)) {
    if (!(categorical_var %in% independent)) {
      categorical_var <- NULL
    }
  }

  if (!is.null(categorical_var)) {
    independent <- setdiff(independent, categorical_var)
  }

  predictors <- independent
  npred <- length(predictors)
  aux <- " "
  # Linear
  if (grepl("l", type)) {
    aux <- paste(aux, paste(predictors, collapse = " + "),
                 sep = " + ")
  }
  # Quadratic
  if (grepl("q", type)) {
    for (v in 1:length(predictors)) {
      aux <- paste(aux, paste0("I(", predictors[v], "^2)"),
                   sep = " + ")
    }
  }
  # Product
  if (grepl("p", type)) {
    if (npred > 1) {
      inter_tab <- utils::combn(predictors, 2)
      aux_inter <- paste0(" + ", paste(apply(inter_tab,
                                             2, paste, collapse = ":"),
                                       collapse = " + "))
      if (length(aux_inter) > 0) {
        aux <- paste0(aux, aux_inter)
      }
    }
    else {
      if (grepl("l", type) | grepl("q", type)) {
        message("'p' is only possible with 2 or more independent variables.",
                "\nReturning other combinations.")
      }
      else {
        stop("'p' is only possible with 2 or more independent variables.",
             "\nTry other combinations of type.")
      }
    }
  }
  # Hinge
  if (grepl("h", type)) {
    for (v in 1:length(predictors)) {
      aux <- paste(aux, paste0("hinge(", predictors[v], ")"),
                   sep = " + ")
    }
  }
  # Threshold
  if (grepl("t", type)) {
    for (v in 1:length(predictors)) {
      aux <- paste(aux, paste0("thresholds(", predictors[v], ")"),
                   sep = " + ")
    }
  }
  out <- paste0("~",
                gsub(pattern = "  \\+ ", x = aux, replacement = ""))

  if (!is.null(categorical_var)) {
    out <- paste0(out, " + categorical(", categorical_var, ")")
  }
  return(out)
}

# Calibration Grid Generation using GLM
calibration_grid_glm <- function(occ_bg,
                                 min_number = 2,
                                 min_continuous = NULL,
                                 categorical_var = NULL,
                                 features = c("l", "q", "lq", "lqp", "p")) {

  #Get variable names
  var_names <- colnames(occ_bg[, -1, drop = FALSE])

  # Generate all combinations of variables
  var_combinations <- enmpa::aux_var_comb(var_names = var_names,
                                          minvar = min_number)

  # Filter combinations based on the minimum number of continuous variables
  if (!is.null(min_continuous)) {
    continuous_counts <- sapply(var_combinations, function(vars) {
      length(setdiff(vars, categorical_var))
    })
    var_combinations <- var_combinations[continuous_counts > min_continuous]
  }

  # Prepare formulas for each feature type
  formula_list <- list()

  for (feature_type in features) {
    if (grepl("p", feature_type) && !is.null(categorical_var)) {
      filtered_combinations <- var_combinations[
        sapply(var_combinations, function(vars) {
                 length(setdiff(vars, categorical_var)) >= 2
        })
      ]
    } else {
      filtered_combinations <- var_combinations
    }

    for (vars in filtered_combinations) {
      formula <- prepare_formulas_glm(independent = vars, type = feature_type,
                                      categorical_var = categorical_var)
      formula_list <- c(formula_list, setNames(list(formula), feature_type))
      }
  }

  # Create a data frame with the formulas and their corresponding features
  formula_grid <- data.frame(
    ID = seq_along(formula_list),
    Formulas = unlist(formula_list),
    Features = names(formula_list),
    stringsAsFactors = FALSE
  )
  return(formula_grid)
}

# Prepare Formulas for GLM
prepare_formulas_glm <- function(independent, type = "l",
                                 categorical_var = NULL) {

  # Validate the 'type' input
  valid_types <- c("l", "p", "q")
  if (!all(unlist(strsplit(type, "")) %in% valid_types)) {
    stop("'type' must be: 'l', 'p', 'q', or a combination of those.")
  }

  # Remove the categorical variable from independent variables if not present
  if (!is.null(categorical_var) && !(categorical_var %in% independent)) {
    categorical_var <- NULL
  }

  # Exclude the categorical variable from independent variables for GLM
  if (!is.null(categorical_var)) {
    independent <- setdiff(independent, categorical_var)
  }

  # Initialize the formula with an empty space
  formula_parts <- " "

  # Linear terms
  if (grepl("l", type)) {
    formula_parts <- paste(formula_parts, paste(independent, collapse = " + "),
                           sep = " + ")
  }

  # Quadratic terms
  if (grepl("q", type)) {
    quadratic_terms <- paste0("I(", independent, "^2)")
    formula_parts <- paste(formula_parts,
                           paste(quadratic_terms, collapse = " + "),
                           sep = " + ")
  }

  # Product (interaction) terms
  if (grepl("p", type)) {
    if (length(independent) > 1) {
      interaction_terms <- apply(utils::combn(independent, 2), 2, paste,
                                 collapse = ":")
      formula_parts <- paste(formula_parts,
                             paste(interaction_terms, collapse = " + "),
                             sep = " + ")
    } else {
      message("'p' is only possible with 2 or more independent variables. Returning other combinations.")
    }
  }

  # Create the final formula
  formula <- paste0("~", gsub(pattern = "  \\+ ", replacement = "",
                              x = formula_parts))

  # Add the categorical variable if provided
  if (!is.null(categorical_var)) {
    formula <- paste0(formula, " + categorical(", categorical_var, ")")
  }

  return(formula)
}

Try the kuenm2 package in your browser

Any scripts or data that you put into this service are public.

kuenm2 documentation built on April 21, 2026, 1:07 a.m.