R/ecometric_model_qual.R
In commecometrics: Ecometric Models of Trait–Environment Relationships at the Community Level

Documented in ecometric_model_qual

#' Run an ecometric model for qualitative environmental variables
#'
#' Builds an ecometric trait space for qualitative environmental variables,
#' estimating the most probable category and the probability of each category
#' at each trait bin combination. Also calculates prediction accuracy
#' and anomalies for each point.
#'
#' @param points_df Output first element of the list from \code{summarize_traits_by_point()}. A data frame with columns: `summ_trait_1`, `summ_trait_2`, `count_trait`, and the environmental variable.
#' @param category_col Name of the column containing the categorical trait.
#' @param grid_bins_1 Number of bins for the first trait axis. If `NULL` (default),
#'   the number is calculated automatically using Scott's rule via `optimal_bins()`.
#' @param grid_bins_2 Number of bins for the second trait axis. If `NULL` (default),
#'   the number is calculated automatically using Scott's rule via `optimal_bins()`.
#' @param min_species Minimum number of species with trait data per point (default = 3).
#' @return A list containing:
#' \item{points_df}{Filtered input data frame with the following added columns:
#'   \describe{
#'     \item{bin_1}{Bin assignment code for first trait axis.}
#'     \item{bin_2}{Bin assignment code for second trait axis.}
#'     \item{prob_category}{Estimated probability of each environmental category per trait bin (e.g., \code{prob_1}, \code{prob_2}, etc.).}
#'     \item{observed_probability}{Probability assigned to the observed category for each point.}
#'     \item{predicted_probability}{Probability assigned to the predicted (most likely) category for each point.}
#'     \item{predicted_category}{Predicted environmental category for each point.}
#'     \item{correct_prediction}{Indicator for whether the predicted category matches the observed category (\code{"Yes"} or \code{"No"}).}
#'     \item{env_anom}{Difference between predicted and observed category probabilities.}
#'   }
#' }
#' \item{eco_space}{Raster-format data frame representing trait space bins with estimated environmental categories.}
#' \item{diagnostics}{Summary stats about bin usage and data coverage.}
#' \item{settings}{Metadata including the modeled trait.}
#' \item{prediction_accuracy}{Overall percentage of correct predictions.}
#' @importFrom stats density setNames
#' @examples
#' \donttest{
#' # Load internal data
#' data("geoPoints", package = "commecometrics")
#' data("traits", package = "commecometrics")
#' data("spRanges", package = "commecometrics")
#'
#' # Step 1: Summarize trait values at sampling points
#' traitsByPoint <- summarize_traits_by_point(
#'   points_df = geoPoints,
#'   trait_df = traits,
#'   species_polygons = spRanges,
#'   trait_column = "RBL",
#'   species_name_col = "sci_name",
#'   continent = FALSE,
#'   parallel = FALSE
#' )
#'
#' # Step 2: Run ecometric model using land cover class as qualitative variable
#' modelResult <- ecometric_model_qual(
#'   points_df = traitsByPoint$points,
#'   category_col = "vegetation",
#'   min_species = 3
#' )
#'
#' # View the percentage of correctly predicted categories
#' print(modelResult$prediction_accuracy)
#' }
#' @export
ecometric_model_qual <- function(points_df,
                                 category_col,
                                 grid_bins_1 = NULL,
                                 grid_bins_2 = NULL,
                                 min_species = 3) {
  # Remove NAs in trait category
  points_df <- points_df %>% dplyr::filter(!is.na(.data[[category_col]]))

  # Coerce category column to character if needed
  if (!is.character(points_df[[category_col]])) {
    message("Converting '", category_col, "' to character.")
    points_df[[category_col]] <- as.character(points_df[[category_col]])
  }

  if (!all(c("summ_trait_1", "summ_trait_2", "count_trait") %in% names(points_df))) {
    stop("points_df must contain 'summ_trait_1', 'summ_trait_2', and 'count_trait'.")
  }

  # Filter low-coverage points
  message("Filtering points with at least ", min_species, " species...")
  filtered_df <- dplyr::filter(points_df, count_trait >= min_species)
  prop_retained <- round(nrow(filtered_df) / nrow(points_df) * 100, 1)
  message("Retained ", nrow(filtered_df), " of ", nrow(points_df), " points (", prop_retained, "%)")
  if (nrow(filtered_df) == 0) stop("No points retained. Try lowering 'min_species'.")

  # Determine bin numbers if not provided
  if (is.null(grid_bins_1)) {
    grid_bins_1 <- optimal_bins(filtered_df$summ_trait_1)
    message("Optimal number of bins for the first trait summary metric (Scott's rule): ", grid_bins_1)
  }
  if (is.null(grid_bins_2)) {
    grid_bins_2 <- optimal_bins(filtered_df$summ_trait_2)
    message("Optimal number of bins for the second trait summary metric (Scott's rule): ", grid_bins_2)
  }

  # Define bin breaks
  mrange <- range(filtered_df$summ_trait_1, na.rm = TRUE)
  sdrange <- range(filtered_df$summ_trait_2, na.rm = TRUE)
  mbrks <- seq(mrange[1] - 0.001, mrange[2] + 0.001, length.out = grid_bins_1 + 1)
  sdbrks <- seq(sdrange[1] - 0.001, sdrange[2] + 0.001, length.out = grid_bins_2 + 1)

  filtered_df <- filtered_df %>%
    dplyr::mutate(
      bin_1 = .bincode(summ_trait_1, mbrks),
      bin_2 = .bincode(summ_trait_2, sdbrks)
    )

  categories <- sort(unique(filtered_df[[category_col]]))

  # Build bin grid
  eco_list <- list()
  bin_counts <- matrix(0, nrow = grid_bins_2, ncol = grid_bins_1)

  for (i in 1:grid_bins_1) {
    for (j in 1:grid_bins_2) {
      idx <- which(filtered_df$bin_1 == i & filtered_df$bin_2 == j)
      dat <- filtered_df[[category_col]][idx]
      bin_counts[j, i] <- length(dat)

      if (length(dat) > 0) {
        tab <- table(factor(dat, levels = categories))
        probs <- as.numeric(tab) / sum(tab)
        mode_cat <- categories[which.max(probs)]

        eco_list[[length(eco_list) + 1]] <- tibble::tibble(
          bin_1 = i,
          bin_2 = j,
          env_est = mode_cat,
          !!!setNames(as.list(probs), paste0("prob_", categories))
        )
      }
    }
  }

  eco_space <- dplyr::bind_rows(eco_list)

  # Bin diagnostics
  bin_counts_flipped <- bin_counts[grid_bins_2:1, , drop = FALSE]
  used_bins <- sum(bin_counts_flipped > 0)
  total_bins <- grid_bins_1 * grid_bins_2
  message("Used ", used_bins, " of ", total_bins, " bins (", round(used_bins / total_bins * 100, 1), "%)")

  # Map predictions back to points
  filtered_df <- dplyr::left_join(filtered_df, eco_space, by = c("bin_1", "bin_2"))

  names(eco_space)[names(eco_space) == "bin_1"] <- "x"
  names(eco_space)[names(eco_space) == "bin_2"] <- "y"

  # Compute probabilities for observed/predicted
  observed_prob_col <- paste0("prob_", filtered_df[[category_col]])
  predicted_prob_col <- paste0("prob_", filtered_df$env_est)

  observed_prob_val <- purrr::map2_dbl(observed_prob_col, seq_len(nrow(filtered_df)), function(colname, idx) {
    if (colname %in% names(filtered_df)) filtered_df[[colname]][idx] else NA_real_
  })

  predicted_prob_val <- purrr::map2_dbl(predicted_prob_col, seq_len(nrow(filtered_df)), function(colname, idx) {
    if (colname %in% names(filtered_df)) filtered_df[[colname]][idx] else NA_real_
  })

  filtered_df$observed_probability <- observed_prob_val
  filtered_df$predicted_probability <- predicted_prob_val

  filtered_df <- filtered_df %>%
    dplyr::mutate(
      predicted_category = env_est,
      correct_prediction = ifelse(.data[[category_col]] == env_est, "Yes", "No"),
      env_anom = dplyr::case_when(
        is.na(predicted_category) ~ NA_real_,
        TRUE ~ predicted_probability - observed_probability
      )
    )

  correct_predictions <- sum(filtered_df$correct_prediction == "Yes", na.rm = TRUE)
  total_points <- nrow(filtered_df)
  percent_correct <- round((correct_predictions / total_points) * 100, 2)
  message("Prediction accuracy: ", percent_correct, "% correct predictions.")

  return(list(
    points_df = filtered_df,
    eco_space = eco_space,
    diagnostics = list(
      total_points = nrow(points_df),
      retained_points = nrow(filtered_df),
      proportion_retained = prop_retained,
      used_bins = used_bins,
      total_bins = total_bins,
      bin_counts = bin_counts,
      brks_1 = mbrks,
      brks_2 = sdbrks,
      categories = categories
    ),
    settings = list(
      trait_var = category_col
    ),
    prediction_accuracy = percent_correct
  ))
}

Any scripts or data that you put into this service are public.

commecometrics documentation built on Aug. 8, 2025, 6:10 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

commecometrics
Ecometric Models of Trait–Environment Relationships at the Community Level

R/ecometric_model_qual.R
In commecometrics: Ecometric Models of Trait–Environment Relationships at the Community Level

Defines functions ecometric_model_qual

Documented in ecometric_model_qual

Try the commecometrics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

commecometrics Ecometric Models of Trait–Environment Relationships at the Community Level

R/ecometric_model_qual.R In commecometrics: Ecometric Models of Trait–Environment Relationships at the Community Level

Defines functions ecometric_model_qual

Documented in ecometric_model_qual

Try the commecometrics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

commecometrics
Ecometric Models of Trait–Environment Relationships at the Community Level

R/ecometric_model_qual.R
In commecometrics: Ecometric Models of Trait–Environment Relationships at the Community Level