#' Make predictions for all species in a dataset using single-species linear
#' models
#'
#' Make predictions across rows in a dataset that may contain multiple species.
#' The model associated with each species is used to predict values for the
#' response variable, as well as it's prediction interval. Necessary
#' bias-corrections are made for species with models that have a transformed
#' response variable.
#'
#' @param data Dataframe with columns containing the species and variables of
#' interest. Species names in the `species` column should be present in those
#' within `ref_table`, as well as those in `models`.
#' @param models A named list of each species' linear regression models.
#' `names(models)` should correspond to species names in `data` and
#' `ref_table`.
#' @param ref_table Dataframe containing information to correct bias introduced
#' in models with a transformed response variable. It should include columns
#' for `species`, `cf`, and `geom_mean`.
#' @param level Level of confidence for the prediction interval. Defaults to
#' `0.95`.
#' @param species Column name of the species variable in `data` and `ref_table`.
#' Defaults to `species`.
#' @param predictor Column name of the predictor variable in `data`. Defaults to
#' `diameter`.
#' @param cf Column name of the bias correction factor in `ref_table`. Defaults
#' to `correctn_factor`.
#' @param geom_mean Column name of the geometric mean of response variable in
#' `ref_table`, that was used in to fit the `models`. Defaults to
#' `response_geom_mean`.
#'
#' @return Dataframe of input `data` with columns appended: \describe{
#' \item{fit}{Predicted value for the response variable.} \item{lwr}{Lower
#' bound of the prediction interval, based on the input argument `level`.}
#' \item{upr}{Upper bound of the prediction interval, based on the input
#' argument `level`.} }
#'
#' @family single-species model functions
#' @seealso [ss_simulate()] to run `ss_predict()` on simulated data.
#'
#' @examples
#' # first select best-fit model
#' data(urbantrees)
#' Alb_sam <- urbantrees[urbantrees$species == 'Albizia saman', ] # we use one species as an example
#' results <- ss_modelselect_multi(Alb_sam,
#' response = 'height',
#' predictor = 'diameter')
#'
#' # generate data for subsequent predictions
#' newdata <- generate_x(Alb_sam,
#' response = "height", predictor = "diameter")
#'
#' # run function
#' predictions <- ss_predict(newdata,
#' models = results$ss_models,
#' ref_table = results$ss_models_info,
#' predictor = "predictor")
#' head(predictions)
#'
#' @import checkmate
#' @importFrom stringr str_replace
#' @importFrom stats as.formula complete.cases predict
#' @importFrom dplyr filter_at vars all_vars
#'
#' @export
ss_predict <- function(data, models, ref_table, level = 0.95, species = "species", predictor = "diameter", cf = "correctn_factor", geom_mean = "response_geom_mean") {
# Error checking ------------------
coll <- checkmate::makeAssertCollection()
checkmate::assert_list(models, unique = TRUE, types = "list", add = coll)
checkmate::assert_data_frame(ref_table, add = coll)
checkmate::assert_subset(unique(data[[species]]), choices = names(models), empty.ok = FALSE, add = coll) # species in data in models
checkmate::assert_subset(unique(data[[species]]), choices = as.character(ref_table[[species]]), empty.ok = FALSE, add = coll) # species in data in models
checkmate::assert_number(level, lower = 0, upper = 1, add = coll)
checkmate::assert_numeric(data[[predictor]], lower = 0, finite = TRUE, .var.name = "predictor", add = coll)
checkmate::reportAssertions(coll)
# remove missing data
if (checkmate::anyMissing(data[[predictor]])) {
message(cat(sum(!complete.cases(data[, predictor])), " row(s) with missing value(s) removed from 'data'", sep = ""))
data <- data[complete.cases(data[, predictor]), ]
}
# Calculations ------------------
data_list <- split(data, data$species)
for (i in 1:length(data_list)) {
spp <- names(data_list[i])
index <- which(names(models) == spp)
data_sub <- data_list[i][[1]]
data_sub$x <- data_sub[[predictor]] # IMPORTANT: align variable name with model
# extract weight info from model object
if ("weights" %in% names(models[[index]])) {
w <- as.character(models[[index]]$call)[length(models[[index]]$call)]
w <- stringr::str_replace(w, "x", "data_sub$x") # IMPORTANT
w <- as.formula(paste("~", w))
} else {
w <- 1
}
results <- predict(models[[index]], newdata = data_sub, level = level, interval = "prediction", type = "response", weights = w)
results <- as.data.frame(results)
# Transformed models: Geom mean height
if (ref_table$modelcode[ref_table$species == spp] == "loglog_w1" | ref_table$modelcode[ref_table$species == spp] == "loglog_w2" | ref_table$modelcode[ref_table$species ==
spp] == "loglog_w3" | ref_table$modelcode[ref_table$species == spp] == "loglog_w4" | ref_table$modelcode[ref_table$species == spp] ==
"expo_w1" | ref_table$modelcode[ref_table$species == spp] == "expo_w2" | ref_table$modelcode[ref_table$species == spp] == "expo_w3" |
ref_table$modelcode[ref_table$species == spp] == "expo_w4") {
# back-transform & bias-correction
results$fit <- exp(results$fit) * ref_table$correctn_factor[ref_table$species == spp]
results$lwr <- exp(results$lwr) * ref_table$correctn_factor[ref_table$species == spp]
results$upr <- exp(results$upr) * ref_table$correctn_factor[ref_table$species == spp]
}
data_list[[i]] <- cbind.data.frame(data_list[[i]], results)
}
data_list <- do.call(rbind, data_list) #combine lists by rows
# extract complete cases for specific cols
data_list <- dplyr::filter_at(data_list, vars(species, predictor, "fit", "lwr", "upr"), all_vars(complete.cases(.)))
rownames(data_list) <- NULL
return(data_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.