Nothing
#' predict_mortality
#'
#' This sub model first fits a binary model to derive the effects of individual
#' tree, site and climate variables on mortality; and afterwards predict the
#' probability of dying for each tree from df_predict
#'
#' @param df_fit a data frame with individual tree data and site descriptors
#' where code is used to specify a status of each tree
#' @param df_predict data frame which will be used for mortality predictions
#' @param df_climate data frame with monthly climate data
#' @param mortality_share a value defining the proportion of the volume which is
#' to be the subject of mortality
#' @param mortality_share_type character, it can be 'volume' or 'n_trees'. If
#' 'volume' then the mortality share relates to total standing volume, if
#' 'n_trees' then mortality share relates to the total number of standing trees
#' @param include_climate logical, should climate variables be included as
#' predictors
#' @param site_vars a character vector of variable names which are used as site
#' descriptors
#' @param select_months_climate vector of subset months to be considered.
#' Default is c(1,12), which uses all months.
#' @param mortality_model logical, should the mortality model be evaluated
#' and returned as the output
#' @param nb_laplace value used for Laplace smoothing (additive smoothing) in
#' naive Bayes algorithm. Defaults to 0 (no Laplace smoothing).
#' @param sim_crownHeight logical, should crown heights be considered as a
#' predictor variable? If TRUE, a crownHeight column is expected in data_NFI
#' @param k the number of folds to be used in the k fold cross-validation
#' @param eval_model_mortality logical, should the mortality model be evaluated
#' and returned as the output
#' @param blocked_cv logical, should the blocked cross-validation be used in the
#' evaluation phase?
#' @param sim_mortality logical, should mortality be simulated?
#' @param sim_step_years the simulation step in years
#' @param rf_mtry number of variables randomly sampled as candidates at each
#' split of a random forest model. If NULL, default settings are applied.
#' @param df_max_size a data frame with the maximum BA values for each species.
#' If a tree exceeds this value, it dies.
#' @param ingrowth_codes numeric value or a vector of codes which refer to
#' ingrowth trees
#' @param include_mortality_BAI logical, should basal area increments (BAI) be
#' used as independent variable for predicting individual tree morality?
#' @param intermediate_print logical, if TRUE intermediate steps will be printed
#' while the mortality sub model is running
#' @param use_max_size_threshold logical - should the principle of maxium size
#' be applied?
#' @param mortality_bias_adjusted Logical (length-one). If `TRUE` (default),
#' applies a simple bias fix so large trees aren’t over-removed. The frequency
#' of adjustment is controlled by the `bias_adj_factor` argument. If `FALSE`,
#' predicted probabilities are left unchanged.
#' @param bias_adj_factor Integer (>= 2). Controls how sparsely you reduce
#' death probabilities among the top-ranked trees. Starting from the 3rd row,
#' every `bias_adj_factor`-th tree has its probability set to zero—so `2` keeps
#' every second high-risk tree alive, `3` every third, and so on.
#'
#' @return a list with three elements:
#' \enumerate{
#' \item $predicted_mortality - a data frame with updated tree status (code) based on the predicted mortality
#' \item $eval_mortality - a data frame with predicted and observed probabilities of dying for all individual trees, or character string indicating that mortality sub-model was not evaluated
#' \item $model_output - the output model for mortality
#' }
#'
#' @examples
#' data("data_v4")
#' data("data_mortality")
#' data("max_size_data")
#'
#' mortality_outputs <- predict_mortality(
#' df_fit = data_mortality,
#' df_predict = data_v4,
#' mortality_share_type = 'volume',
#' df_climate = data_climate,
#' site_vars = c("slope", "elevation", "northness", "siteIndex"),
#' sim_mortality = TRUE,
#' mortality_model = 'naiveBayes',
#' nb_laplace = 0,
#' sim_crownHeight = TRUE,
#' mortality_share = 0.02,
#' include_climate = TRUE,
#' select_months_climate = c(6,7,8),
#' eval_model_mortality = TRUE,
#' k = 10, blocked_cv = TRUE,
#' sim_step_years = 6,
#' df_max_size = max_size_data,
#' ingrowth_codes = c(3,15),
#' include_mortality_BAI = TRUE)
#'
#' df_predicted <- mortality_outputs$predicted_mortality
#' df_evaluation <- mortality_outputs$eval_mortality
#'
#' # confusion matrix
#' table(df_evaluation$mortality, round(df_evaluation$mortality_pred, 0))
#'
predict_mortality <- function(df_fit, df_predict, df_climate, mortality_share = NA,
mortality_share_type = "volume",
include_climate, site_vars, select_months_climate = c(6,8),
mortality_model = "rf", nb_laplace = 0, sim_crownHeight = FALSE,
k = 10, eval_model_mortality = TRUE, blocked_cv = TRUE,
sim_mortality = TRUE, sim_step_years = 5, rf_mtry = NULL,
df_max_size = NULL, ingrowth_codes = 3, include_mortality_BAI = TRUE,
intermediate_print = FALSE, use_max_size_threshold = FALSE,
mortality_bias_adjusted = TRUE, bias_adj_factor = 2
){
# Define global variables
year <- NULL
month <- NULL
plotID <- NULL
p_sum <- NULL
t_avg <- NULL
code <- NULL
treeID <- NULL
speciesGroup <- NULL
species <- NULL
mortality <- NULL
mortality_pred <- NULL
p_mortality <- NULL
BA <- NULL
BAL <- NULL
height <- NULL
crownHeight <- NULL
stand_BA <- NULL
stand_n <- NULL
col_sum <- NULL
vol_ha_mid <- NULL
BA_mid <- NULL
BA_max <- NULL
ranger <- NULL
BAI <- NULL
BAI_mid <- NULL
n_obs <- NULL
BA_bin <- NULL
n_tgt <- NULL
if (sim_mortality == TRUE){
if (include_climate == TRUE){
initial_colnames <- colnames(df_predict)
max_year_predict <- max(df_predict$year)
min_year_predict <- max_year_predict + sim_step_years
climate_predict <- dplyr::filter(df_climate, year %in% seq(min_year_predict, max_year_predict)) %>%
dplyr::filter(month %in% select_months_climate) %>%
group_by(plotID) %>% summarise(p_sum = sum(p_sum), t_avg = mean(t_avg))
df_predict$t_avg <- NULL
df_predict$p_sum <- NULL
df_predict <- merge(df_predict, climate_predict, by = "plotID")
df_predict <- dplyr::select(df_predict, all_of(initial_colnames))
site_vars <- c(site_vars, "p_sum", "t_avg")
}
df_predict <- dplyr::filter(df_predict, code %in% c(0,ingrowth_codes))
if (sim_crownHeight == TRUE){
df_fit <- mutate(df_fit, mortality = ifelse(code == 2, 1, 0),
BA_mid = BA,
BAL_mid = BAL,
BAI_mid = BAI,
height_mid = height,
crownHeight_mid = crownHeight,
stand_BA_mid = stand_BA,
stand_n_mid = stand_n)
} else {
df_fit <- mutate(df_fit, mortality = ifelse(code == 2, 1, 0),
BA_mid = BA,
BAL_mid = BAL,
height_mid = height,
# crownHeight_mid = crownHeight,
stand_BA_mid = stand_BA,
stand_n_mid = stand_n)
}
# YOU CAN MANUALLY SET THE MORTALITY SHARE
if (is.na(mortality_share)){
mortality_share <- sum(df_fit$code == 2)/nrow(df_fit)
if (intermediate_print == TRUE){
message(paste0("Estimated moratlity share is ", round(mortality_share, 2)))
}
}
if (sim_crownHeight == TRUE){
if (include_mortality_BAI == TRUE){
formula <- as.formula(paste0("mortality ~ BA_mid + BAI_mid + height_mid + crownHeight_mid + BAL_mid + stand_BA_mid + stand_n_mid + speciesGroup +",
paste(all_of(site_vars), collapse = "+")))
df_fit <- filter(df_fit, !is.na(BAI_mid))
} else {
formula <- as.formula(paste0("mortality ~ BA_mid + height_mid + crownHeight_mid + BAL_mid + stand_BA_mid + stand_n_mid + speciesGroup +",
paste(all_of(site_vars), collapse = "+")))
}
} else {
if (include_mortality_BAI == TRUE){
formula <- as.formula(paste0("mortality ~ BA_mid + BAI_mid + height_mid + BAL_mid + stand_BA_mid + stand_n_mid + speciesGroup +",
paste(all_of(site_vars), collapse = "+")))
df_fit <- filter(df_fit, !is.na(BAI_mid))
summary(df_fit)
} else {
formula <- as.formula(paste0("mortality ~ BA_mid + height_mid + BAL_mid + stand_BA_mid + stand_n_mid + speciesGroup +",
paste(all_of(site_vars), collapse = "+")))
}
}
##############
# Eval phase #
##############
if (eval_model_mortality == TRUE){
foldi <- seq(1:k)
folds <- cut(seq(1, nrow(df_fit)), breaks = k, labels = FALSE)
eval_list <- list()
if (blocked_cv == FALSE){
df_fit_cv <- df_fit[sample(nrow(df_fit)), ]
} else {
df_fit_cv <- df_fit
}
for (m in 1:k){
testIndexes <- which(folds == m, arr.ind = TRUE)
test <- df_fit_cv[testIndexes, ]
train <- df_fit_cv[-testIndexes, ]
if (mortality_model == "glm"){
model_mortality <- glm(formula, data = train, family = "binomial")
test$mortality_pred <- predict(model_mortality, test, type="response")
} else if (mortality_model == "rf"){
if (is.null(rf_mtry)){
model_mortality <- ranger(formula, data = train)
} else {
model_mortality <- ranger(formula, data = train, mtry = rf_mtry)
}
test$mortality_pred <- predict(model_mortality, test, type="response")$predictions
} else if (mortality_model == "naiveBayes") {
train$mortality <- factor(train$mortality)
model_mortality <- naive_bayes(formula, data = train, laplace = nb_laplace)
suppressWarnings(test$mortality_pred <- predict(model_mortality, test, type="prob")[,2])
} else {
stop("mortality_model should be 'glm', 'naiveBayes' or 'rf'")
}
eval_list[[m]] <- test
}
df_eval_mortality <- do.call(rbind, eval_list)
df_eval_mortality <- dplyr::select(df_eval_mortality, plotID, treeID, year, speciesGroup, code,
species, mortality, mortality_pred)
} else {
df_eval_mortality <- "the argument set_eval_mortality is set to FALSE"
}
####################
# Prediction phase #
####################
if (mortality_model == "glm"){
model_mortality <- glm(formula, data = df_fit, family = "binomial")
df_predict$p_mortality <- predict(model_mortality, df_predict, type="response")
} else if (mortality_model == "rf"){
if (is.null(rf_mtry)){
model_mortality <- ranger(formula, data = df_fit)
} else {
model_mortality <- ranger(formula, data = df_fit, mtry = rf_mtry)
}
df_predict$p_mortality <- predict(model_mortality, df_predict, type="response")$predictions
} else if (mortality_model == "naiveBayes"){
df_fit$mortality <- factor(df_fit$mortality)
model_mortality <- naive_bayes(formula, data = df_fit, laplace = nb_laplace)
suppressWarnings(df_predict$p_mortality <- predict(model_mortality, df_predict, type="prob")[,2])
} else {
stop("mortality_model should be 'glm', 'naiveBayes' or 'rf'")
}
if (use_max_size_threshold == TRUE){
df_predict <- merge(df_predict, df_max_size, by = 'species', all.x = TRUE)
df_predict <- dplyr::mutate(df_predict, p_mortality = ifelse(BA_mid > BA_max, 1, p_mortality),
BA_max = NULL
) %>%
arrange(-p_mortality)
} else {
df_predict <- df_predict %>%
arrange(-p_mortality)
}
#######################
# Balance predictions #
######################
# Mortality model could bias mortality predictions towards trees with larger DBH
if (mortality_bias_adjusted == TRUE){
if (!is.numeric(bias_adj_factor) || length(bias_adj_factor) != 1 ||
is.na(bias_adj_factor) || bias_adj_factor < 2) {
warning("`bias_adj_factor` must be a single integer >= 2; bias adjustment not applied.")
} else {
# assume numeric p_mortality and descending order already applied
p <- df_predict$p_mortality
# pick rows 3, 6, 9, ...
idx <- seq(3, length(p), by = bias_adj_factor)
# df_predict$p_mortality_adj <- p
df_predict$p_mortality[idx] <- 0
df_predict <- df_predict %>%
arrange(-p_mortality)
}
}
if (mortality_share_type == "volume"){
df_predict$vol_ha_mid <- df_predict$volume_mid * df_predict$weight_mid
volume_total <- sum(df_predict$vol_ha_mid, na.rm = TRUE)
df_predict <- mutate(df_predict,
col_sum = cumsum(replace_na(vol_ha_mid, 0)),
code = ifelse(col_sum < (volume_total * mortality_share), 2, 0))
df_predict[, "year"] <- df_predict[, "year"] + sim_step_years
df_predict$p_mortality <- NULL
df_predict$col_sum <- NULL
df_predict$vol_ha_mid <- NULL
} else if (mortality_share_type == "n_trees"){
cut_th <- round(nrow(df_predict) * mortality_share)
df_predict[c(1:cut_th), "code"] <- 2
df_predict[c((cut_th+1):nrow(df_predict)), "code"] <- 0
df_predict[, "year"] <- df_predict[, "year"] + sim_step_years
} else {
stop(paste0("mortality_share_type should be 'n_trees' or 'volume' but instead it is ", mortality_share_type))
}
} else if (sim_mortality == FALSE){
df_predict <- dplyr::filter(df_predict, code %in% c(0,3,15))
df_predict[, "year"] <- df_predict[, "year"] + sim_step_years
df_predict[ , "code"] <- 0
df_eval_mortality <- paste0("sim_mortality is set to FALSE.",
"Mortality is not simulated. eval_mortality is not available.")
model_mortality <- paste0("sim_mortality is set to FALSE.",
"Mortality is not simulated. model_mortality is not available.")
}
df_predict <- arrange(df_predict, plotID, treeID)
final_output_list <- list(
predicted_mortality = df_predict,
eval_mortality = df_eval_mortality,
model_output = model_mortality
)
return(final_output_list)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.