R/bm_ModelAnalysis.R

Defines functions .bm_TotalSumSquares .bm_PRESS .bm_ModelAnalysis.check.args bm_ModelAnalysis

Documented in bm_ModelAnalysis

###################################################################################################
##' @name bm_ModelAnalysis
##' @author Hélène Blancheteau
##' 
##' @title Analyze the residuals of the single models 
##' 
##' @description This function return differents graphs to help analyse the single models.
##' 
##' 
##' @param bm.mod a \code{BIOMOD.models.out}
##' @param models.chosen a \code{vector} containing model names to be kept, must be either 
##' \code{all} or a sub-selection of model names that can be obtained with the 
##' \code{\link{get_built_models}} function
##' @param do.plot a \code{logical}, print the plots or not 
##' @param color.by a \code{character} between "full.name", "species", "PA", "RUN" or "algo" 
##' 
##' @return
##' \enumerate{
##'   \item a plot residuals ~ observations to detect outliers
##'   \item a histogramm of the residuals 
##'   \item a qqplot of the residuals
##'   \item plot residuals ~ fitted
##'   \item a plot of the different Rsquared available (Rsquared, Rsquared_aj, Psquared)
##' }
##' 
##' 
##' @details 
##' All the plots will be made for the models, independently to the different models assumptions. 
##' It is up to the user to interpret the graphs in the light of the model assumptions.
##' 
##' 
##' 
##' @keywords analyze models residuals
##' 
##' 
##' @importFrom foreach foreach %do%
##' @importFrom ggplot2 ggplot geom_hline geom_point stat_qq stat_qq_line vars geom_histogram
##' @importFrom tidyr separate
##' 
##' @export
##' 
##' 
###################################################################################################


bm_ModelAnalysis <- function(bm.mod, 
                             models.chosen = 'all',
                             do.plot = TRUE, 
                             color.by = 'full.name')
{
  .bm_cat("Model analysis")
  cat("> Warning : All the plots will be made for the models, independently to the different models assumptions. 
      It is up to you to interpret the graphs in the light of the model assumptions.\n")
  
  args <- .bm_ModelAnalysis.check.args(bm.mod = bm.mod, models.chosen = models.chosen,
                                       color.by = color.by)
  for (argi in names(args)) { assign(x = argi, value = args[[argi]]) }
  rm(args)
  
  models.coherent <- grep("GAM|MARS|GLM|CTA", models.chosen, value = TRUE)
  
  cat("\n> Residuals computation")
  predictions <- get_predictions(bm.mod, full.name = models.chosen)
  observations <- data.frame(points = 1:max(predictions$points),
                             obs = get_formal_data(bm.mod, subinfo = "resp.var"))
  
  if(bm.mod@data.type == "binary"){
    predictions$pred <- predictions$pred/1000
  }
  
  if(bm.mod@data.type == "ordinal"){
    names_levels <- levels(observations$obs)
    
    predictions$pred <- as.character(predictions$pred)
    to_change <- predictions$pred[predictions$pred %in% names_levels] 
    to_change <- factor(to_change, levels = levels(observations$obs), ordered = T)
    predictions$pred[predictions$pred %in% names_levels] <- as.character(as.numeric(to_change))
    
    predictions$pred <- as.numeric(predictions$pred)
    observations$obs <- as.numeric(observations$obs)
  }
  
  res <- merge(predictions, observations)
  res$residuals <- res$obs - res$pred
  res <- tidyr::separate(res, col = "full.name", into = c("species", "PA", "RUN", "algo"), remove = FALSE)
  
  # res <- foreach(mod.name = models.coherent, .combine = rbind) %dopar% {
  #   cat("\n\t> Analysis", mod.name, "...")
  #   mod <- get(BIOMOD_LoadModels(bm.out = bm.mod, full.name = mod.name))
  #   infos <- .bm_return_info_analysys(mod)
  #   return(data.frame("full.name" = mod.name, infos))
  # }

  # if (do.Rpredicted){
  #   Psquared <- foreach(mod.name = models.chosen, .combine = c) %dopar% {
  #     cat("\n\t> Psquared", mod.name, "...")
  #     PRESS <- .bm_PRESS(bm.mod, mod.name, perc = 1) ## attention perc ? 
  #     TSS <- .bm_TotalSumSquares(bm.mod, mod.name)
  #     pred_Rsquared <- 1 - PRESS/TSS
  #     return(pred_Rsquared)
  #   }
  # } else {
  #   Psquared = NA
  # }
  

  ## Outliers
  plot_outliers <- ggplot(res, aes(y = residuals, x = points, color = .data[[color.by]]))+
    geom_point(size = 1) +
    geom_hline(yintercept = 0, color = "black", linewidth = 1) +
    xlab("Observations number") +
    ylab("Residuals") +
    theme(legend.title = element_blank()
          , legend.key = element_rect(fill = "white")
          , axis.text.x = element_text(angle = 45, hjust = 1))+
    guides(colour = guide_legend(override.aes = list(size = 2)))+
    paletteer::scale_color_paletteer_d(palette)+
    ggtitle("Visualising residuals: are there any outliers?")

  ## QQ plot of residuals

  plot_qqplot <- ggplot(res, aes(sample = residuals, color = .data[[color.by]])) +
    stat_qq(size = 1)+
    stat_qq_line(color = "black", size = 1) +
    xlab("Theoretical quantiles") +
    ylab("Standardized residuals") +
    theme(legend.title = element_blank()
          , legend.key = element_rect(fill = "white")
          , axis.text.x = element_text(angle = 45, hjust = 1))+
    guides(colour = guide_legend(override.aes = list(size = 2)))+
    paletteer::scale_color_paletteer_d(palette)+
    ggtitle("Q-Q plot of residuals")

  ## Histogramme of residuals

  plot_hist <- ggplot(res, aes(x = residuals, color = .data[[color.by]])) +
    facet_wrap(vars(full.name))+
    geom_histogram(fill = NA, position = "dodge", bins = 30) + 
    xlab("Residuals") +
    theme(legend.title = element_blank()
          , legend.key = element_rect(fill = "white")
          , axis.text.x = element_text(angle = 45, hjust = 1),
          , axis.title.y = element_blank(),
          , axis.text.y = element_blank(),
          , axis.ticks.y = element_blank())+
    paletteer::scale_color_paletteer_d(palette)+
    ggtitle("Distribution of residuals")

  ## Fitted vs resid

  plot_fitted <- ggplot(res, aes(y = residuals, x = pred, color = .data[[color.by]])) +
    geom_point(size = 1)+
    geom_hline(yintercept = 0, color = "black", size = 1) +
    xlab("Fitted") +
    ylab("Residuals") +
    theme(legend.title = element_blank()
          , legend.key = element_rect(fill = "white")
          , axis.text.x = element_text(angle = 45, hjust = 1))+
    guides(colour = guide_legend(override.aes = list(size = 2)))+
    paletteer::scale_color_paletteer_d(palette)+
    ggtitle("Residuals ~ Fitted values (Tukey-Anscombe plot)")
  
  if(!bm.mod@data.type %in% c("binary", "ordinal")){
    ## Rscore
    cat("\n> Rscore compilation")
    Rscore <- data.frame(full.name = models.chosen, metric = NA, value = NA)
    eval <- get_evaluations(bm.mod)
    if ("Rsquared" %in% unique(eval$metric.eval)){
      eval_Rsq <- eval[eval$full.name %in% models.chosen & eval$metric.eval == "Rsquared", c("full.name", "calibration", "validation")]
      Rscore <- rbind(Rscore, 
                        data.frame(full.name = eval_Rsq$full.name, metric = "Rsquared_calibration", value = eval_Rsq$calibration),
                        data.frame(full.name = eval_Rsq$full.name, metric = "Rsquared_validation", value = eval_Rsq$validation)) ##C'est tordu mais c'est pour pas appeler melt
    }
    if ("Rsquared_aj" %in% unique(eval$metric.eval)){
      eval_Rsq_aj <- eval[eval$full.name %in% models.chosen & eval$metric.eval == "Rsquared_aj", c("full.name", "calibration", "validation")]
      Rscore <- rbind(Rscore, 
                        data.frame(full.name = eval_Rsq_aj$full.name, metric = "Rsquared_aj_calibration", value = eval_Rsq_aj$calibration),
                        data.frame(full.name = eval_Rsq_aj$full.name, metric = "Rsquared_aj_validation", value = eval_Rsq_aj$validation)) 
    }
    Rscore <- tidyr::separate(Rscore, col = "full.name", into = c("species", "PA", "RUN", "algo"), remove = FALSE)
    Rscore <- Rscore[!is.na(Rscore$metric),]
    
    plot_Rscore <- ggplot(Rscore, aes(y= value, x = full.name, color = .data[[color.by]], shape = metric))+
      geom_point(size = 2)+
      ylim(0,1)+
      xlab("Models")+
      theme(legend.title = element_blank()
            , legend.key = element_rect(fill = "white")
            , axis.text.x = element_text(angle = 45, hjust = 1),
            , axis.title.y = element_blank()) +
      paletteer::scale_color_paletteer_d(palette)+
      ggtitle("R scores")

  } else {
    Rscore <- NULL
  }
  

  if (do.plot){
    plot(plot_outliers)
    plot(plot_qqplot)
    plot(plot_hist)
    plot(plot_fitted)
    if(!bm.mod@data.type %in% c("binary", "ordinal")){print(plot_Rscore)}
  }
  
  .bm_cat("Done")
  return(list(residuals = res,
              R_scores = Rscore))
}


###################################################################################################

# # .bm_return_info_analysys
# # Functions to get residuals and fitted values of a model
# # 
# # param mod a \code{biomod2_model}
# # importFrom stats fitted residuals
# # 
# # export
# 
# setGeneric(".bm_return_info_analysys", function(mod) { standardGeneric(".bm_return_info_analysys")})
# 
# setMethod(".bm_return_info_analysys", signature("CTA_biomod2_model"), function(mod) {
#   resids <- as.vector(residuals(mod@model)) 
#   fit <- as.vector(mod@model$y) #[,"y"]
#   fit <- fit-1
#   return(data.frame("obs" = 1:length(resids), "residuals" = resids, "fitted" = fit))
# })
# 
# setMethod(".bm_return_info_analysys", signature("GAM_biomod2_model"), function(mod) {
#   resids <- as.vector(residuals(mod@model)) 
#   fit <- as.vector(fitted(mod@model))
#   return(data.frame("obs" = 1:length(resids), "residuals" = resids, "fitted" = fit))
# })
# 
# setMethod(".bm_return_info_analysys", signature("GLM_biomod2_model"), function(mod) {
#   resids <- as.vector(residuals(mod@model)) 
#   fit <- as.vector(fitted(mod@model))
#   return(data.frame("obs" = 1:length(resids), "residuals" = resids, "fitted" = fit))
# })
# 
# setMethod(".bm_return_info_analysys", signature("MARS_biomod2_model"), function(mod) {
#   resids <- as.vector(residuals(mod@model)) 
#   fit <- as.vector(fitted(mod@model))
#   return(data.frame( "obs" = 1:length(resids), "residuals" = resids, "fitted" = fit))
# })


###################################################################################################

.bm_ModelAnalysis.check.args <- function(bm.mod, models.chosen, color.by)
{
  .fun_testIfInherits(TRUE, "bm.mod", bm.mod, "BIOMOD.models.out")
  
  ## Stop if ordinal -------------------------------------------------------
  # if (bm.mod@data.type %in% c("ordinal", "binary")){
  #   stop("Model analysis is not ready for binary or ordinal data...")
  # }
  
  ## Check models.chosen ---------------------------------------------------
  if (models.chosen[1] == 'all') {
    models.chosen <- bm.mod@models.computed
  } else {
    models.chosen <- intersect(models.chosen, bm.mod@models.computed)
  }
  if (length(models.chosen) < 1) {
    stop('No models selected')
  }
  
  ## check that given models exist
  files.check <- paste0(bm.mod@dir.name, "/", bm.mod@sp.name, "/models/",
                        bm.mod@modeling.id, "/", models.chosen)

  not.checked.files <- grep('MAXENT|SRE', files.check)
  if (length(not.checked.files) > 0) {
    files.check <- files.check[-not.checked.files]
  }
  missing.files <- files.check[!file.exists(files.check)]
  if (length(missing.files) > 0) {
    stop(paste0("Models files missing : ", toString(missing.files)))
    if (length(missing.files) == length(files.check)) {
      stop("Impossible to find any models, might be a problem of working directory")
    }
  }
  
  ##check_color_by
  .fun_testIfIn(TRUE, "color.by", color.by, c("full.name", "species", "PA", "RUN", "algo" ))
  
  if(color.by == "species"){
    palette <- "ggthemes::Green_Orange_Teal"
  } else if (color.by == "algo"){
    palette <- "tvthemes::parksAndRec"
  } else if (color.by == "RUN"){
    palette <- "ggthemes::Classic_20"
  } else if (color.by == "PA"){
    palette <- "ggthemes::Tableau_20"
  } else {
    palette <- "ggsci::default_igv"
  }
  return(list(models.chosen = models.chosen,
              palette = palette))
}


###################################################################################################

.bm_PRESS <- function(bm.mod, mod.name, perc = 0.5) { #perc --> percentage of points evaluate for overfitting 
  
  ## Data
  dataset <- paste0("_", unlist(strsplit(mod.name, "_"))[2:3], collapse = "")
  algo <- unlist(strsplit(mod.name, "_"))[4]
  
  bm.format <- get_formal_data(bm.mod)
  mySpExpl <- get_species_data(bm.format)
  calib.lines <- as.data.frame(get_calib_lines(bm.mod))
  mySpExpl <- mySpExpl[which(calib.lines[,dataset] == TRUE), ]
  
  bm.options <- get_options(bm.mod)
  bm.options <- bm.options@options[[grep(algo, names(bm.options@options))]] ### Warning grep !! 
  
  ## Boucle 
  sample_points <- sample(1:nrow(mySpExpl), nrow(mySpExpl)*perc)
  model.call <- paste0(bm.options@package, "::", bm.options@func)
  
  residuals_predicted <- foreach(point = sample_points, .combine = c) %do% {
    argstmp <- bm.options@args.values[[dataset]]
    argstmp$data <- mySpExpl[-point, ]
    if (algo %in% c("MARS", "RF")){
      formu <- bm_MakeFormula(resp.name = names(mySpExpl)[1], expl.var = mySpExpl[1, 4:ncol(mySpExpl)], type = "simple", interaction.level = 0)
      argstmp$formula <- formu
    }
    # if(algo == "XGBOOST"){
    #   argstmp$label <- argstmp$data[,1]
    #   argstmp$data <- as.matrix(argstmp$data[, 4:ncol(mySpExpl)])
    #   argstmp <- argstmp[c("data", names(argstmp)[which(!(names(argstmp) %in% c("formula", "data")))])]
    # } 
    argstmp <- argstmp[c("formula","data", names(argstmp)[which(!(names(argstmp) %in% c("formula", "data")))])]
    
    
    new.model <- try(do.call(eval(parse(text = model.call)), argstmp), silent = TRUE)
    new.res <- NA
    if (inherits(new.model, 'try-error') != T){
      new.pred <- predict(new.model, mySpExpl[point, ])
      new.res <- mySpExpl[point, 1] - new.pred
    } 
    
    return(new.res)
  }## Return residuals x(-i)
  
  ## Calcul PRESS + R predicted
  PRESS <- sum(residuals_predicted ^ 2)
  return(PRESS)
}
  
  
## TSS with bm_FindOptimStat MAIS on a déjà TSS dans bm_FOS !!
## Pourquoi tout ces acronymes sont les mêmes ?!?

.bm_TotalSumSquares <- function(bm.mod, mod.name){
  dataset <- paste0("_", unlist(strsplit(mod.name, "_"))[2:3], collapse = "")
  bm.format <- get_formal_data(bm.mod)
  mySpExpl <- get_species_data(bm.format)
  calib.lines <- as.data.frame(get_calib_lines(bm.mod))
  mySpExpl <- mySpExpl[which(calib.lines[,dataset] == TRUE), ]
  
  #fit <- tab_res[tab_res$full.name == mod.name, "fitted"]
  y <- mySpExpl[, 1]
  y_mean <- mean(y, na.rm = T)
  TSS <- sum((y - y_mean) ^ 2)
  return(TSS)
}



## Il y a un problème 
## Peut être pas
biomodhub/biomod2 documentation built on March 29, 2025, 1:14 p.m.