#' @title Order fitted models
#' @description Find the models that outperform the others in terms of cross-validation index
#'
#' @param saveWD directory where outputs of the cross-validation procedure have
#' been saved or zip.file of this directory.
#' All necessary information has been saved in this directory to be
#' able to compile results. This is the only output of findBestModel in the
#' global R environment.
#' @param plot logical. Whether to create the output plots or not. Saved
#' directly in saveWD.
#' @param zip.file TRUE (default) to save all outputs in a zipfile with saveWD,
#' FALSE for no zipfile output, or path to a new zip file
#' @param cl a cluster as made with \code{\link[snow]{makeCluster}}.
#' If empty, nbclust in \code{\link{modelselect_opt}} will be used.
#'
#' @importFrom grDevices heat.colors png dev.off
#' @import graphics
#'
#' @details
#' According to the family of model, the index of cross-validation procedure is
#' selected to rank all models fitted for all modeltypes conjointly. Model ranks
#' are then statistically compared. Model with lowest ranks not statistically
#' different than the first ranked model are kept.
#'
#' @return
#' VeryBestModels_crossV: Best models among all
#' BestForModeltypes: Best models among all in the modeltypes type
#' Figures are produced to visualize the ranking and possibly help with a choice
#'
#' @export
ModelOrder <- function(saveWD, plot, zip.file = TRUE, cl = NULL)
{
  saveWD <- normalizePath(saveWD)
  # saveWD is either the directory of cross-validation outputs or a zip file
  # of that directory.
  # Fix: test for an anchored ".zip" extension instead of "zip" appearing
  # anywhere in the path, and strip only the trailing ".zip" (the original
  # gsub(".zip", ...) treated the dot as a regex wildcard).
  if (utils::file_test("-f", saveWD) && grepl("\\.zip$", saveWD)) {
    if (zip.file == TRUE) {zip.file <- saveWD}
    utils::unzip(saveWD, exdir = sub("\\.zip$", "", saveWD))
    saveWD <- sub("\\.zip$", "", saveWD)
  } else if (utils::file_test("-d", saveWD)) {
    if (zip.file == TRUE) {zip.file <- paste0(saveWD, ".zip")}
  } else {
    stop("saveWD is neither an existing directory nor a zip file")
  }
  imgprefix <- paste0(basename(saveWD), "-")
  saveWD <- normalizePath(saveWD)
  lim_pvalue_final <- modelselect_opt$lim_pvalue_final
  # Load variables of model configuration that need to be equivalent as fit
  opt_saved <- readr::read_rds(paste0(saveWD, "/modelselect_opt.save.rds"))
  seqthd <- opt_saved$seqthd
  # Load information common to all fitted models
  Allinfo_all <- readr::read_rds(paste0(saveWD, "/Allinfo_all.rds"))
  datatype <- Allinfo_all$datatype
  modeltypes <- Allinfo_all$modeltypes
  # Combine results of the different modeltypes ----
  # Verify that the random sub-datasets used for cross-validation are the
  # same for all modeltypes: model ranks can only be compared on paired
  # cross-validation samples.
  allM <- paste0("M", seq_along(modeltypes))
  for (MX in seq_along(allM)) {
    saveAlea <- readr::read_rds(paste0(saveWD, "/saveAlea", allM[MX], ".rds"))
    if (MX == 1) {
      saveAleaM1 <- saveAlea
    }
    if (MX >= 2) {
      if (!identical(saveAleaM1, saveAlea)) {
        # Fix: stop() concatenates its arguments without separator; spaces
        # added so the message reads correctly.
        stop("Models of ", modeltypes[1], " and ", modeltypes[MX],
             " have not been fitted with the same cross-validation random subdatasets")
      }
    }
  }
  saveAlea <- saveAleaM1
  saveAlea.nb <- readr::read_rds(paste0(saveWD, "/saveAlea.nbM1.rds"))
  # Combine all results. Accumulators start as zero-length vectors and are
  # grown with the outputs of each modeltype in turn.
  resAIC_unlist <- logical(0)
  test_models_unlist <- logical(0)
  ToutResDev_crossV <- logical(0)
  ToutAUC_crossV <- logical(0)
  ToutRMSE_crossV <- logical(0)
  nb_models_per_nb <- logical(0)
  nb_per_modeltypes <- logical(0)
  BestTHD_crossV_unlist <- logical(0)
  DiffSelSpeTHD_crossV_unlist <- logical(0)
  MinUn_crossV <- logical(0)
  MaxZero_crossV <- logical(0)
  MeanPercentError_crossV <- logical(0)
  l_Max_Nb_Var <- logical(0)
  ToutLogl_crossV <- logical(0)
  resParam_unlist <- logical(0)
  for (MX in seq_along(allM)) {
    # AIC of all models of this modeltype
    resAIC_save <- readr::read_rds(paste0(saveWD, "/resAIC_save", allM[MX], ".rds"))
    resAIC_unlist <- c(resAIC_unlist, unlist(resAIC_save))
    nb_per_modeltypes <- c(nb_per_modeltypes, length(unlist(resAIC_save)))
    # Tweedie XI / KrigeGLM Lambda parameters when relevant, NA otherwise
    if (grepl("TweedGLM|KrigeGLM", modeltypes[MX])) {
      resParam_save <- readr::read_rds(paste0(saveWD, "/resParam_save", allM[MX], ".rds"))
      resParam_unlist <- c(resParam_unlist, unlist(resParam_save))
    } else {
      resParam_unlist <- c(resParam_unlist, rep(NA, length(unlist(resAIC_save))))
    }
    # Model formulas tested for this modeltype
    test_models <- readr::read_rds(paste0(saveWD, "/test_models", allM[MX], ".rds"))
    test_models_unlist_A <- unlist(test_models)
    test_models_unlist <- c(test_models_unlist, test_models_unlist_A)
    # Cross-validation statistics: one array per number of covariates, with
    # statistics in the first dimension, models in the second and crossV
    # samples in the third.
    Output_models <- readr::read_rds(
      paste0(saveWD, "/Output_models", allM[MX], ".rds"))
    # Row 4: residual deviance
    ToutResDev_crossV_A <- purrr::map(Output_models, ~.[4,,])
    ToutResDev_crossV_A2 <- do.call(rbind, ToutResDev_crossV_A)
    ToutResDev_crossV <- rbind(ToutResDev_crossV, ToutResDev_crossV_A2)
    l_Max_Nb_Var <- c(l_Max_Nb_Var, length(ToutResDev_crossV_A))
    if (grepl("PA|TweedGLM", modeltypes[MX])) {
      # AUC, stored after the 7 fixed rows and the seqthd threshold rows
      ToutAUC_crossV_A <- purrr::map(Output_models, ~.[7 + length(seqthd) + 1,,])
      ToutAUC_crossV_A2 <- do.call(rbind, ToutAUC_crossV_A)
      ToutAUC_crossV <- rbind(ToutAUC_crossV, ToutAUC_crossV_A2)
    }
    if (grepl("Cont|Count|Gamma", modeltypes[MX])) {
      # Row 5: mean percentage of error
      MeanPercentError_crossV_A <- purrr::map(Output_models, ~.[5,,])
      MeanPercentError_crossV_A2 <- do.call(rbind, MeanPercentError_crossV_A)
      MeanPercentError_crossV <- rbind(MeanPercentError_crossV,
                                       MeanPercentError_crossV_A2)
    }
    # RMSE row position depends on whether threshold/AUC rows are stored
    if (grepl("PA|TweedGLM", modeltypes[MX])) {
      N_RMSE <- 7 + length(seqthd) + 2
    } else {
      N_RMSE <- 6
    }
    ToutRMSE_crossV_A <- purrr::map(Output_models, ~.[N_RMSE,,])
    ToutRMSE_crossV_A2 <- do.call(rbind, ToutRMSE_crossV_A)
    ToutRMSE_crossV <- rbind(ToutRMSE_crossV, ToutRMSE_crossV_A2)
    nb_models_per_nb <- c(nb_models_per_nb, unlist(lapply(ToutRMSE_crossV_A, nrow)))
    if (grepl("PA|TweedGLM", modeltypes[MX])) {
      # Best probability thresholds found during cross-validation
      BestTHD_crossV <- readr::read_rds(
        paste0(saveWD, "/BestTHD_crossV", allM[MX], ".rds"))
      DiffSelSpeTHD_crossV <- readr::read_rds(
        paste0(saveWD, "/DiffSelSpeTHD_crossV", allM[MX], ".rds"))
      BestTHD_crossV_unlist <- c(BestTHD_crossV_unlist, unlist(BestTHD_crossV))
      DiffSelSpeTHD_crossV_unlist_A <- do.call(rbind, DiffSelSpeTHD_crossV)
      DiffSelSpeTHD_crossV_unlist <- rbind(DiffSelSpeTHD_crossV_unlist,
                                           100 * DiffSelSpeTHD_crossV_unlist_A/dim(saveAlea)[1])
      # Row 5: minimum predicted value where presence is observed
      MinUn_crossV_A <- purrr::map(Output_models, ~.[5,,])
      MinUn_crossV_A2 <- do.call(rbind, MinUn_crossV_A)
      MinUn_crossV <- rbind(MinUn_crossV, MinUn_crossV_A2)
      # Row 6: maximum predicted value where absence is observed
      MaxZero_crossV_A <- purrr::map(Output_models, ~.[6,,])
      MaxZero_crossV_A2 <- do.call(rbind, MaxZero_crossV_A)
      MaxZero_crossV <- rbind(MaxZero_crossV, MaxZero_crossV_A2)
    }
    if (grepl("TweedGLM", modeltypes[MX])) {
      # Log-likelihood of the tweedie models
      ToutLogl_crossV_A <- purrr::map(Output_models, ~.[7 + length(seqthd) + 3,,])
      ToutLogl_crossV_A2 <- do.call(rbind, ToutLogl_crossV_A)
      ToutLogl_crossV <- rbind(ToutLogl_crossV, ToutLogl_crossV_A2)
    }
  } # END of allMX
  # Fix: these objects were previously re-saved at each iteration of the
  # loop over modeltypes; save them once, complete, after the loop. These
  # files are re-used by model_select().
  save(BestTHD_crossV_unlist, file = paste0(saveWD, "/BestTHD_crossV_unlist.RData"))
  save(MinUn_crossV, file = paste0(saveWD, "/MinUn_crossV.RData"))
  save(MaxZero_crossV, file = paste0(saveWD, "/MaxZero_crossV.RData"))
  if (grepl("Krige", datatype)) {
    # Fix: 'Model' was not defined in this scope; it is stored with the
    # saved model-selection options (as read in ModelResults).
    allmodeltypes <- rep(paste(modeltypes, opt_saved$Model, sep = "_"),
                         nb_per_modeltypes)
  } else {
    allmodeltypes <- rep(modeltypes, nb_per_modeltypes)
  }
  # Fig for the choice of the best thd for 'PA' pred ----
  if (plot && grepl("PA", datatype))
  {
    png(filename = paste0(saveWD, "/", imgprefix, "Thresholds_for_prediction.png"),
        width = 18, height = 8, units = "cm", pointsize = 7, res = 300)
    par(mfrow = c(1, 3))
    MinUn_crossV[MinUn_crossV == "Inf"] <- 1
    for (i in c("BestTHD_crossV_unlist", "MinUn_crossV", "MaxZero_crossV")) {
      if (i == "BestTHD_crossV_unlist") {
        histSeuil <- hist(BestTHD_crossV_unlist, main = i)
      } else {
        histSeuil <- hist(apply(get(i), 1,
                                function(x) mean(x, na.rm = TRUE)),
                          main = i)
      }
      abline(v = 0.5, col = "blue", lwd = 2)
      # Mode of the distribution, suggested as a threshold
      SeuilFromHist_crossV <- histSeuil$mids[which(histSeuil$density ==
                                                     max(histSeuil$density))]
      abline(v = SeuilFromHist_crossV, col = "red", lwd = 2)
      axis(1, line = 1, at = SeuilFromHist_crossV, col = "red",
           col.axis = "red")
    }
    dev.off()
  } # end of datatype == 'PA'
  # Choose the very best models ----
  # Define the ranking index depending on datatype. Lower is always better,
  # so AUC is negated for presence-absence data (minus_index restores the
  # original sign for outputs).
  minus_index <- 1
  if (grepl("PA", datatype)) {
    IndexForRank_crossV <- -ToutAUC_crossV
    minus_index <- -1
    nameIndex <- "Mean_AUC_crossV"
  } else if (grepl("Cont", datatype)) {
    IndexForRank_crossV <- ToutRMSE_crossV
    nameIndex <- "Mean_RMSE_crossV"
  } else if (grepl("Count|Krige", datatype)) {
    IndexForRank_crossV <- ToutRMSE_crossV
    nameIndex <- "Mean_RMSE_crossV"
  } else if (grepl("Tweed", datatype)) {
    IndexForRank_crossV <- ToutRMSE_crossV
    nameIndex <- "Mean_RMSE_crossV"
  }
  # Cap infinite values so that rank comparisons remain possible
  IndexForRank_crossV[is.infinite(IndexForRank_crossV) &
                        IndexForRank_crossV > 0] <- 1e+15
  IndexForRank_crossV[is.infinite(IndexForRank_crossV) &
                        IndexForRank_crossV < 0] <- -1e+15
  # Rank models ----
  # according to their quality of prediction for each cross-validation
  # sub-dataset
  IndexForRank_crossVMean <- rowMeans(IndexForRank_crossV, na.rm = TRUE)
  ToutRMSE_crossVMean <- rowMeans(ToutRMSE_crossV, na.rm = TRUE)
  # The best model is the one which is the best on average for all crossV
  # samples. The model that is always smaller than the others is not
  # necessarily the one with the smallest average rank (ties in ranking) nor
  # the one with the smallest average value (comparison tests are paired and
  # there are NA values). Models with too many NA are removed from the
  # analysis. To avoid comparing all distributions against all distributions,
  # an iterative procedure is used (also better for memory):
  # 1/ The model with the smallest mean is set as the best model
  # 2/ Mean comparison is done for all against this model
  # 3/ If a model shows a mean difference lower than zero, the lowest is
  #    retained and we return to 2/
  best.distri <- best_distri(x = IndexForRank_crossV, w = saveAlea.nb,
                             test = "wilcoxon", na.max = 0.5,
                             p.min = lim_pvalue_final, silent = TRUE,
                             cl = cl)
  orderModels <- best.distri$orderModels
  # Models not statistically worse than the first ranked one
  Line_keep <- best.distri$orderModels[which(best.distri$p.min.test)]
  DiffTo1 <- meandiff_distri(x = IndexForRank_crossV[orderModels,],
                             n = 1, w = saveAlea.nb, comp.n = TRUE)
  # Figures of ranks on Percent Errors ----
  if (plot)
  {
    # Rank models, select the best model and the best ones that are not
    # significantly different from the 1st
    if (grepl("PA", datatype)) {
      titre <- "AUC on crossV"
    } else {
      titre <- "Root of Mean Squared Error on crossV"
    }
    png(filename = paste0(saveWD, "/", imgprefix, "Ordered_Models.png"),
        width = 50, height = 8, units = "cm", pointsize = 5, res = 300)
    par(mai = c(0.2, 0.4, 0.25, 0.3))
    plot(-10,-10, main = "Mean difference of index with 1st model",
         ylim = c(quantile(c(DiffTo1$comp.n), probs = 0.005, na.rm = TRUE),
                  quantile(c(DiffTo1$comp.n), probs = 0.995, na.rm = TRUE)),
         pch = 20, xlab = "", ylab = "Mean difference with 1st model",
         xlim = c(0,
                  nrow(IndexForRank_crossV) + 1),
         xaxs = "i")
    # Fix: 'pch' was passed twice to boxplot(); the duplicate is removed.
    res <- apply(t(1:nrow(DiffTo1$comp.n)), 2, function(x) {
      boxplot(DiffTo1$comp.n[x, ], at = x, pch = 20,
              border = c("black", "forestgreen")[1 + best.distri$p.min.test[x]],
              cex = 0.5, yaxt = "n", xaxt = "n",
              outline = TRUE, add = TRUE)
    })
    points(DiffTo1$means, col = "green", pch = 20, cex = 0.5)
    abline(h = 0, col = "grey", lty = "dashed")
    par(new = TRUE)
    plot(best.distri$p.values, pch = 20, col = "red",
         xaxt = "n", yaxt = "n", cex = 0.5, xaxs = "i",
         xlim = c(0, nrow(IndexForRank_crossV) + 1),
         ylim = c(0,1), yaxs = "i", xlab = "", ylab = "")
    axis(4, col.axis = "orangered")
    mtext("p-value", side = 4, line = 2)
    abline(h = c(lim_pvalue_final), col = "orangered", lwd = 2)
    abline(v = max(which(best.distri$p.min.test)) + 0.5, col = "green")
    par(new = FALSE)
    dev.off()
    IndexForRank_crossV.plot <- minus_index * IndexForRank_crossV
    png(filename = paste0(saveWD, "/", imgprefix, "Index_Error_on_crossV.png"),
        width = 50, height = 8, units = "cm", pointsize = 5, res = 300)
    plot(-10,-10, main = titre,
         ylim = c(stats::quantile(c(IndexForRank_crossV.plot), probs = 0.005, na.rm = TRUE),
                  stats::quantile(c(IndexForRank_crossV.plot), probs = 0.995, na.rm = TRUE)),
         pch = 20, xlab = "", ylab = gsub("_", " ", nameIndex),
         xlim = c(0, nrow(IndexForRank_crossV.plot) + 1),
         xaxs = "i")
    axis(1)
    # Using apply accelerates calculation for the table of models and
    # reduces use of RAM
    res <- apply(t(1:nrow(IndexForRank_crossV.plot)), 2, function(x) {
      boxplot(IndexForRank_crossV.plot[x, ], at = x, pch = 20,
              col = c("grey", "forestgreen")[1 + x %in% Line_keep],
              border = c("black", heat.colors(length(Line_keep)))[
                c(which(Line_keep %in% x) + 1, 1)[1]], yaxt = "n", xaxt = "n",
              outline = FALSE, add = TRUE)
      if (x %in% Line_keep) {
        axis(1, at = x, labels = "", line = 1.5,
             col = heat.colors(length(Line_keep))[which(Line_keep %in% x)],
             col.axis = heat.colors(length(Line_keep))[which(Line_keep %in% x)])
      }
    })
    rm(res)
    gc()
    axis(1, at = Line_keep[1], labels = Line_keep[1], line = 2, col = "red",
         col.axis = "red")
    # Add a line to separate models with 1, 2, 3 ... parameters
    for (i in seq_along(nb_models_per_nb)) {
      abline(v = c(0, sum(nb_models_per_nb[1:i])) + 0.5, col = "blue",
             lwd = 2)
    }
    abline(v = 0.5, col = "red", lwd = 3)
    mtext(modeltypes[1], side = 1, at = 0.5, cex = 1.5)
    if (length(l_Max_Nb_Var) > 1) {
      # Separate the different modeltypes with red lines
      for (i in 1:(length(l_Max_Nb_Var) - 1)) {
        pos <- cumsum(l_Max_Nb_Var)[i]
        abline(v = sum(nb_models_per_nb[1:pos]) + 0.5, col = "red",
               lwd = 3)
        mtext(modeltypes[i+1], side = 1,
              at = sum(nb_models_per_nb[1:pos]) + 0.5, cex = 1.5)
      }
    }
    dev.off()
  } # end of plot
  # Create table with the models ranked ----
  # and with the non-statistically differentiated best models
  if (grepl("RMSE", nameIndex)) {
    VeryBestModels_crossV_A <- data.frame(
      Num = 1:length(test_models_unlist),
      model = test_models_unlist,
      nameIndex = round(IndexForRank_crossVMean, digits = 2),
      AIC = round(resAIC_unlist)
    )
  } else {
    VeryBestModels_crossV_A <- data.frame(
      Num = 1:length(test_models_unlist),
      model = test_models_unlist,
      nameIndex = minus_index * round(IndexForRank_crossVMean, digits = 2),
      Mean_RMSE_crossV = round(ToutRMSE_crossVMean, digits = 2),
      AIC = round(resAIC_unlist)
    )
  }
  # With tweedie, selection is made on RMSE but we also report the AUC of
  # the binomial part
  if (grepl("Tweed", datatype)) {
    # Fix: 'ToutAUC_crossVMean' was never computed in the original code,
    # which made this branch fail at runtime for Tweed data; report the row
    # means of the stored AUC values.
    VeryBestModels_crossV_A$Mean_AUC_crossV <- rowMeans(ToutAUC_crossV, na.rm = TRUE)
  }
  # We keep XI for tweedie or Lambda for KrigeGLM
  if (length(grep("TweedGLM|KrigeGLM", modeltypes)) > 0) {
    VeryBestModels_crossV_A$Param <- resParam_unlist
  }
  VeryBestModels_crossV_A$modeltype <- allmodeltypes
  names(VeryBestModels_crossV_A)[
    which(names(VeryBestModels_crossV_A) == "nameIndex")] <- nameIndex
  # Ensure the index columns really are numeric (modeltype excluded)
  for (i in 3:(ncol(VeryBestModels_crossV_A) - 1)) {
    VeryBestModels_crossV_A[, i] <- as.numeric(as.character(
      VeryBestModels_crossV_A[, i]))
  }
  VeryBestModels_crossV <- VeryBestModels_crossV_A[Line_keep, ]
  # Save all models ordered
  AllModels_crossV_ordered <- VeryBestModels_crossV_A[orderModels, ]
  utils::write.table(AllModels_crossV_ordered,
                     file = paste0(saveWD, "/", imgprefix,
                                   "AllModels_crossV_ordered.csv"),
                     sep = ";", col.names = TRUE, row.names = FALSE)
  LimitNb <- nrow(VeryBestModels_crossV)
  # List of covariates ranked ----
  # with higher score when on the top of the list
  nChar_crossV <- logical(0)
  sumTot_crossV <- 0
  for (i in 1:LimitNb) {
    # Number of tokens in the formula of model i
    nChar_crossV[i] <- length(strsplit(gsub(", ", ",", as.character(
      VeryBestModels_crossV[, "model"][i])), split = " ")[[1]])
    sumTot_crossV <- sumTot_crossV + i
  }
  # Score of each covariate: sum of the inverse ranks of the kept models
  # that use it, expressed as a percentage of the maximum possible score
  freq_fact_crossV2 <- tapply(rep(c(LimitNb:1), nChar_crossV),
                              unlist(strsplit(gsub(", ", ",", as.character(
                                VeryBestModels_crossV[, "model"])),
                                split = " ")), sum)
  ScoreTab_A <- t(t(round(t(t(freq_fact_crossV2[
    order(freq_fact_crossV2, decreasing = TRUE)]))/sumTot_crossV * 100)))
  # Remove formula tokens that are not covariates
  ScoreTab <- t(t(ScoreTab_A[-which(rownames(ScoreTab_A) %in%
                                      c("+", "~", "dataY", "log(dataY)")), ]))
  utils::write.table(ScoreTab, file = paste0(saveWD, "/", imgprefix, "ScoreTab_VeryBest.csv"),
                     sep = ";", col.names = TRUE, row.names = TRUE)
  freq_fact_crossVlist2 <- strsplit(gsub(", ", ",", as.character(
    VeryBestModels_crossV[, "model"])), split = " ")
  # Compare models expression to the first 20 covariates chosen by the
  # best models
  NbParam <- min(20, nrow(ScoreTab))
  ParamMat <- matrix(NA, nrow = NbParam, ncol = nrow(VeryBestModels_crossV))
  for (i in 1:nrow(VeryBestModels_crossV)) {
    for (j in 1:NbParam) {
      if (sum(freq_fact_crossVlist2[[i]] %in% rownames(ScoreTab)[j]) >= 1) {
        ParamMat[j, i] <- 1
      }
    }
  }
  if (plot) {
    png(filename = paste0(saveWD, "/", imgprefix, "Help_for_model_Choice_NbParam-",
                          NbParam, ".png"),
        width = 18, height = 9, units = "cm",
        pointsize = 7, res = 300)
    # Left margin wide enough for the longest covariate name
    mai_left <- max(strwidth(c(paste0(nameIndex),
                               paste0(rownames(ScoreTab)[NbParam:1], " ")),
                             cex = 1, units = "inches"))
    par(mai = c(0.3, mai_left + 0.1, 0.7, 0.4))
    image(1:ncol(ParamMat), 1:nrow(ParamMat), t(ParamMat[nrow(ParamMat):1,]),
          xaxt = "n", yaxt = "n", xlab = "", ylab = "", col = "forestgreen")
    axis(2, at = 1:nrow(ParamMat), rownames(ScoreTab)[NbParam:1], las = 1)
    axis(3, at = 1:ncol(ParamMat), labels = NA, las = 2)
    axis(3, at = 1:ncol(ParamMat)-0.2, VeryBestModels_crossV[, "Num"],
         las = 2, tick = FALSE, lwd = 0)
    axis(3, at = 1:ncol(ParamMat)+0.2, VeryBestModels_crossV[, "modeltype"],
         las = 2, cex.axis = 0.6, tick = FALSE, lwd = 0)
    axis(1, at = 1:ncol(ParamMat), labels = VeryBestModels_crossV[,nameIndex],
         tick = FALSE, las = 2, cex.axis = 0.8)
    abline(v = (1:ncol(ParamMat)) + 0.5)
    abline(h = (1:nrow(ParamMat)) + 0.5)
    arrows(0.5, nrow(ParamMat) + (nrow(ParamMat)/4.5), ncol(ParamMat) + 0.5,
           nrow(ParamMat) + (nrow(ParamMat)/4.5), xpd = TRUE,
           angle = 20, length = 0.15)
    text((ncol(ParamMat) + 0.5)/2, nrow(ParamMat) + (nrow(ParamMat)/4),
         "+++ From Best to less best model +",
         xpd = TRUE, cex = 1.5)
    arrows(ncol(ParamMat) + 0.5 + 0.03 * ncol(ParamMat), nrow(ParamMat) + 0.5,
           ncol(ParamMat) + 0.5 + 0.03 * ncol(ParamMat),
           0.5, xpd = TRUE, angle = 15, length = 0.1)
    text(ncol(ParamMat) + 0.5 + 0.05 * ncol(ParamMat), 0.5 * nrow(ParamMat) + 0.5,
         "+++ From Best to less best factor +",
         xpd = TRUE, cex = 1.5, srt = 270)
    mtext(paste0(nameIndex, ": "), side = 1, line = 1,
          at = 0.5, adj = 1, cex = 1)
    dev.off()
  } # end of plot
  # Save the table of the very best models
  utils::write.table(VeryBestModels_crossV,
                     file = paste0(saveWD, "/", imgprefix,
                                   "VeryBestModels_crossV.csv"),
                     sep = ";", col.names = TRUE, row.names = FALSE)
  save(VeryBestModels_crossV,
       file = paste0(saveWD, "/", imgprefix, "VeryBestModels_crossV.RData"))
  # Save the 2 best models for each modeltypes
  if (length(modeltypes) > 1) {
    w.bestmodeltype <- NULL
    for (modeltype in levels(as.factor(modeltypes))) {
      w.bestmodeltype <- c(w.bestmodeltype, orderModels[which(
        VeryBestModels_crossV_A[orderModels, "modeltype"] == modeltype)[1:2]])
    }
    BestForModeltypes <- VeryBestModels_crossV_A[w.bestmodeltype, ]
    if (grepl("TweedGLM|KrigeGLM", datatype)) {
      BestForModeltypes <- cbind(BestForModeltypes, resParam_unlist[w.bestmodeltype])
      names(BestForModeltypes)[ncol(BestForModeltypes)] <- "Param"
    }
    utils::write.table(BestForModeltypes, file = paste0(saveWD, "/", imgprefix,
                                                        "BestForModeltypes.txt"),
                       col.names = TRUE, row.names = FALSE)
  } else {
    BestForModeltypes <- VeryBestModels_crossV
  }
  # Zip all outputs in a unique file.
  if (!isFALSE(zip.file)) {
    # Option j saves files directly without their path
    utils::zip(zip.file, files = saveWD, flags = "-urj9X", extras = "",
               zip = Sys.getenv("R_ZIPCMD", "zip"))
    # Return the file path of the zip file along with the tables
    return(list(VeryBestModels_crossV = VeryBestModels_crossV,
                BestForModeltypes = BestForModeltypes,
                zip.file = zip.file))
  } else {
    return(list(VeryBestModels_crossV = VeryBestModels_crossV,
                BestForModeltypes = BestForModeltypes))
  }
} # end of ModelOrder function
#' Characteristics of the model on which to produce outputs
#'
#' @param saveWD directory where outputs of model selection are saved, including
#' AllModels_crossV_ordered.
#' @param new.data SpatialPointsDataFrame dataset used to fit the model.
#' @param modeltype The model type as chosen among modeltypes available
#' (see \code{\link{modelselect_opt}})
#' @inheritParams ModelResults
#'
#' @return Num, modeltype, Family, formulaX
#' @importFrom splines ns
#' @importFrom mgcv gam
#'
#' @export
model_select <- function(saveWD, new.data, Num = NULL, model = NULL,
                         modeltype = NULL, powerXI = NULL)
{
  Param <- NULL
  # Read the list of models ordered by the cross-validation procedure
  AllModels_crossV_ordered <- readr::read_delim(
    file = paste0(saveWD, "/", basename(saveWD), "-AllModels_crossV_ordered.csv"),
    delim = ";")#, col_types = "ncnnncn")
  new.data.orig <- new.data
  if (is(new.data)[1] == "SpatialPointsDataFrame") {
    new.data <- tibble::as_tibble(new.data@data)
  } else {
    new.data <- tibble::as_tibble(new.data)
  }
  # If Num is not specified, choose the first best model
  if (is.null(Num)) {
    Num <- AllModels_crossV_ordered$Num[1]
  }
  if (!is.numeric(Num)) {Num <- as.numeric(as.character(Num))}
  # Verify if the model defined by the user exists in already fitted models
  if (!is.null(model)) {
    if (sum(AllModels_crossV_ordered$model == model) == 1) {
      Num <- AllModels_crossV_ordered$Num[which(
        AllModels_crossV_ordered$model == model)]
      Num <- as.numeric(as.character(Num))
      message(c("Your model corresponds to model Num = ", Num))
    } else {
      Num <- "UserModel"
    }
  }
  if (Num != "UserModel") {
    # Usual case with model already fitted in crossV
    modeltype <- as.character(AllModels_crossV_ordered[
      which(AllModels_crossV_ordered$Num == Num), "modeltype"])
    formulaX <- as.character(AllModels_crossV_ordered[
      which(AllModels_crossV_ordered$Num == Num), "model"])
    if (grepl("PA|Tweed", modeltype)) {
      # Probability thresholds saved by ModelOrder
      load(file = paste0(saveWD, "/BestTHD_crossV_unlist.RData"))
      load(file = paste0(saveWD, "/MinUn_crossV.RData"))
      load(file = paste0(saveWD, "/MaxZero_crossV.RData"))
      if (grepl("Tweed", modeltype)) {
        # THD values saved only for Tweed model, not for other Cont ones:
        # retrieve the position of Num among the Tweed models only.
        ord.Num <- order(unlist(AllModels_crossV_ordered[,"Num"]))
        Num.thd <- which(unlist(AllModels_crossV_ordered[ord.Num,][
          which(grepl("Tweed", unlist(AllModels_crossV_ordered[ord.Num,"modeltype"]))), "Num"]) == Num)
      } else {
        Num.thd <- Num
      }
      Seuil <- BestTHD_crossV_unlist[Num.thd]
      # Minimum value where we see presence
      SeuilMinUn <- mean(MinUn_crossV[Num.thd, ][
        is.finite(MinUn_crossV[Num.thd, ])], na.rm = TRUE)
      # Maximum value where we see absence
      SeuilMaxZero <- mean(MaxZero_crossV[Num.thd, ][
        is.finite(MaxZero_crossV[Num.thd, ])], na.rm = TRUE)
    }
  } else {
    # Case with user-defined model not already in crossV
    # No guaranty on the utility of this type of operation
    formulaX <- model
    if (grepl("PA|Tweed", modeltype)) {
      ## Write the function to calculate Seuil(s) by crossV if model not
      ## already estimated
      Seuil <- 0.5
      SeuilMinUn <- 0.95 # Minimum value where we see presence
      SeuilMaxZero <- 0.05 # Maximum value where we see absence
      warning(c("Presence-absence 'Seuil(s)' values have been fixed for",
                " compatibility purpose but they have not any sense for the",
                " model chosen"))
    }
  }
  # Modify formulaX for factor covariates: wrap every non-numeric covariate
  # in as.factor(as.character(...)) so the models treat it as a factor.
  formulaX.tmp <- as.character(formulaX)
  vars.formula <- all.vars(as.formula(formulaX.tmp))
  w.NotNum <- which(unlist(lapply(new.data[, vars.formula],
                                  function(x) {!is.numeric(x)})))
  if (length(w.NotNum) > 0) {
    # Fix: replacements are accumulated in formulaX (the original rebuilt
    # formulaX from the unmodified formulaX.tmp at each iteration, so with
    # several factor covariates only the last one remained wrapped) and
    # grepl() uses fixed = TRUE (the pattern contains literal parentheses
    # that would otherwise be treated as regex groups, so the "already
    # wrapped" guard could never match).
    formulaX <- formulaX.tmp
    for (var in w.NotNum) {
      wrapped <- paste0("as.factor(as.character(", vars.formula[var], "))")
      if (!grepl(wrapped, formulaX, fixed = TRUE)) {
        formulaX <- gsub(vars.formula[var], wrapped, formulaX, fixed = TRUE)
      }
    }
  }
  # Family according to the modeltype
  if (grepl("PA", modeltype)) {
    Family <- "binomial"
  } else if (grepl("Cont", modeltype)) {
    Family <- "gaussian"
  } else if (grepl("Gamma", modeltype)) {
    Family <- "Gamma(link=\"log\")"
  } else if (grepl("Count", modeltype)) {
    Family <- "poisson"
  } else if (grepl("Tweed", modeltype)) {
    if (Num != "UserModel") {
      # XI parameter saved during the cross-validation
      Param <- unlist(AllModels_crossV_ordered[
        which(AllModels_crossV_ordered$Num == Num), "Param"])
    } else {
      # Estimate the XI parameter for a user-defined model
      Param <- tweedie::tweedie.profile(as.formula(formulaX), data = new.data,
        xi.vec = seq(1.1, 1.9, 0.1), do.plot = FALSE, fit.glm = FALSE,
        do.ci = FALSE, method = "series")$xi.max[1]
    }
    Family <- statmod::tweedie(var.power = Param, link.power = 0)
  } else if (grepl("KrigeGLM", modeltype)) {
    ## To do
  }
  ## Add data + 1
  ## Add warning + output param to say it is + 1 data
  # For box-cox or Log transformation, data should be strictly positive.
  # Add +1 to data if needed; Sup is returned so callers know about it.
  Sup <- 0
  ## Add for KrigeGLM with lambda
  if (grepl("Log|Gamma", modeltype)) {
    if (min(new.data$dataY) < 0) {
      stop("For Box-Cox or Log transformation or Gamma model, data should be strictly positive")
    }
    if (min(new.data$dataY) == 0) {
      Sup <- 1
      warning("For Box-Cox or Log transformation or Gamma model, +1 was added to data")
    }
  }
  new.data$dataY <- new.data$dataY + Sup
  # For future KrigeGLM
  # if (!is.na(Y_data_sample_lcc)) {
  #   Y_data_sample_lcc$dataY <- Y_data_sample_lcc$dataY + Sup
  # }
  # Fit the chosen model on the complete dataset
  if (grepl("GLM", modeltype)) {
    modelX <- glm(as.formula(formulaX), data = new.data, family = Family,
                  control = list(epsilon = 1e-05, maxit = 1000))
  } else {
    modelX <- mgcv::gam(as.formula(formulaX), method = "REML", data = new.data,
                        family = Family)
  }
  if (grepl("Tweed", modeltype)) {
    ## Verify with TweedGAM
    # 1-proba because output p is proba of absence
    modelX$proba <- 1 - (apply(t(modelX$fitted), 2, function(i) tweedie::dtweedie(y = 0,
      xi = Param, mu = i, phi = summary(modelX)$dispersion)))
  }
  res <- list(Num = Num, modeltype = as.character(modeltype), Family = Family,
              formulaX = formulaX, modelX = modelX, Param = Param,
              new.data = new.data.orig, Sup = Sup
  )
  # Thresholds only exist for presence-absence-like models
  if (grepl("PA|Tweed", modeltype)) {
    res$Seuil <- Seuil
    res$SeuilMinUn <- SeuilMinUn
    res$SeuilMaxZero <- SeuilMaxZero
  }
  return(res)
}
#' Study the outputs and predictions of a specific model
#'
#' @param saveWD directory where outputs of the cross-validation procedure have
#' been saved or zip.file of this directory.
#' All necessary information has been saved in this directory to be
#' able to compile results. This is the only output of findBestModel in the
#' global R environment.
#' @param plot logical. Whether to produce the plot outputs or not. Although there are not
#' a lot of other types of outputs...
#' @param zip.file TRUE (default) to save all outputs in a zipfile with saveWD,
#' FALSE for no zipfile output, or path to a new zip file
#' @param Num numeric The 'Num' value of the model retained for further analysis
#' like those saved in AllModels_crossV_ordered. Not useful if model and Family
#' are set. Default to the first model in AllModels_crossV_ordered.
#' @param model character string A user-defined model specification. This requires
#' to set the Family parameter. This overrides effects of 'Num', however, if the
#' model exists with the exact same expression in the list of model fitted,
#' 'Num' will be re-attributed.
#' @param modeltype The type of the model as chosen among modeltypes available
##' (see \code{\link{modelselect_opt}})
#' @param powerXI XI parameter for a tweedie model.
#' @param Marginals Calculate full marginal prediction for each covariate
#' @param cl a cluster as made with \code{\link[snow]{makeCluster}}.
#' If cl is empty, nbclust in \code{\link{modelselect_opt}} will be used
#' to create a cluster.
#' @param n.ech.marginal 5000 by default. Number of random samples for marginals
#' estimations.
#'
#' @importFrom sp spTransform CRS
#' @import graphics
#' @importFrom grDevices dev.off png heat.colors colorRampPalette
#' @importFrom ggplot2 ggplot geom_pointrange aes facet_wrap ggsave
#' @importFrom ggplot2 labs
#' @importFrom yarrr pirateplot
#' @import stats
#'
#' @export
ModelResults <- function(saveWD, plot, Num = NULL, model = NULL,
modeltype = NULL, powerXI = NULL, Marginals = FALSE,
zip.file = TRUE, cl = NULL, n.ech.marginal = 5000)
{
# browser()
if (utils::file_test("-f", saveWD) & file.exists(saveWD) & grepl("zip", saveWD)) {
if (zip.file == TRUE) {zip.file <- saveWD}
utils::unzip(saveWD, exdir = gsub(".zip", "", saveWD))
saveWD <- gsub(".zip", "", saveWD)
} else if (utils::file_test("-d", saveWD)) {
if (zip.file == TRUE) {zip.file <- paste0(saveWD, ".zip")}
} else {
stop("saveWD is neither an existing directory nor a zip file")
}
if (is.null(cl)) {
cl_inside <- TRUE
} else {
cl_inside <- FALSE
}
if (!is.null(Num) & !is.numeric(Num)) {
Num <- as.numeric(as.character(Num))
}
imgprefix <- paste0(basename(saveWD), "-")
saveWD <- normalizePath(saveWD)
message("Image outputs will be saved in ", saveWD)
# Load information
# load(paste0(saveWD, "/Allinfo_all.RData"))
Allinfo_all <- readr::read_rds(paste0(saveWD, "/Allinfo_all.rds"))
datatype <- Allinfo_all$datatype
nbclust <- modelselect_opt$nbclust
# Load variables of model configuration that need to be equivalent as fit
opt_saved <- readr::read_rds(paste0(saveWD, "/modelselect_opt.save.rds"))
# Options ----
nbMC <- opt_saved$nbMC
seqthd <- opt_saved$seqthd
MaxDist <- opt_saved$MaxDist
Phi <- opt_saved$Phi
Model <- opt_saved$Model
Lambda <- opt_saved$Lambda
modeltypes <- opt_saved$modeltypes
lcc_proj <- opt_saved$lcc_proj
if (grepl("KrigeGLM", datatype)) {
Y_data_sample_lcc <- spTransform(Allinfo_all$data, CRS(lcc_proj))
} else {
Y_data_sample_lcc <- NA
}
if (is(Allinfo_all$data)[1] == "SpatialPointsDataFrame") {
Y_data_sample <- tibble::as_tibble(Allinfo_all$data@data)
} else {
Y_data_sample <- tibble::as_tibble(Allinfo_all$data)
}
# Precision to keep for predictions
prec.data <- prec_data(Y_data_sample$dataY) - 1
# Define the model on which to produce outputs
model_selected <- model_select(saveWD = saveWD,
new.data = Y_data_sample,
Num = Num, model = model,
modeltype = modeltype, powerXI = powerXI)
Num <- model_selected$Num
modeltype <- model_selected$modeltype
Family <- model_selected$Family
formulaX <- model_selected$formulaX
modelX <- model_selected$modelX
Param <- model_selected$Param
if (grepl("PA|Tweed", modeltype)) {
Seuil <- model_selected$Seuil
SeuilMinUn <- model_selected$SeuilMinUn
SeuilMaxZero <- model_selected$SeuilMaxZero
}
# Outputs of the best chosen model ====
# Find the prediction closest to mean average prediction ----
# This is to build some marginal distributions of covariates
# Keep only rows whose covariates are within the 10-90% quantile range,
# then rank them by distance of their fitted value to the target
# (0 for presence/absence models, mean(dataY) otherwise).
datapred.tbl <- model_selected$new.data[,all.vars(stats::as.formula(modelX))] %>%
tibble::as_tibble() %>%
dplyr::mutate_at(dplyr::vars(-dataY), .funs = function(x) {
if (is.numeric(x)) {
# Mark extreme covariate values as NA so they are filtered out below
minmax <- stats::quantile(x, probs = c(0.1, 0.9), na.rm = TRUE)
x[(x <= minmax[1] | x >= minmax[2])] <- NA
}
x
}) %>%
dplyr::mutate(pred = stats::predict(modelX),
na.row = apply(., 1, function(x) sum(is.na(x)) != 0),
mean = ifelse(grepl("PA", modeltype), 0, mean(dataY)),
pred.absmean = abs(pred - mean)) %>%
dplyr::filter(!na.row) %>%
dplyr::arrange(pred.absmean)
# First row = covariate combination giving the prediction closest to target
CovForMeanPred <- datapred.tbl[1,]
readr::write_rds(CovForMeanPred,
paste0(saveWD, "/", basename(saveWD),
"-CovForMeanPred_", model_selected$Num, ".rds"))
# Deviance explained ----
## Also add AUC and RMSE
# Build the list of nested formulas: intercept-only first, then one
# additional term of formulaX per element. When Sup != 0 the response was
# shifted, so predictions use dataY + 1.
w.dataY <- "dataY"
if (model_selected$Sup != 0) {
w.dataY <- "dataY + 1"
}
if (grepl("Log", modeltype)) {
factlist <- c(paste0("log(", w.dataY, ") ~ 1"), gsub("dataY", w.dataY,
unlist(strsplit(formulaX, "+", fixed = TRUE))))
} else {
factlist <- c(paste(w.dataY, "~ 1"), gsub("dataY", w.dataY, unlist(strsplit(
formulaX,
"+", fixed = TRUE))))
}
# One anova row per nested model (5 columns as returned by anova.glm)
aov_save <- as.data.frame(matrix(ncol = 5, nrow = length(factlist)))
rownames(aov_save) <- factlist
# For crossV indices
# load(file = paste0(saveWD, "/saveAleaM1.RData"))
saveAlea <- readr::read_rds(paste0(saveWD, "/saveAleaM1.rds"))
# Column 1: mean cross-validation index; column 2: difference with previous
RMSE_crossV_save <- matrix(0, nrow = length(factlist), ncol = 2)
# cl_inside is a free variable set earlier in the function: TRUE when the
# cluster is created here rather than passed in by the caller.
if (cl_inside) {
cl <- parallel::makePSOCKcluster(nbclust)
}
# Fit nested models term by term and cross-validate each ----
# For each step fl: fit a GLM or GAM with the first fl terms, compare it to
# the previous model with a Chisq anova, then compute the cross-validation
# index (AUC for presence/absence models, RMSE otherwise) over nbMC
# Monte-Carlo repetitions on cluster `cl`.
for (fl in seq_along(factlist)) {
if (fl == 1) {
# Null model (intercept only)
model <- factlist[1]
if (grepl("GLM", modeltype)) {
assign(paste0("modelX", fl), stats::glm(as.formula(model),
data = Y_data_sample, family = Family, control = list(epsilon = 1e-06,
maxit = 1000)))
} else {
assign(paste0("modelX", fl), mgcv::gam(as.formula(model),
method = "REML", data = Y_data_sample, family = Family))
}
} else {
# Model with the first fl terms (element 1 of factlist is "y ~ 1",
# element 2 already contains "y ~ first_term")
model <- paste(factlist[2:fl], collapse = "+")
if (grepl("GLM", modeltype)) {
assign(paste0("modelX", fl), stats::glm(as.formula(paste(factlist[2:fl],
collapse = "+")), data = Y_data_sample, family = Family,
control = list(epsilon = 1e-06, maxit = 1000)))
} else {
assign(paste0("modelX", fl), mgcv::gam(as.formula(paste(factlist[2:fl],
collapse = "+")), method = "REML", data = Y_data_sample,
family = Family))
}
# Compare anova of model fl against model fl - 1
if (fl == 2) {
aov1 <- anova(get(paste0("modelX", fl - 1)), get(paste0("modelX",
fl)), test = "Chisq")
aov_save[1:2, ] <- aov1
names(aov_save) <- names(aov1)
} else {
aov_save[fl, ] <- anova(get(paste0("modelX", fl - 1)),
get(paste0("modelX", fl)), test = "Chisq")[2,]
}
}
print(model)
# Calculate crossV indices ----
RMSE_crossV <- snow::parApply(
cl, t(1:nbMC), 2,
function(MC,
formula, modeltype, saveAlea, Y_data_sample,
seqthd, resParam_save, nb, Y_data_sample_lcc,
MaxDist, Phi, Model, lambda) {
res <- crossV_indices(
MC = MC, formulas = t(t(formula)),
modeltype = modeltype, saveAlea = saveAlea, Y_data_sample = Y_data_sample,
seqthd = seqthd,
resParam_save = resParam_save, Y_data_sample_lcc = Y_data_sample_lcc,
MaxDist = MaxDist, Phi = Phi, model = Model,
lambda = lambda)
if (grepl("PA", modeltype)) {
res <- res["ROC_crossV",]
# Test for threshold value
# res <- res[grep("DiffSelSpe", rownames(res)),]
} else {
res <- res["RMSE_crossV",]
}
# BUG FIX: the original had an unreachable `c(res)` after this return
return(unlist(res))
}, formula = gsub(" + 1", "", model, fixed = TRUE),
modeltype = modeltype,
saveAlea = saveAlea, Y_data_sample = Y_data_sample,
seqthd = seqthd, resParam_save = Param,
Y_data_sample_lcc = Y_data_sample_lcc, MaxDist = MaxDist, #nb = nb,
Phi = Phi, Model = Model[which(modeltypes == modeltype)],
lambda = Lambda[which(modeltypes == modeltype)])
# Test for threshold value
# plot(seqthd, RMSE_crossV[,1], type = "l")
# apply(RMSE_crossV, 2, function(x) lines(seqthd, x))
# lines(seqthd,
# apply(RMSE_crossV, 1, function(x) mean(x, na.rm = TRUE)),
# col = "red", lwd = 2)
# Mean index over MC repetitions, and difference with the previous model
RMSE_crossV_save[fl,1] <- mean(RMSE_crossV, na.rm = TRUE)
if (fl >= 2) {
RMSE_crossV_save[fl,2] <- RMSE_crossV_save[fl,1] - RMSE_crossV_save[fl - 1,1]
}
}
# Round values of RMSE properly
# Spread of the cross-validation differences, used to choose how many
# significant digits to keep when rounding the index values.
# BUG FIX: the original computed min(RMSE_crossV_save[fl, 2]) — a single
# cell addressed with the leftover loop index `fl` — instead of the minimum
# over the whole column of differences.
diff.rmse <- max(RMSE_crossV_save[,2]) - min(RMSE_crossV_save[,2])
# Keep 2 significant digits for AUC-like indices, 1 otherwise
if (grepl("PA", modeltype)) {
prec <- 2
} else {
prec <- 1
}
if (diff.rmse > 1) {
# precision defined by length of the number
n <- nchar(round(diff.rmse))
} else {
# precision defined by number of zero after comma
# NOTE(review): nchar() on a double relies on its default printed
# representation — fragile for values like 0.1 + 0.2; confirm intent.
n.w0 <- (nchar(diff.rmse) - 2) # nchar with zero after comma
n.wo0 <- nchar(diff.rmse * 10 ^ n.w0) # nchar without zero
n <- -n.w0 + n.wo0 # diff of nchar
}
# Round to (n - prec) orders of magnitude
RMSE_crossV_save <- round(RMSE_crossV_save / 10 ^ (n - prec)) * 10 ^ (n - prec)
# Stop the cluster if it was created inside this function
if (cl_inside) {
parallel::stopCluster(cl); gc()
}
# Recode p-values into significance classes for display
aov_save[,5] <- cut(aov_save[, 5],
breaks = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1),
labels = c("p<0.0001", "p<0.001", "p<0.01", "p<0.05", "p<0.1", "NS"))
names(aov_save)[5] <- "p"
# Append a "Residuals" row carrying the residual deviance of the last model
aov_save <- rbind(aov_save, c(rep(NA, 3),
aov_save[nrow(aov_save), 2], NA, NA))
rownames(aov_save)[nrow(aov_save)] <- "Residuals"
# Add % explained deviance and the cross-validation columns
aov_save <- cbind(aov_save, round(aov_save[, "Deviance"] /
aov_save[1, 2] * 100, digits = 2),
rbind(RMSE_crossV_save, NA))
if (grepl("PA", modeltype)) {
names(aov_save)[6:8] <- c("%Exp.Dev", "AUC", ">Diff")
} else {
names(aov_save)[6:8] <- c("%Exp.Dev", "RMSE", ">Diff")
}
# Save the anova table both as RData and csv
save(aov_save, file = paste0(saveWD, "/aov_save_", Num, ".RData"))
utils::write.csv(aov_save,
file = paste0(saveWD, "/", imgprefix, "aov_save_", Num, ".csv"))
if (plot) {
# Fig Deviance ----
# Render aov_save as a text table in a png
png(filename =
paste0(saveWD, "/", imgprefix, "Param_Exp_Deviance_", Num, ".png"),
width = 11, height = 5, units = "cm", pointsize = 8,
res = 300)
# Calculate number of character to define par(mai) on left
mai_left <- max(strwidth(paste0(rownames(aov_save), " "), cex = 0.5, units = "inches"))
par(mai = c(0, mai_left + 0.05, 0.3, 0.05), cex = 0.5)
plot(NA, xlim = c(0.75, ncol(aov_save) + 0.25), ylim = c(1, nrow(aov_save)),
yaxt = "n", xaxt = "n", xlab = "", ylab = "",
main = "Contribution of factors")
# Row labels on the left (last row = Residuals, in italics)
axis(2, at = (2:nrow(aov_save)), labels = rev(paste0(rownames(aov_save), " "))[-1],
las = 1, tick = FALSE, line = -1)
axis(2, at = 1, labels = paste0(rownames(aov_save), " ")[nrow(aov_save)],
las = 1, font = 3, tick = FALSE, line = -1)
axis(3, at = c(1:ncol(aov_save)), labels = names(aov_save), las = 1,
tick = FALSE, line = -0.5)
# Write each cell as text; columns 1-4 rounded, 5-6 as-is, 7-8 in italics
for (i in 1:ncol(aov_save)) {
for (j in c(1:nrow(aov_save))) {
font <- 1
if (j == nrow(aov_save)) {font <- 3} # Residuals
if (i %in% c(1:4)) {
text(i, nrow(aov_save) - j + 1,
labels = round(aov_save[j, i], digits = 2), font = font)
}
# if (i == 5) {
# text(i, nrow(aov_save) - j + 1,
# labels = round(aov_save[j, i], digits = 4), font = font)
# }
if (i %in% c(5,6)) {
text(i, nrow(aov_save) - j + 1,
labels = aov_save[j, i], font = font)
}
if (i %in% c(7,8)) {
text(i, nrow(aov_save) - j + 1,
labels = aov_save[j, i], font = 3)
}
}
}
abline(h = c(1.5, nrow(aov_save) - 0.5), lty = "dashed", col = "grey")
dev.off()
# Fig Residuals analysis ----
# Classic 2x2 diagnostic panel of the selected model
png(filename = paste0(saveWD, "/", imgprefix, "Residual-Analysis_", Num, ".png"),
width = 15, height = 15, units = "cm", pointsize = 8,
res = 300)
par(mfrow = c(2,2))
plot(fitted(modelX), residuals(modelX), col = "black",
xlab = "Fitted", ylab = "Resp. residuals", main = "Resp. residuals vs Fitted")
panel.smooth(fitted(modelX), residuals(modelX, type = "pearson"), col = NA)
abline(h = 0, v = 0, lty = 3, col = "gray")
qqnorm(residuals(modelX), main = "Residuals QQplot")
qqline(residuals(modelX), lty = 3, col = "gray50")
plot(fitted(modelX), residuals(modelX, type = "pearson"), col = "black",
xlab = "Fitted", ylab = "Std. residuals", main = "Std residuals vs Fitted")
panel.smooth(fitted(modelX), residuals(modelX, type = "pearson"), col = NA)
abline(h = 0, v = 0, lty = 3, col = "gray")
plot(cooks.distance(modelX), type = "h", ylab = "Cook's distance",
main = "Cook's distance")
abline(h = c(0.5, 1), lty = c(3,1) , col = "gray50")
dev.off()
}
# Covariates and Interactions used ====
# Find if estimates are for numeric, factors or interactions
# Get list of covariates in the model
# Tokenize the formula on spaces, dropping "~", "+" and the response terms
n_tmp <- unlist(strsplit(gsub(", ", ",", as.character(formulaX)), split = " "))
cov_used_A <- n_tmp[-which(n_tmp %in% c("~", "+", "dataY", "log(dataY)"))]
# For GAM terms like "s(x,k=3)", keep the part before the first comma
cov_used_B <- apply(t(cov_used_A), 2, function(x) strsplit(x, split = ",")[[1]][1])
if (length(grep("GLM", modeltype)) == 1) {
# For GLM
cov_used_B <- cov_used_A
}
# Get factors in interactions
# "te(" marks GAM tensor interactions, ":" marks GLM interactions
if (length(grep("te(", cov_used_B, fixed = TRUE)) > 0 |
length(grep(":", cov_used_B, fixed = TRUE)) > 0) {
# Main-effect covariates = everything that is not an interaction term
cov_used_C <- t(cov_used_B[-c(grep("te(", cov_used_B, fixed = TRUE),
grep(":", cov_used_B, fixed = TRUE))])
inter_used_A <- cov_used_A[c(grep("te(", cov_used_A, fixed = TRUE),
grep(":", cov_used_B, fixed = TRUE))] # interaction of continuous factors
# Two columns: the two covariates involved in each interaction
inter_used_B <- matrix(ncol = 2, nrow = length(inter_used_A))
if (length(grep("GLM", modeltype)) == 1) {
# For GLM
inter_used_B <- t(apply(t(inter_used_A), 2, function(x)
strsplit(x, split = ":")[[1]][1:2]))
# Strip "poly(" / "ns(" wrappers from interaction covariate names.
# BUG FIX: the original called unique(grep(...), grep(...)) — the second
# grep() result was silently passed as the `incomparables` argument of
# unique(), so "ns(" matches were dropped. Concatenate both before unique().
wpol <- unique(c(grep("poly(", inter_used_B, fixed = TRUE),
grep("ns(", inter_used_B, fixed = TRUE)))
if (length(wpol) > 0) {
# Keep the text before the first comma, then the text after "("
inter_used_B[wpol] <- c(apply(t(inter_used_B[wpol]), 2,
function(x) strsplit(x, split = ",", fixed = TRUE)[[1]][1]))
inter_used_B[wpol] <- apply(t(inter_used_B[wpol]), 2, function(x)
strsplit(x, split = "(", fixed = TRUE)[[1]][2])
}
inter_used <- inter_used_B
} else {
# For GAM
# Interactions with a factor are written "s(x, by=fact, ...)"
interWithfact <- grep("by=", inter_used_A, fixed = TRUE)
if (length(interWithfact) == 0) {
# All interactions are tensor products of two continuous covariates:
# take the two names before the first two commas, dropping "te("
inter_used_B <- t(apply(t(inter_used_A), 2, function(x)
strsplit(x, split = ",")[[1]][1:2]))
inter_used_B[, 1] <- apply(t(inter_used_B[, 1]), 2, function(x)
strsplit(x, split = "(", fixed = TRUE)[[1]][2])
}
# Mixed case: some interactions involve a factor, some do not
if (length(interWithfact) != 0 &
length(interWithfact) < length(inter_used_A)) {
inter_used_B[-interWithfact, ] <- t(apply(
t(inter_used_A[-interWithfact]),
2, function(x) strsplit(x, split = ",")[[1]][1:2]))
inter_used_B[-interWithfact, 1] <- apply(
t(inter_used_B[-interWithfact, 1]), 2, function(x)
strsplit(x, split = "(", fixed = TRUE)[[1]][2])
# For "by=" interactions: first element and third (the by= part)
inter_used_B[interWithfact, ] <- t(apply(
t(inter_used_A[interWithfact]), 2, function(x)
strsplit(x, split = ",")[[1]][c(1, 3)]))
# Extract the factor name between "by=" and ")"
inter_used_B[interWithfact, 2] <- apply(
t(inter_used_B[interWithfact, 2]), 2, function(x)
strsplit(strsplit(x, split = "=", fixed = TRUE)[[1]][2],
split = ")")[[1]][1])
inter_used_B[interWithfact, 1] <- apply(
t(inter_used_B[interWithfact, 1]), 2, function(x)
strsplit(x, split = "(", fixed = TRUE)[[1]][2])
}
# All interactions involve a factor
if (length(interWithfact) == length(inter_used_A)) {
inter_used_B <- t(apply(t(inter_used_A), 2, function(x)
strsplit(x, split = ",")[[1]][c(1, 3)]))
inter_used_B[, 1] <- apply(t(inter_used_B[, 1]), 2, function(x)
strsplit(x, split = "(", fixed = TRUE)[[1]][2])
inter_used_B[, 2] <- apply(t(inter_used_B[, 2]), 2, function(x)
strsplit(strsplit(x, split = "=", fixed = TRUE)[[1]][2],
split = ")")[[1]][1])
}
inter_used_C <- inter_used_B
# Drop leftover "k=..." artefacts from smoothing-dimension arguments
if (length(inter_used_C) >= 3) {
if (length(grep("k=", inter_used_C[, 1])) >= 1 &
length(grep("k=", inter_used_C[, 1])) != nrow(inter_used_C)) {
inter_used <- inter_used_C[-grep("k=", inter_used_C[, 1]), ]
}
}
if (length(grep("k=", inter_used_C[, 1])) == 0) {
inter_used <- inter_used_C
}
}
} else {
# No interactions: every parsed term is a main-effect covariate
cov_used_C <- t(cov_used_B)
}
# Strip the remaining smoother/transform wrappers from covariate names:
# "ns(x, ...)", "poly(x, ...)", "s(x" and "as.character(x)" all reduce to "x"
cov_used_D <- apply(cov_used_C, 2, function(x) {
res <- x
if (grepl("GLM", modeltype)) {
if (grepl("ns", modeltype)) {
if (length(strsplit(x, split = "ns(", fixed = TRUE)[[1]]) == 2) {
res1 <- strsplit(x, split = "ns(", fixed = TRUE)[[1]][2]
res <- strsplit(res1, split = ",", fixed = TRUE)[[1]][1]
}
} else {
if (length(strsplit(x, split = "poly(", fixed = TRUE)[[1]]) == 2) {
res1 <- strsplit(x, split = "poly(", fixed = TRUE)[[1]][2]
res <- strsplit(res1, split = ",", fixed = TRUE)[[1]][1]
}
}
} else {
if (length(strsplit(x, split = "s(", fixed = TRUE)[[1]]) == 2) {
res <- strsplit(x, split = "s(", fixed = TRUE)[[1]][2]
}
}
if (length(strsplit(x, split = "as.character(", fixed = TRUE)[[1]]) == 2) {
res1 <- strsplit(x, split = "as.character(", fixed = TRUE)[[1]][2]
res <- strsplit(res1, split = ")", fixed = TRUE)[[1]][1]
}
res
})
cov_used <- cov_used_D #[order(cov_used_D)]
# NumFact <- 0
# discretisation of numeric factors for graph representation
## Make sure the number of levels is sufficient for there to be a table to work with
# IsDataNumeric <- apply(t(cov_used), 2, function(x) is.numeric(Y_data_sample[, x]))
# TRUE for covariates stored as numeric columns in the data
IsDataNumeric <- (Y_data_sample %>%
dplyr::select(dplyr::one_of(cov_used)) %>%
sapply(., class) == "numeric")
save(cov_used, file = paste0(saveWD, "/cov_used_", Num, ".RData"))
save(IsDataNumeric, file = paste0(saveWD, "/IsDataNumeric_", Num, ".RData"))
# Sanity check: covariates whose name contains "factor" should be exactly
# the non-numeric ones; warn and force them to non-numeric otherwise.
# BUG FIX: the original called warnings() — which *retrieves* past warnings
# — instead of warning(), so no warning was ever raised. A separating space
# is also added between the two concatenated message parts.
if (sum(grepl("factor", cov_used) == !IsDataNumeric) !=
length(IsDataNumeric)) {
warning("Verify if all factors have been accounted ",
"as factors in the model selection procedure !")
IsDataNumeric[grepl("factor", cov_used)] <- FALSE
}
# Number of marginal-effect figures: one per covariate plus one per
# interaction (when interactions were found above)
n.MargFig <- length(cov_used)
if (exists("inter_used")) {n.MargFig <- length(cov_used) + length(inter_used)}
save(n.MargFig, file = paste0(saveWD, "/n.MargFig_", Num, ".RData"))
# Marginal predictions for covariates ====
# Marginals is a free variable set earlier in the function (flag)
if (Marginals) {
# Number of discretisation levels for numeric covariates; reduced when
# there are many covariates to keep the prediction grid tractable
nb_lev <- 15
if (sum(IsDataNumeric) >= 2 & length(IsDataNumeric) >= 6) {
nb_lev <- 10
}
# For each covariate: a regular sequence (numeric) or its unique levels
AllFact <- lapply(cov_used, function(f) { # f <- cov_used[4]
# y <- Y_data_sample[, which(names(Y_data_sample) == f)]
y <- Y_data_sample %>% dplyr::select(dplyr::matches(f))
if (IsDataNumeric[cov_used == f]) {
seq(min(y), max(y), (max(y) - min(y))/nb_lev)
} else {
unique(unlist(y))
}
})
names(AllFact) <- cov_used
# Create the fractionnar plan for predictions
# -- Remove as planor has been removed from CRAN --
# frac.key <- planor::planor.designkey(
# factors = names(AllFact),
# nlevels = lengths(AllFact),
# model = as.formula(paste0("~(", paste(names(AllFact), collapse = "+"), ")^2")),
# base = as.formula(paste0("~(", paste(names(AllFact), collapse = "+"), ")"))
# )
#
# frac.plan <- planor::planor.design(frac.key)
#
# GridFact <- as.data.frame(apply(t(1:length(AllFact)), 2, function(x) {
# AllFact[[x]][frac.plan@design[,x]]
# }))
#
# Random sampling replaces the fractional design; n.ech.marginal is a
# free variable defined earlier in the function
GridFact <- purrr::map_df(AllFact, ~sample(.x, n.ech.marginal, replace = TRUE))
# Restore column classes when there is at least one factor covariate
if (sum(IsDataNumeric) != length(!IsDataNumeric)) {
GridFact <- GridFact %>%
dplyr::mutate_if(.predicate = !IsDataNumeric, .funs = function(x) as.factor(as.character(x))) %>%
dplyr::mutate_if(.predicate = IsDataNumeric, .funs = function(x) as.numeric(as.character(x)))
}
names(GridFact) <- names(AllFact)
# head(GridFact)
# rm(frac.key, frac.plan); gc()
# Calculate predictions while accounting for uncertainty of prediction
# Each branch produces PredGridFact_boot2: 3 columns (median, 5%, 95%)
# back-transformed to the response scale according to the model family.
if (grepl("PA", modeltype)) {
# Binomial: normal quantiles on the link scale, then inverse logit
PredGridFact <- cbind(GridFact, predict(modelX, newdata = GridFact,
type = "link", se.fit = TRUE))
names(PredGridFact) <- c(names(GridFact), "pred", "se")
PredGridFact_boot2 <- t(apply(PredGridFact[, c("pred", "se")], 1,
function(x) {
1/(1 + exp(-qnorm(c(0.5, 0.05, 0.95), mean = x[1], sd = x[2])))
}))
} else if (grepl("Cont", modeltype) & !grepl("Log", modeltype)) {
# Gaussian: normal quantiles directly on the response scale
PredGridFact <- cbind(GridFact, predict(modelX, newdata = GridFact,
type = "response", se.fit = TRUE))
names(PredGridFact) <- c(names(GridFact), "pred", "se")
PredGridFact_boot2 <- t(apply(PredGridFact[, c("pred", "se")], 1,
function(x) {
qnorm(c(0.5, 0.05, 0.95), mean = x[1], sd = x[2])
}))
} else if (grepl("Gamma", modeltype)) {
PredGridFact <- cbind(GridFact, predict(modelX, newdata = GridFact,
type = "link", se.fit = TRUE))
names(PredGridFact) <- c(names(GridFact), "pred", "se")
# Gamma distribution
# beta = scale = (se*se)/pred
# alpha= shape = 1/(CV*CV)
# NOTE(review): quantiles are taken as exp of normal quantiles on the
# link scale, not from a Gamma distribution — confirm intent.
PredGridFact_boot2 <- t(apply(PredGridFact[, c("pred", "se")], 1,
function(x) {
exp(qnorm(c(0.5, 0.05, 0.95), mean = x[1], sd = x[2]))
}))
} else if (grepl("Log", modeltype)) {
PredGridFact <- cbind(GridFact, predict(modelX, newdata = GridFact,
type = "response", se.fit = TRUE))
names(PredGridFact) <- c(names(GridFact), "pred", "se")
# Log-normal distribution with Laurent transformation
PredGridFact_boot2 <- t(apply(PredGridFact[, c("pred", "se")], 1,
function(x) {
exp(qnorm(c(0.5, 0.05, 0.95), mean = x[1],
sd = x[2]) + 0.5 * var(modelX$residuals))
}))
} else if (grepl("Count", modeltype)) {
# Count models: same log-scale back-transformation as Log-normal
PredGridFact <- cbind(GridFact,
predict(modelX, newdata = GridFact, type = "link", se.fit = TRUE))
names(PredGridFact) <- c(names(GridFact), "pred", "se")
PredGridFact_boot2 <- t(apply(PredGridFact[, c("pred", "se")], 1,
function(x) {
exp(qnorm(c(0.5, 0.05, 0.95), mean = x[1],
sd = x[2]) + 0.5 * var(modelX$residuals))
}))
} else if (grepl("Tweed", modeltype)) {
# Tweedie: quantiles come from qtweedie, which is slow, so fitted
# values are rounded and deduplicated before computing quantiles
PredGridFact <- cbind(GridFact,
predict(modelX, newdata = GridFact, type = "response", se.fit = TRUE))
maxY <- max(Y_data_sample$dataY) * 1.1
# Do not recompute for every fitted value
# Round to 10^-2 of the data precision.
PredGridFact$fit2 <- round(PredGridFact$fit * 10^-prec.data) * 10^prec.data
PredGridFact$fit2[which(PredGridFact$fit2 == min(PredGridFact$fit2, na.rm = TRUE))] <- min(PredGridFact$fit, na.rm = TRUE)
fit.unique <- data.frame(unique(PredGridFact$fit2))
names(fit.unique) <- "fit2"
if (cl_inside) {
cl <- parallel::makePSOCKcluster(nbclust)
}
# Parallel quantile computation over the unique rounded fitted values
fit.unique2 <- t(snow::parApply(
cl, t(fit.unique), 2,
function(x, powerXI, modelX, maxY) {
if (x > maxY) {
## Calculation is very long for too high values
a <- rep(maxY, 3)
} else {
a <- rep(NA, 3)
if (x == 0) {
a <- rep(0, 3)
} else {
# try() keeps NA quantiles when qtweedie fails to converge
try(a <- tweedie::qtweedie(p = c(0.5, 0.05, 0.95), xi = powerXI,
mu = x, phi = summary(modelX)$dispersion))
}
}
a
}, powerXI = Param, modelX = modelX, maxY = maxY))
if (cl_inside) {
parallel::stopCluster(cl)
}
# Map quantiles back to the full grid via the rounded fitted values
PredGridFact_boot2 <- dplyr::left_join(PredGridFact, cbind(fit.unique, fit.unique2)) %>%
dplyr::select(dplyr::one_of(c("1","2","3")))
}
# Add +1 when Sup = 1
if (model_selected$Sup != 0) {
PredGridFact_boot2 <- PredGridFact_boot2 - model_selected$Sup
}
# Common y-axis limits for all marginal plots
if (grepl("PA", modeltype)) {
ymax <- 1
ymin <- 0
} else {
ymax <- quantile(PredGridFact_boot2[,1], probs = 0.975, na.rm = TRUE)
ymin <- quantile(PredGridFact_boot2[,1], probs = 0.025, na.rm = TRUE)
}
MaxMed <- ymax
# Plot Marginals ----
if (plot) {
# One figure for each covariate or interaction
# n.fig <- 0
if (cl_inside) {
cl <- parallel::makePSOCKcluster(nbclust)
}
if (!exists("Seuil")) {Seuil <- NA}
# One png per covariate, produced in parallel on the cluster
tmp <- snow::parLapply(
cl, 1:length(cov_used),
function(i,
cov_used, saveWD, imgprefix, Num,
PredGridFact, PredGridFact_boot2,
modeltype, Y_data_sample, IsDataNumeric,
ymin, ymax, Seuil)
{
f <- cov_used[i]
# grid = covariate values; med/min/max = median and 5/95% quantiles
grid.pred.2 <- data.frame(PredGridFact[,f], PredGridFact_boot2)
names(grid.pred.2) <- c("grid", "med", "min", "max")
if (!IsDataNumeric[cov_used == f]) {
grid.pred.2[,1] <- as.numeric(grid.pred.2[,1])
}
# n.fig <- n.fig + 1
png(filename =
paste0(saveWD, "/", imgprefix, "Marginal_Effects_", Num, "_", i, ".png"),
# width = fig.dim$ncol * 5, height = fig.dim$nrow * 5,
width = 8, height = 8,
units = "cm", pointsize = 8, res = 300)
# x11()
# par(mfrow = c(fig.dim$nrow, fig.dim$ncol), mar = c(3.5, 3, 0.5, 0.5))
par(mar = c(3.25, 3, 0.5, 0.5))
# Figures of marginals of covariates
# for (f in cov_used) { # f <- cov_used[1]
# wdth = gap between two successive grid values, used for box widths
wdth <- sort(unique(grid.pred.2[,1]))[2] - sort(unique(grid.pred.2[,1]))[1]
if (IsDataNumeric[cov_used == f]) {
plot(NA, xlim = c(min(grid.pred.2[,1]) - 0.5 * wdth,
max(grid.pred.2[,1]) + 0.5 * wdth),
ylim = c(ymin, ymax), #main = f, cex.main = 1,
ylab = NA, xlab = NA, xaxs = "i")
} else {
plot(NA,
xlim = c(min(grid.pred.2[,1] - 0.5 * wdth),
max(grid.pred.2[,1]) + 0.5 * wdth),
ylim = c(ymin, ymax), #main = f, cex.main = 1,
xaxt = "n", ylab = NA, xlab = NA, xaxs = "i")
axis(1, at = 1:length(unique(PredGridFact[,f])))
}
title(line = 2, xlab = f, ylab = "Y", cex.lab = 1.2)
# tmp <- lapply(1:length(unique(PredGridFact[,f])), function(nb.lev)
# .marge_cov(nb.lev, grid = PredGridFact[,f], grid.pred = PredGridFact_boot2))
# Overlay the median (grey), 5% (green) and 95% (red) distributions
for (z in 1:3) {
p <- c("med", "min", "max")[z]
col <- c("grey30", "forestgreen", "orangered")[z]
m.wdth <- c(0.49, 0.3, 0.2)[z]
pt.o <- c(0.005, 0, 0)[z]
for (val in sort(unique(grid.pred.2$grid))) {
# val <- sort(unique(grid.pred.2$grid))[6]
pred <- as.data.frame(grid.pred.2[which(grid.pred.2$grid == val), p])
if (sum(!duplicated(pred)) == 1) {
# Degenerate distribution: a boxplot collapses to a line
boxplot(
pred, at = val,
boxwex = 2 * wdth * m.wdth, border = col,
add = TRUE)
} else {
yarrr::pirateplot(formula = as.formula(paste(p, "~ grid")),
data = as.data.frame(
grid.pred.2[which(grid.pred.2$grid == val), ]),
# avg.line.fun = median, inf = "iqr",
avg.line.fun = mean,
point.o = pt.o,# 0.05 * m.wdth,
point.cex = 0.5,
bean.b.o = 1, bean.lwd = 0.8,
avg.line.lwd = 0.8,
inf.f.o = 0.35, inf.b.o = 0.35,
pal = col,
# bw = "bcv",
jitter.val = 0.1 * wdth * m.wdth,
at = val,
width.min = wdth * m.wdth * 0.99,
width.max = wdth * m.wdth,
xaxt = "n", yaxt = "n",
add = TRUE)
}
}
}
# rm(tmp)
# Presence threshold line for presence/absence models
if (modeltype %in% c("PA", "PAGLM")) {
abline(h = Seuil, lwd = 2, lty = "dashed")
}
# rug(Y_data_sample[,which(names(Y_data_sample) == f)], ticksize = 0.02)
rug(unlist(dplyr::select(Y_data_sample, dplyr::matches(f))), ticksize = 0.02)
dev.off()
},
cov_used = cov_used, saveWD = saveWD, imgprefix = imgprefix, Num = Num,
PredGridFact = PredGridFact, PredGridFact_boot2 = PredGridFact_boot2,
modeltype = modeltype, Y_data_sample = Y_data_sample,
IsDataNumeric = IsDataNumeric, ymin = ymin, ymax = ymax, Seuil = Seuil
) # End of parApply on cov_used
# Figures of marginals of interactions
if (exists("inter_used")) {
if (!is.matrix(inter_used)) {inter_used <- matrix(inter_used, ncol = 2)}
# if (length(inter_used) >= 3) {
# One image per interaction: median prediction as a heat map over the
# two covariates, with data rugs and points overlaid
tmp <- snow::parLapply(cl, (length(cov_used) + 1):n.MargFig,
function(i,
inter_used, cov_used, saveWD, imgprefix, Num,
PredGridFact, PredGridFact_boot2,
modeltype, Y_data_sample, IsDataNumeric,
ymin, ymax, MaxMed)
{
#for (inter in 1:nrow(inter_used)) {
# Figure index i continues after the covariate figures
inter <- i - length(cov_used)
# n.fig <- n.fig + 1
png(filename =
paste0(saveWD, "/", imgprefix, "Marginal_Effects_", Num, "_", i, ".png"),
# width = fig.dim$ncol * 5, height = fig.dim$nrow * 5,
width = 5, height = 5,
units = "cm", pointsize = 8, res = 300)
# x11()
# par(mfrow = c(fig.dim$nrow, fig.dim$ncol), mar = c(3.5, 3, 0.5, 0.5))
par(mar = c(3.5, 3, 0.5, 0.5))
# Median predicted value per cell of the two-covariate grid
MEAN <- tapply(PredGridFact_boot2[, 1],
list(PredGridFact[,
which(names(PredGridFact) == inter_used[inter, 1])],
PredGridFact[,
which(names(PredGridFact) == inter_used[inter, 2])]),
FUN = median)
# Axis coordinates: numeric values, or factor level ranks
if (is(PredGridFact[,
which(names(PredGridFact) == inter_used[inter, 2])])[1]
== "numeric") {
x1 <- as.numeric(as.character(colnames(MEAN)))
} else {
x1 <- as.numeric(as.factor(colnames(MEAN)))
}
if (is(PredGridFact[,
which(names(PredGridFact) == inter_used[inter, 1])])[1]
== "numeric") {
y1 <- as.numeric(as.character(rownames(MEAN)))
} else {
y1 <- as.numeric(as.factor(rownames(MEAN)))
}
image(x1, y1, t(MEAN), col = rev(heat.colors(255)),
breaks = c(seq(ymin, ymax, (ymax - ymin)/254),
MaxMed), xlab = NA, ylab = NA)
title(line = 2, xlab = inter_used[inter,2],
ylab = inter_used[inter,1])
# rug(Y_data_sample[,
# which(names(Y_data_sample) == inter_used[inter, 2])])
#
rug(unlist(dplyr::select(Y_data_sample, dplyr::matches(inter_used[inter, 2]))),
ticksize = 0.02)
rug(unlist(dplyr::select(Y_data_sample, dplyr::matches(inter_used[inter, 1]))),
ticksize = 0.02, side = 2)
# rug(Y_data_sample[,
# which(names(Y_data_sample) == inter_used[inter, 1])], side = 2)
points(
unlist(dplyr::select(Y_data_sample, dplyr::matches(inter_used[inter, 2]))),
unlist(dplyr::select(Y_data_sample, dplyr::matches(inter_used[inter, 1]))),
# Y_data_sample[, which(names(Y_data_sample) == inter_used[inter,2])],
# Y_data_sample[, which(names(Y_data_sample) == inter_used[inter,1])],
pch = 20, cex = 0.1)
# NOTE(review): Seuil is read inside this worker but exported below
# under the name Seuil while the parameter list declares MaxMed last;
# Seuil arrives via the named argument — confirm it is in scope.
if (modeltype %in% c("PA", "PAGLM")) {
contour(x1, y1, t(MEAN), levels = Seuil, lwd = 2,
add = TRUE)
}
dev.off()
},# End of inter_used
cov_used = cov_used, saveWD = saveWD, imgprefix = imgprefix, Num = Num,
PredGridFact = PredGridFact, PredGridFact_boot2 = PredGridFact_boot2,
modeltype = modeltype, Y_data_sample = Y_data_sample,
IsDataNumeric = IsDataNumeric, ymin = ymin, ymax = ymax, Seuil = Seuil,
MaxMed = MaxMed
) # end of parLapply
} # End of if(exists("inter_used"))
if (cl_inside) {
parallel::stopCluster(cl); gc()
}
} # end of plot
}
# Simple Marginals close to mean or 0 (PA) ----
# For each covariate, vary it over its observed range while all other
# covariates are held at the CovForMeanPred values found above.
AllFact.simple <- lapply(cov_used, function(covar) { # covar <- cov_used[2]
# y <- Y_data_sample[, which(names(Y_data_sample) == covar)]
y <- Y_data_sample %>% dplyr::select(covar)
if (IsDataNumeric[cov_used == covar]) {
# seq(min(y), max(y), (max(y) - min(y))/)
# 5% quantile steps over the observed covariate distribution
cov.val <- unique(quantile(unlist(y), probs = seq(0, 1, 0.05)))
res <- data.frame(
covar = cov.val,
CovForMeanPred[,cov_used[-which(cov_used == covar)]],
cov = covar,
cov.val = as.character(cov.val),
type = "numeric")
names(res)[1] <- covar
} else {
# Factor covariate: one row per observed level
res <- data.frame(
unique(unlist(y)),
CovForMeanPred[,cov_used[-which(cov_used == covar)]],
cov = covar,
# cov.val = as.numeric(as.factor(as.character(unique(unlist(y))))))
cov.val = unique(unlist(y)),
type = "character"
)
names(res)[1] <- covar
}
res %>%
dplyr::mutate_if(is.factor, as.character)
})
# Stack per-covariate grids and attach model predictions with their s.e.
AllFact.simple.tbl <- tibble::as_tibble(
do.call(dplyr::bind_rows, AllFact.simple)) %>%
dplyr::bind_cols(data.frame(predict(modelX, newdata = .,
type = "response", se.fit = TRUE))) %>%
dplyr::mutate(cov2 = factor(cov, levels = cov_used))
if (plot) {
# One ggplot panel per covariate, assembled in a grid with cowplot.
# NOTE(review): ggplot/geom_pointrange/aes/ggsave are used un-namespaced
# — presumably ggplot2 is imported by the package; confirm.
figdim <- Fig_split(length(cov_used))
plot_list <- purrr::map(levels(AllFact.simple.tbl$cov2),
~{
# browser()
AllFact.simple.tbl %>%
dplyr::filter(cov2 == .x) %>%
dplyr::mutate(cov.val = ifelse(type == "numeric",
as.numeric(as.character(cov.val)),
cov.val)) %>%
ggplot() +
geom_pointrange(aes(x = cov.val, y = fit,
ymin = fit-se.fit,
ymax = fit+se.fit)) +
facet_wrap(~cov2, scales = "free_x") +
labs(x = NULL)
}
)
g <- cowplot::plot_grid(plotlist = plot_list, ncol = figdim$ncol)
# g <- ggplot(AllFact.simple.tbl) +
# geom_pointrange(aes(x = cov.val, y = fit,
# ymin = fit-se.fit,
# ymax = fit+se.fit)) +
# facet_wrap(~cov2, scales = "free_x",
# nrow = figdim$nrow,
# ncol = figdim$ncol)
rm(AllFact.simple)
ggsave(plot = g,
filename = paste0(
saveWD, "/", basename(saveWD),
"-CovForMeanPred_Marginals_", model_selected$Num, ".png"),
width = 16, height = 16*(figdim$nrow / figdim$ncol), units = "cm",
dpi = 300)
# Save the underlying table next to the figure
readr::write_rds(AllFact.simple.tbl,
paste0(
saveWD, "/", basename(saveWD),
"-CovForMeanPred_Marginals_", model_selected$Num, ".rds"))
}
# Comparison of predictions against observations ====
if (!grepl("PA", modeltype) & plot) {
# Back-transform fitted values to the observation scale for Log models
if (grepl("Log", model_selected$modeltype)) {
fits <- exp(modelX$fitted + 0.5 * var(residuals(modelX)))
} else {
fits <- modelX$fitted
}
png(filename =
paste0(saveWD, "/", imgprefix, "Obs-Pred_", Num, ".png"),
width = 12, height = 6,
units = "cm", pointsize = 8, res = 300)
par(mai = c(0.35, 0.35, 0.1, 0.1), mfrow = c(1,2), cex.axis = 0.8)
# Left panel: predictions against observations with a 1:1 line
smoothScatter(Y_data_sample$dataY, fits, nbin = 256,
xlab = "", ylab = "", col = "grey")
mtext(side = 1, line = 1.75, "Observations")
mtext(side = 2, line = 1.75, "Predictions in obs. scale")
abline(b = 1, a = 0, lty = "dashed", col = "grey")
# Right panel: residual difference against observations
smoothScatter(Y_data_sample$dataY, Y_data_sample$dataY - fits, nbin = 256,
colramp = colorRampPalette(c("white", "red")),
xlab = "", ylab = "", col = "grey")
mtext(side = 1, line = 1.75, "Observations")
mtext(side = 2, line = 1.75, "Obs-Pred Difference")
abline(h = 0, lty = "dashed", col = "grey")
dev.off()
}
# Thresholds figure for presence/absence and Tweedie models ====
# Pirate plot of predicted probabilities split by observed presence/absence,
# annotated with the three thresholds (Seuil, SeuilMinUn, SeuilMaxZero).
if (grepl("PA|Tweed", modeltype) & plot)
{
png(filename =
paste0(saveWD, "/", imgprefix, "Thresholds_for_prediction_", Num, ".png"),
width = 9, height = 8,
units = "cm", pointsize = 7, res = 300)
par(mai = c(0.3, 0.5, 0.3, 0.9), yaxs = "i")
if (grepl("Tweed", modeltype)) {
# Tweedie: presence = positive observation; probability from the model
dataBin <- unlist(1 * (Y_data_sample$dataY > 0))
dataFit <- modelX$proba
} else {
dataBin <- unlist(Y_data_sample$dataY)
dataFit <- modelX$fitted
}
# CONSISTENCY FIX: namespaced as yarrr::pirateplot, matching the other
# call in this file; the bare call fails when yarrr is not attached.
yarrr::pirateplot(formula = dataFit ~ dataBin,
data = Y_data_sample,
avg.line.fun = mean,
point.o = .05, point.cex = 0.5,
bean.b.o = 1, bean.lwd = 1,
avg.line.lwd = 0.8,
inf.f.o = 0.35, inf.b.o = 0.35,
pal = "black",
bw = "bcv",
xlab = "",
ylab = "Model prediction (probability of presence)",
main = "Comparison of observation and prediction",
ylim = c(0, 1)
)
# Threshold lines and right-margin annotations
abline(h = c(SeuilMinUn, SeuilMaxZero), col = c("red", "olivedrab3"),
lty = "dashed")
segments(1.5, SeuilMinUn, 3, SeuilMinUn, col = "red", lwd = 2)
segments(1.5, SeuilMaxZero, 0, SeuilMaxZero, col = "olivedrab3",
lwd = 2)
mtext(side = 1, line = 1.5, text = "Field observation")
axis(4, at = SeuilMinUn, labels = round(SeuilMinUn, digits = 2),
col.axis = "red", las = 1)
axis(4, at = SeuilMaxZero, labels = round(SeuilMaxZero, digits = 2),
col.axis = "olivedrab3", las = 1)
axis(4, at = Seuil, labels = round(Seuil, digits = 2),
col.axis = "forestgreen", las = 1)
abline(h = Seuil, col = "forestgreen", lwd = 2.5)
# axis(4,at=Seuil,labels=round(Seuil,digits=2),col.axis='forestgreen',las=1)
text(1.5, Seuil, "Best THD", pos = 3, col = "forestgreen", font = 2)
# segments(2.9,SeuilMaxZero,3,SeuilMaxZero,col='olivedrab3',lty='dashed',xpd=TRUE)
segments(2.6, 0, 3, 0, lty = "dashed", xpd = TRUE)
segments(2.6, 1, 3, 1, lty = "dashed", xpd = TRUE)
arrows(3, 0, 3, SeuilMaxZero, xpd = TRUE, code = 3, length = 0.05)
text(3.1, 0, "95% of observation of absence", xpd = TRUE, srt = 270, adj = 1)
arrows(3, 1, 3, SeuilMaxZero, xpd = TRUE, code = 3, length = 0.05,
col = "olivedrab3")
text(3.1, 1, "Sure presence", xpd = TRUE, srt = 270, col = "olivedrab3",
adj = 0)
segments(3.2, SeuilMinUn, 3.3, SeuilMinUn, col = "red", lty = "dashed",
xpd = TRUE)
segments(3.2, 0, 3.3, 0, lty = "dashed", xpd = TRUE)
segments(3.2, 1, 3.3, 1, lty = "dashed", xpd = TRUE)
arrows(3.3, 0, 3.3, SeuilMinUn, xpd = TRUE, code = 3, length = 0.05,
col = "red")
text(3.4, 0, "Sure absence", xpd = TRUE, srt = 270, col = "red",
adj = 1)
arrows(3.3, 1, 3.3, SeuilMinUn, xpd = TRUE, code = 3, length = 0.05)
text(3.4, 1, "95% of observation of presence", xpd = TRUE, srt = 270,
adj = 0)
box()
text(0.65, SeuilMaxZero, "MaxZero", pos = 3, col = "olivedrab3", xpd = TRUE)
text(2.35, SeuilMinUn, "MinUn", pos = 1, col = "red", xpd = TRUE)
dev.off()
}
# Zip all outputs in a unique file.
# zip.file may be TRUE-turned-path (see head), a user path, or FALSE
if (zip.file != FALSE) {
# Options j allows to save directly files without path
utils::zip(zip.file, files = saveWD, flags = "-urj9X", extras = "",
zip = Sys.getenv("R_ZIPCMD", "zip"))
# Return the file path of the zip file
return(zip.file)
} else {
# Return the folder path where all results have been saved
return(saveWD)
}
} # end of ModelResults
# NOTE(review): the two lines below were scraped rdrr.io page-footer
# boilerplate ("Add the following code to your website…"), not R code;
# commented out so the file parses.