#' Cross-validation procedure
#' @description
#' `r badge('stable')`
#'
#' Cross-validation for estimation of AMMI models
#'
#' THe original dataset is split into two datasets: training set and validation
#' set. The 'training' set has all combinations (genotype x environment) with
#' N-1 replications. The 'validation' set has the remaining replication. The
#' splitting of the dataset into modeling and validation sets depends on the
#' design informed. For Completely Randomized Block Design (default), and
#' alpha-lattice design (declaring `block` arguments), complete replicates
#' are selected within environments. The remained replicate serves as validation
#' data. If `design = 'RCD'` is informed, completely randomly samples are
#' made for each genotype-by-environment combination (Olivoto et al. 2019). The
#' estimated values considering `naxis`-Interaction Principal Component
#' Axis are compared with the 'validation' data. The Root Mean Square Prediction
#' Difference (RMSPD) is computed. At the end of boots, a list is returned.
#'
#' **IMPORTANT:** If the data set is unbalanced (i.e., any genotype missing
#' in any environment) the function will return an error. An error is also
#' observed if any combination of genotype-environment has a different number of
#' replications than observed in the trial.
#'
#' @param .data The dataset containing the columns related to Environments,
#' Genotypes, replication/block and response variable(s).
#' @param env The name of the column that contains the levels of the
#' environments.
#' @param gen The name of the column that contains the levels of the genotypes.
#' @param rep The name of the column that contains the levels of the
#' replications/blocks. **AT LEAST THREE REPLICATES ARE REQUIRED TO
#' PERFORM THE CROSS-VALIDATION**.
#' @param resp The response variable.
#' @param block Defaults to `NULL`. In this case, a randomized complete
#' block design is considered. If block is informed, then a resolvable
#' alpha-lattice design (Patterson and Williams, 1976) is employed.
#' **All effects, except the error, are assumed to be fixed.**
#' @param nboot The number of resamples to be used in the cross-validation.
#' Defaults to 200.
#' @param naxis The number of axis to be considered for estimation of GE
#' effects.
#' @param design The experimental design. Defaults to `RCBD` (Randomized
#' complete Block Design). For Completely Randomized Designs inform
#' `design = 'CRD'`.
#' @param verbose A logical argument to define if a progress bar is shown.
#' Default is `TRUE`.
#' @references Olivoto, T., A.D.C. L{\'{u}}cio, J.A.G. da silva, V.S. Marchioro,
#' V.Q. de Souza, and E. Jost. 2019. Mean performance and stability in
#' multi-environment trials I: Combining features of AMMI and BLUP techniques.
#' Agron. J. 111:2949-2960.
#' \doi{10.2134/agronj2019.03.0220}
#'
#' @references Patterson, H.D., and E.R. Williams. 1976. A new class of
#' resolvable incomplete block designs. Biometrika 63:83-92.
#'
#' @return An object of class `cv_ammi` with the following items: *
#' **RMSPD**: A vector with nboot-estimates of the Root Mean Squared
#' Prediction Difference between predicted and validating data.
#'
#' * **RMSPDmean**: The mean of RMSPDmean estimates.
#'
#' * **Estimated**: A data frame that contain the values (predicted,
#' observed, validation) of the last loop.
#'
#' * **Modeling**: The dataset used as modeling data in the last loop
#'
#' * **Testing**: The dataset used as testing data in the last loop.
#' @md
#' @author Tiago Olivoto \email{tiagoolivoto@@gmail.com}
#' @seealso [cv_ammif()], [cv_blup()]
#' @export
#' @examples
#'
#' \donttest{
#' library(metan)
#' model <- cv_ammi(data_ge,
#' env = ENV,
#' gen = GEN,
#' rep = REP,
#' resp = GY,
#' nboot = 5,
#' naxis = 2)
#' }
#'
#'
cv_ammi <- function(.data, env, gen, rep, resp, block = NULL, naxis = 2, nboot = 200, design = "RCBD", verbose = TRUE) {
if(missing(block)){
if (!design %in% c("RCBD", "CRD")) {
stop("Incorrect experimental design informed! Plesease inform RCBD for randomized complete block or CRD for completely randomized design.")
}
data <- .data %>%
dplyr::select(ENV = {{env}},
GEN = {{gen}},
REP = {{rep}},
Y = {{resp}})%>%
mutate(across(1:3, as.factor))
RMSPDres <- data.frame(RMSPD = matrix(0, nboot, 1))
data <- add_row_id(data)
Nenv <- length(unique(data$ENV))
Ngen <- length(unique(data$GEN))
Nbloc <- length(unique(data$REP))
if (Nbloc <= 2) {
stop("At least three replicates are required to perform the cross-validation.")
}
nrepval <- Nbloc - 1
minimo <- min(Nenv, Ngen) - 1
if (naxis > minimo) {
stop("The number of axis to be used must be lesser than or equal to ",
minimo, " [min(GEN-1;ENV-1)]")
}
if (verbose == TRUE) {
pb <- progress(max = nboot, style = 4)
}
test <-
data %>%
n_by(ENV, GEN) %>%
mutate(test =
case_when(Y > 0 & Y != Nbloc ~ TRUE,
TRUE ~ FALSE)
)
if(any(test$test == TRUE)){
df_test <-
data.frame(
test[which(test$test == TRUE),] %>%
remove_cols(row_id,REP, test) %>%
rename(n = Y)
)
message(paste0(capture.output(df_test), collapse = "\n"))
stop("Combinations of genotype and environment with different number of replication than observed in the trial (", Nbloc, ")", call. = FALSE)
}
if(!is_balanced_trial(data, ENV, GEN, Y)){
stop("AMMI analysis cannot be computed with unbalanced data.", call. = FALSE)
}
for (b in 1:nboot) {
if (design == "CRD") {
X <- sample(1:10000, 1)
set.seed(X)
modeling <- data %>%
dplyr::group_by(ENV, GEN) %>%
dplyr::sample_n(nrepval, replace = FALSE) %>%
arrange(row_id) %>%
as.data.frame()
rownames(modeling) <- modeling$row_id
}
if (design == "RCBD") {
tmp <- split_factors(data, ENV, keep_factors = TRUE, verbose = FALSE)
modeling <- do.call(rbind, lapply(tmp, function(x) {
X2 <- sample(unique(data$REP), nrepval, replace = FALSE)
x %>% dplyr::group_by(GEN) %>% dplyr::filter(REP %in% X2)
})) %>% as.data.frame()
rownames(modeling) <- modeling$row_id
}
testing <- anti_join(data, modeling, by = c("ENV", "GEN", "REP", "Y", "row_id")) %>%
arrange(ENV, GEN) %>%
as.data.frame()
MEDIAS <-
modeling %>%
mean_by(ENV, GEN) %>%
as.data.frame()
residual <-
modeling %>%
mean_by(ENV, GEN) %>%
ungroup() %>%
mutate(residuals = residuals(lm(Y ~ ENV + GEN, data = .))) %>%
pull(residuals)
s <- svd(t(matrix(residual, Nenv, byrow = T)))
MGEN <- model.matrix(~factor(testing$GEN) - 1)
MENV <- model.matrix(~factor(testing$ENV) - 1)
MEDIAS %<>% mutate(Ypred = Y - residual,
ResAMMI = ((MGEN %*% s$u[, 1:naxis]) * (MENV %*% s$v[, 1:naxis])) %*% s$d[1:naxis],
YpredAMMI = Ypred + ResAMMI,
testing = testing$Y,
error = YpredAMMI - testing,
errrorAMMI0 = Ypred - testing)
RMSPDres[, 1][b] <- ifelse(naxis == 0, sqrt(sum(MEDIAS$errrorAMMI0^2)/length(MEDIAS$errrorAMMI0)),
sqrt(sum(MEDIAS$error^2)/length(MEDIAS$error)))
if (verbose == TRUE) {
run_progress(pb,
text = paste("Validating", b, "of", nboot, "sets"),
actual = b)
}
}
RMSPDmean <- summarise(RMSPDres,
MODEL = paste("AMMI", naxis, sep = ""),
mean = mean(RMSPD),
sd = sd(RMSPD),
se = sd(RMSPD)/sqrt(n()),
Q2.5 = quantile(RMSPD, 0.025),
Q97.5 = quantile(RMSPD, 0.975))
RMSPDres <- RMSPDres %>%
mutate(MODEL = paste("AMMI", naxis, sep = "")) %>%
dplyr::select(MODEL, everything())
}
if(!missing(block)){
data <- .data %>%
dplyr::select(ENV = {{env}},
GEN = {{gen}},
REP = {{rep}},
BLOCK = {{block}},
Y = {{resp}})
RMSPDres <- data.frame(RMSPD = matrix(0, nboot, 1))
data <- add_row_id(data)
Nenv <- nlevels(data$ENV)
Ngen <- nlevels(data$GEN)
Nbloc <- nlevels(data$REP)
if (Nbloc <= 2) {
stop("At least three replicates are required to perform the cross-validation.")
}
nrepval <- Nbloc - 1
minimo <- min(Nenv, Ngen) - 1
if (naxis > minimo) {
stop("The number of axis to be used must be lesser than or equal to ",
minimo, " [min(GEN-1;ENV-1)]")
}
test <-
data %>%
n_by(ENV, GEN) %>%
mutate(test =
case_when(Y > 0 & Y != Nbloc ~ TRUE,
TRUE ~ FALSE)
)
if(any(test$test == TRUE)){
df_test <-
data.frame(
test[which(test$test == TRUE),] %>%
remove_cols(row_id,REP, test) %>%
rename(n = Y)
)
message(paste0(capture.output(df_test), collapse = "\n"))
stop("Combinations of genotype and environment with different number of replication than observed in the trial (", Nbloc, ")", call. = FALSE)
}
if(!is_balanced_trial(data, ENV, GEN, Y)){
stop("AMMI analysis cannot be computed with unbalanced data.", call. = FALSE)
}
if (verbose == TRUE) {
pb <- progress(max = nboot, style = 4)
}
for (b in 1:nboot) {
tmp <- split_factors(data, ENV, keep_factors = TRUE, verbose = FALSE)
modeling <- do.call(rbind, lapply(tmp, function(x) {
X2 <- sample(unique(data$REP), nrepval, replace = FALSE)
x %>% dplyr::group_by(GEN) %>% dplyr::filter(REP %in% X2)
})) %>% as.data.frame()
rownames(modeling) <- modeling$row_id
testing <- anti_join(data, modeling, by = c("ENV", "GEN", "REP", "BLOCK", "Y", "row_id")) %>%
arrange(ENV, GEN) %>%
as.data.frame()
MEDIAS <- modeling %>%
group_by(ENV, GEN) %>%
summarise(Y = mean(Y)) %>%
as.data.frame()
residual <- modeling %>%
mutate(residuals = residuals(lm(Y ~ ENV/REP + REP:BLOCK + ENV + GEN, data = .))) %>%
group_by(ENV, GEN) %>%
summarise(residuals = mean(residuals)) %>%
ungroup() %>%
pull(residuals)
s <- svd(t(matrix(residual, Nenv, byrow = T)))
MGEN <- model.matrix(~factor(testing$GEN) - 1)
MENV <- model.matrix(~factor(testing$ENV) - 1)
MEDIAS %<>% mutate(Ypred = Y - residual,
ResAMMI = ((MGEN %*% s$u[, 1:naxis]) * (MENV %*% s$v[, 1:naxis])) %*% s$d[1:naxis],
YpredAMMI = Ypred + ResAMMI,
testing = testing$Y,
error = YpredAMMI - testing,
errrorAMMI0 = Ypred - testing)
RMSPDres[, 1][b] <- ifelse(naxis == 0, sqrt(sum(MEDIAS$errrorAMMI0^2)/length(MEDIAS$errrorAMMI0)),
sqrt(sum(MEDIAS$error^2)/length(MEDIAS$error)))
if (verbose == TRUE) {
run_progress(pb,
text = paste("Validating", b, "of", nboot, "sets"),
actual = b)
}
}
RMSPDmean <- summarise(RMSPDres,
MODEL = paste("a_AMMI", naxis, sep = ""),
mean = mean(RMSPD),
sd = sd(RMSPD),
se = sd(RMSPD)/sqrt(n()),
Q2.5 = quantile(RMSPD, 0.025),
Q97.5 = quantile(RMSPD, 0.975))
RMSPDres <- RMSPDres %>%
mutate(MODEL = paste("a_AMMI", naxis, sep = "")) %>%
dplyr::select(MODEL, everything())
}
return(structure(list(RMSPD = RMSPDres, RMSPDmean = RMSPDmean,
Estimated = MEDIAS, Modeling = modeling, Testing = testing),
class = "cvalidation"))
}
#' Plot the RMSPD of a cross-validation procedure
#'
#' Boxplot showing the Root Means Square Prediction Difference of of a cross
#' validation procedure.
#'
#' Five statistics are shown in this type of plot. The lower and upper hinges
#' correspond to the first and third quartiles (the 25th and 75th percentiles).
#' The upper whisker extends from the hinge to the largest value no further than
#' 1.5 * IQR from the hinge (where IQR is the inter-quartile range). The lower
#' whisker extends from the hinge to the smallest value at most 1.5 * IQR of the
#' hinge. Data beyond the end of the whiskers are considered outlying points.
#'
#' @param x An object of class `cvalidation` fitted with the functions
#' [cv_ammi()], [cv_ammif()], [cv_blup()], or a
#' bound object fitted with [bind_cv()].
#' @param violin Define if a violin plot is used with boxplot. Default is 'TRUE'
#' @param export Export (or not) the plot. Default is `T`.
#' @param order_box Logical argument. If `TRUE` then the boxplots will be
#' ordered according to the values of the RMSPD.
#' @param x.lab The label of x-axis. New arguments can be inserted as
#' `x.lab = 'my x label'`.
#' @param y.lab The label of y-axis. New arguments can be inserted as
#' `y.lab = 'my y label'`.
#' @param size.tex.lab The size of the text in axis text and labels.
#' @param file.type The type of file to be exported. Default is `pdf`,
#' Graphic can also be exported in `*.tiff` format by declaring
#' `file.type = 'tiff'`.
#' @param file.name The name of the file for exportation, default is
#' `NULL`, i.e. the files are automatically named.
#' @param plot_theme The graphical theme of the plot. Default is
#' `plot_theme = theme_metan()`. For more details,see
#' [ggplot2::theme()].
#' @param width The width 'inch' of the plot. Default is `6`.
#' @param height The height 'inch' of the plot. Default is `6`.
#' @param resolution The resolution of the plot. Parameter valid if
#' `file.type = 'tiff'` is used. Default is `300` (300 dpi)
#' @param col.violin Parameter valid if `violin = T`. Define the color of
#' the violin plot. Default is 'gray90.
#' @param col.boxplot Define the color for boxplot. Default is 'gray70'.
#' @param col.boxplot.win Define the color for boxplot of the best model.
#' Default is 'cyan'.
#' @param width.boxplot The width of boxplots. Default is `0.2`.
#' @param x.lim The range of x-axis. Default is `NULL` (maximum and minimum
#' values of the data set). New arguments can be inserted as `x.lim =
#' c(x.min, x.max)`.
#' @param x.breaks The breaks to be plotted in the x-axis. Default is
#' `authomatic breaks`. New arguments can be inserted as `x.breaks =
#' c(breaks)`
#' @param ... Currently not used.
#' @return An object of class `gg, ggplot`.
#' @author Tiago Olivoto \email{tiagoolivoto@@gmail.com}
#' @method plot cvalidation
#' @export
#' @examples
#'
#' \donttest{
#' validation <- cv_ammif(data_ge2,
#' resp = EH,
#' gen = GEN,
#' env = ENV,
#' rep = REP,
#' nboot = 5)
#' plot(validation)
#' }
#'
#'
plot.cvalidation <- function(x, violin = FALSE, export = FALSE, order_box = FALSE,
x.lab = NULL, y.lab = NULL, size.tex.lab = 12, file.type = "pdf",
file.name = NULL, plot_theme = theme_metan(), width = 6, height = 6,
resolution = 300, col.violin = "gray90", col.boxplot = "gray70",
col.boxplot.win = "cyan", width.boxplot = 0.6, x.lim = NULL,
x.breaks = waiver(), ...) {
y.lab <- ifelse(missing(y.lab),
expression(paste("Root mean square prediction difference (Mg ha"^-1, ")")),
y.lab)
x.lab <- ifelse(missing(x.lab), "Model", x.lab)
a <- suppressMessages(x$RMSPD %>%
group_by(MODEL) %>%
summarise(RMSPD = mean(RMSPD)) %>%
top_n(-1))
mod <- paste(a$MODEL[1])
if (violin == TRUE) {
dodge <- position_dodge(width = 1)
if(order_box == TRUE){
p1 <- ggplot(x$RMSPD, aes(x = reorder(MODEL, RMSPD), y = RMSPD))
} else {
p1 <- ggplot(x$RMSPD, aes(x = MODEL, y = RMSPD))
}
p1 = p1 +
geom_violin(position = dodge, fill = col.violin) +
stat_boxplot(geom = "errorbar", width=width.boxplot/2, na.rm = TRUE)+
geom_boxplot(width = width.boxplot,
position = dodge,
color = "black",
outlier.alpha = 0.75,
outlier.color = "black",
fill = col.boxplot) +
geom_boxplot(data = x$RMSPD[x$RMSPD$MODEL == mod, ],
aes(x = MODEL, y = RMSPD),
width = width.boxplot,
position = dodge,
color = "black",
outlier.alpha = 0.75,
outlier.color = "black",
fill = col.boxplot.win) +
stat_summary(fun = mean,
geom = "point",
shape = 23,
fill = "black") +
plot_theme %+replace%
theme(axis.text = element_text(size = size.tex.lab, colour = "black"),
axis.title = element_text(size = size.tex.lab, colour = "black")) +
coord_flip() +
scale_y_continuous(limits = x.lim, breaks = x.breaks) +
labs(x = x.lab, y = y.lab)
} else {
dodge <- position_dodge(width = 1)
if(order_box == TRUE){
p1 <- ggplot(x$RMSPD, aes(x = reorder(MODEL, RMSPD), y = RMSPD))
} else {
p1 <- ggplot(x$RMSPD, aes(x = MODEL, y = RMSPD))
}
p1 = p1 +
stat_boxplot(geom = "errorbar", width=width.boxplot/2, na.rm = TRUE)+
geom_boxplot(width = width.boxplot,
position = dodge,
color = "black",
outlier.alpha = 0.75,
outlier.color = "black",
fill = col.boxplot) +
geom_boxplot(data = x$RMSPD[x$RMSPD$MODEL == mod, ],
aes(x = MODEL, y = RMSPD),
width = width.boxplot,
position = dodge,
outlier.alpha = 0.75,
color = "black",
outlier.color = "black",
fill = col.boxplot.win) +
stat_summary(fun = mean,
geom = "point",
shape = 23,
fill = "black") +
plot_theme %+replace%
theme(axis.text = element_text(size = size.tex.lab, colour = "black"),
axis.title = element_text(size = size.tex.lab, colour = "black")) +
coord_flip() +
scale_y_continuous(limits = x.lim, breaks = x.breaks) +
labs(x = x.lab, y = y.lab)
}
if (export == FALSE) {
return(p1)
} else if (file.type == "pdf") {
if (is.null(file.name)) {
pdf("RMSPD validation.pdf", width = width, height = height)
} else pdf(paste0(file.name, ".pdf"), width = width, height = height)
plot(p1)
dev.off()
}
if (file.type == "tiff") {
if (is.null(file.name)) {
tiff(filename = "RMSPD validation.tiff", width = width,
height = height, units = "in", compression = "lzw",
res = resolution)
} else tiff(filename = paste0(file.name, ".tiff"), width = width,
height = height, units = "in", compression = "lzw",
res = resolution)
plot(p1)
dev.off()
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.