Nothing
#' Cross-validation procedure
#' @description
#' `r badge('stable')`
#'
#' Cross-validation for estimation of all AMMI-family models
#'
#' `cv_ammif` provides a complete cross-validation of replicate-based data
#' using AMMI-family models. By default, the first validation is carried out
#' considering the AMMIF (all possible axis used). Considering this model, the
#' original dataset is split up into two datasets: training set and validation
#' set. The 'training' set has all combinations (genotype x environment) with
#' N-1 replications. The 'validation' set has the remaining replication. The
#' splitting of the dataset into modeling and validation sets depends on the
#' design informed. For Completely Randomized Block Design (default), and
#' alpha-lattice design (declaring `block` arguments), complete replicates
#' are selected within environments. The remained replicate serves as validation
#' data. If `design = 'RCD'` is informed, completely randomly samples are
#' made for each genotype-by-environment combination (Olivoto et al. 2019). The
#' estimated values for each member of the AMMI-family model are compared with
#' the 'validation' data. The Root Mean Square Prediction Difference (RMSPD) is
#' computed. At the end of boots, a list is returned.
#'
#' **IMPORTANT:** If the data set is unbalanced (i.e., any genotype missing
#' in any environment) the function will return an error. An error is also
#' observed if any combination of genotype-environment has a different number of
#' replications than observed in the trial.
#'
#' @param .data The dataset containing the columns related to Environments,
#' Genotypes, replication/block and response variable(s).
#' @param env The name of the column that contains the levels of the
#' environments.
#' @param gen The name of the column that contains the levels of the genotypes.
#' @param rep The name of the column that contains the levels of the
#' replications/blocks. **AT LEAST THREE REPLICATES ARE REQUIRED TO
#' PERFORM THE CROSS-VALIDATION**.
#' @param resp The response variable.
#' @param block Defaults to `NULL`. In this case, a randomized complete
#' block design is considered. If block is informed, then a resolvable
#' alpha-lattice design (Patterson and Williams, 1976) is employed.
#' **All effects, except the error, are assumed to be fixed.**
#' @param nboot The number of resamples to be used in the cross-validation.
#' Defaults to 200.
#' @param design The experimental design used in each environment. Defaults to
#' `RCBD` (Randomized complete Block Design). For Completely Randomized
#' Designs inform `design = 'CRD'`.
#' @param verbose A logical argument to define if a progress bar is shown.
#' Default is `TRUE`.
#' @references Patterson, H.D., and E.R. Williams. 1976. A new class of
#' resolvable incomplete block designs. Biometrika 63:83-92.
#'
#' @return An object of class `cv_ammif` with the following items:
#' * **RMSPD**: A vector with nboot-estimates of the Root Mean Squared
#' Prediction Difference between predicted and validating data.
#'
#' * **RMSPDmean**: The mean of RMSPDmean estimates.
#'
#' * **Estimated**: A data frame that contain the values (predicted, observed,
#' validation) of the last loop.
#'
#' * **Modeling**: The dataset used as modeling data in the last loop
#'
#' * **Testing**: The dataset used as testing data in the last loop.
#' @md
#' @author Tiago Olivoto \email{tiagoolivoto@@gmail.com}
#' @seealso [cv_ammi()], [cv_blup()]
#' @export
#' @examples
#'
#' \donttest{
#' library(metan)
#' model <- cv_ammif(data_ge2,
#' env = ENV,
#' gen = GEN,
#' rep = REP,
#' resp = EH,
#' nboot = 5)
#' plot(model)
#' }
#'
cv_ammif <- function(.data, env, gen, rep, resp, nboot = 200, block, design = "RCBD", verbose = TRUE) {
if(missing(block)){
if (!design %in% c("RCBD", "CRD")) {
stop("Incorrect experimental design informed! Plesease inform RCBD for randomized complete block or CRD for completely randomized design.")
}
data <- .data %>%
dplyr::select(ENV = {{env}},
GEN = {{gen}},
REP = {{rep}},
Y = {{resp}})%>%
mutate(across(1:3, as.factor))
RMSPDres <- data.frame(RMSPD = matrix(0, nboot, 1))
data <- add_row_id(data)
Nenv <- length(unique(data$ENV))
Ngen <- length(unique(data$GEN))
Nbloc <- length(unique(data$REP))
if (Nbloc <= 2) {
stop("At least three replicates are required to perform the cross-validation.")
}
nrepval <- Nbloc - 1
minimo <- min(Nenv, Ngen) - 1
naxisvalidation <- minimo + 1
totalboot <- naxisvalidation * nboot
initial <- 0
NAXIS <- minimo
AMMIval <- data.frame(matrix(NA, nboot, 1))
test <-
data %>%
n_by(ENV, GEN) %>%
mutate(test =
case_when(Y > 0 & Y != Nbloc ~ TRUE,
TRUE ~ FALSE)
)
if(any(test$test == TRUE)){
df_test <-
data.frame(
test[which(test$test == TRUE),] %>%
remove_cols(row_id,REP, test) %>%
rename(n = Y)
)
message(paste0(capture.output(df_test), collapse = "\n"))
stop("Combinations of genotype and environment with different number of replication than observed in the trial (", Nbloc, ")", call. = FALSE)
}
if(!is_balanced_trial(data, ENV, GEN, Y)){
stop("AMMI analysis cannot be computed with unbalanced data.", call. = FALSE)
}
if (verbose == TRUE) {
pb <- progress(max = totalboot, style = 4)
}
for (y in 1:naxisvalidation) {
RMSPDres <- data.frame(matrix(NA, nboot, 1))
for (b in 1:nboot) {
if (design == "CRD") {
X <- sample(1:10000, 1)
set.seed(X)
modeling <- data %>%
dplyr::group_by(ENV, GEN) %>%
dplyr::sample_n(nrepval, replace = FALSE) %>%
arrange(row_id) %>%
as.data.frame()
rownames(modeling) <- modeling$row_id }
if (design == "RCBD") {
tmp <- split_factors(data, ENV, keep_factors = TRUE, verbose = FALSE)
modeling <- do.call(rbind, lapply(tmp, function(x) {
X2 <- sample(unique(data$REP), nrepval, replace = FALSE)
x %>% as.data.frame() %>%
dplyr::group_by(GEN) %>%
dplyr::filter(REP %in% X2)
})) %>% as.data.frame()
rownames(modeling) <- modeling$row_id
}
testing <- anti_join(data, modeling, by = c("ENV", "GEN", "REP", "Y", "row_id")) %>%
arrange(ENV, GEN) %>%
as.data.frame()
MEDIAS <-
modeling %>%
mean_by(ENV, GEN) %>%
as.data.frame()
residual <-
modeling %>%
mean_by(ENV, GEN) %>%
mutate(residuals = residuals(lm(Y ~ ENV + GEN, data = .))) %>%
pull(residuals)
s <- svd(t(matrix(residual, Nenv, byrow = T)))
MGEN <- model.matrix(~factor(testing$GEN) - 1)
MENV <- model.matrix(~factor(testing$ENV) - 1)
MEDIAS %<>% mutate(Ypred = Y - residual,
ResAMMI = ((MGEN %*% s$u[, 1:NAXIS]) * (MENV %*% s$v[, 1:NAXIS])) %*% s$d[1:NAXIS],
YpredAMMI = Ypred + ResAMMI,
testing = testing$Y,
error = YpredAMMI - testing,
errrorAMMI0 = Ypred - testing)
RMSPDres[, 1][b] <- ifelse(NAXIS == 0, sqrt(sum(MEDIAS$errrorAMMI0^2)/length(MEDIAS$errrorAMMI0)),
sqrt(sum(MEDIAS$error^2)/length(MEDIAS$error)))
ACTUAL <- ifelse(NAXIS == minimo, "AMMIF", sprintf("AMMI%.0f", NAXIS))
initial <- initial + 1
if (verbose == TRUE) {
run_progress(pb,
text = paste(b, "of", nboot, "sets using", ACTUAL),
actual = initial)
}
}
if (NAXIS == minimo) {
AMMIval[["AMMIF"]] <- RMSPDres[, 1]
} else AMMIval[[sprintf("AMMI%.0f", NAXIS)]] <- RMSPDres[, 1]
NAXIS <- NAXIS - 1
initial <- initial
}
RMSPD <- stack(AMMIval[, -c(1)]) %>% dplyr::select(ind, everything())
names(RMSPD) <- c("MODEL", "RMSPD")
RMSPDmean <- RMSPD %>%
group_by(MODEL) %>%
summarise(mean = mean(RMSPD),
sd = sd(RMSPD),
se = sd(RMSPD)/sqrt(n()),
Q2.5 = quantile(RMSPD, 0.025),
Q97.5 = quantile(RMSPD, 0.975),
.groups = "drop") %>%
arrange(mean)
return(
structure(
list(RMSPD = RMSPD,
RMSPDmean = RMSPDmean,
Estimated = MEDIAS,
Modeling = modeling,
Testing = testing),
class = "cvalidation")
)
}
if(!missing(block)){
data <- .data %>%
dplyr::select(ENV = {{env}},
GEN = {{gen}},
REP = {{rep}},
BLOCK = {{block}},
Y = {{resp}})
RMSPDres <- data.frame(RMSPD = matrix(0, nboot, 1))
data <- add_row_id(data)
Nenv <- length(unique(data$ENV))
Ngen <- length(unique(data$GEN))
Nbloc <- length(unique(data$REP))
if (Nbloc <= 2) {
stop("At least three replicates are required to perform the cross-validation.")
}
nrepval <- Nbloc - 1
minimo <- min(Nenv, Ngen) - 1
naxisvalidation <- minimo + 1
totalboot <- naxisvalidation * nboot
initial <- 0
NAXIS <- minimo
AMMIval <- data.frame(matrix(NA, nboot, 1))
test <-
data %>%
n_by(ENV, GEN) %>%
mutate(test =
case_when(Y > 0 & Y != Nbloc ~ TRUE,
TRUE ~ FALSE)
)
if(any(test$test == TRUE)){
df_test <-
data.frame(
test[which(test$test == TRUE),] %>%
remove_cols(row_id,REP, test) %>%
rename(n = Y)
)
message(paste0(capture.output(df_test), collapse = "\n"))
stop("Combinations of genotype and environment with different number of replication than observed in the trial (", Nbloc, ")", call. = FALSE)
}
if(!is_balanced_trial(data, ENV, GEN, Y)){
stop("AMMI analysis cannot be computed with unbalanced data.", call. = FALSE)
}
if (verbose == TRUE) {
pb <- progress(max = totalboot, style = 4)
}
for (y in 1:naxisvalidation) {
RMSPDres <- data.frame(matrix(NA, nboot, 1))
for (b in 1:nboot) {
if (design == "CRD") {
X <- sample(1:10000, 1)
set.seed(X)
modeling <- data %>%
dplyr::group_by(ENV, GEN) %>%
dplyr::sample_n(nrepval, replace = FALSE) %>%
arrange(row_id) %>%
as.data.frame()
rownames(modeling) <- modeling$row_id }
if (design == "RCBD") {
tmp <- split_factors(data, ENV, keep_factors = TRUE, verbose = FALSE)
modeling <- do.call(rbind, lapply(tmp, function(x) {
X2 <- sample(unique(data$REP), nrepval, replace = FALSE)
x %>% dplyr::group_by(GEN) %>% dplyr::filter(REP %in% X2)
})) %>% as.data.frame()
rownames(modeling) <- modeling$row_id
}
testing <- anti_join(data, modeling, by = c("ENV", "GEN", "REP", "BLOCK", "Y", "row_id")) %>%
arrange(ENV, GEN) %>%
as.data.frame()
MEDIAS <-
modeling %>%
mean_by(ENV, GEN) %>%
as.data.frame()
residual <- modeling %>%
mutate(residuals = residuals(lm(Y ~ ENV/REP + REP:BLOCK + ENV + GEN, data = .))) %>%
mean_by(ENV, GEN) %>%
pull(residuals)
s <- svd(t(matrix(residual, Nenv, byrow = T)))
MGEN <- model.matrix(~factor(testing$GEN) - 1)
MENV <- model.matrix(~factor(testing$ENV) - 1)
MEDIAS %<>% mutate(Ypred = Y - residual,
ResAMMI = ((MGEN %*% s$u[, 1:NAXIS]) * (MENV %*% s$v[, 1:NAXIS])) %*% s$d[1:NAXIS],
YpredAMMI = Ypred + ResAMMI,
testing = testing$Y,
error = YpredAMMI - testing,
errrorAMMI0 = Ypred - testing)
RMSPDres[, 1][b] <- ifelse(NAXIS == 0, sqrt(sum(MEDIAS$errrorAMMI0^2)/length(MEDIAS$errrorAMMI0)),
sqrt(sum(MEDIAS$error^2)/length(MEDIAS$error)))
ACTUAL <- ifelse(NAXIS == minimo, "AMMIF", sprintf("AMMI%.0f", NAXIS))
initial <- initial + 1
if (verbose == TRUE) {
run_progress(pb,
text = paste(b, "of", nboot, "sets using", ACTUAL),
actual = initial)
}
}
if (NAXIS == minimo) {
AMMIval[["AMMIF"]] <- RMSPDres[, 1]
} else AMMIval[[sprintf("AMMI%.0f", NAXIS)]] <- RMSPDres[, 1]
NAXIS <- NAXIS - 1
initial <- initial
}
RMSPD <- stack(AMMIval[, -c(1)]) %>% dplyr::select(ind, everything())
names(RMSPD) <- c("MODEL", "RMSPD")
RMSPDmean <- RMSPD %>%
group_by(MODEL) %>%
summarise(mean = mean(RMSPD),
sd = sd(RMSPD),
se = sd(RMSPD)/sqrt(n()),
Q2.5 = quantile(RMSPD, 0.025),
Q97.5 = quantile(RMSPD, 0.975),
.groups = "drop") %>%
arrange(mean)
return(
structure(
list(RMSPD = RMSPD,
RMSPDmean = RMSPDmean,
Estimated = MEDIAS,
Modeling = modeling,
Testing = testing),
class = "cvalidation")
)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.