#' Missing value imputation
#' @description
#' `r badge('stable')`
#'
#' Impute the missing entries of a matrix with missing values using different
#' algorithms. See **Details** section for more details
#' @details
#' **`EM-AMMI` algorithm**
#'
#' The `EM-AMMI` algorithm completes a data set with missing values according to both
#' main and interaction effects. The algorithm works as follows (Gauch and
#' Zobel, 1990):
#' 1. The initial values are calculated as the grand mean increased by main
#' effects of rows and main effects of columns. That way, the matrix of
#' observations is pre-filled in.
#' 2. The parameters of the AMMI model are estimated.
#' 3. The adjusted means are calculated based on the AMMI model with
#' `naxis` principal components.
#' 4. The missing cells are filled with the adjusted means.
#' 5. The root mean square error of the predicted values (`RMSE_p`) is
#' calculated with the two lasts iteration steps. If `RMSE_p > tol`, the
#' steps 2 through 5 are repeated. Declare convergence if `RMSE_p < tol`.
#' If `max_iter` is achieved without convergence, the algorithm will stop
#' with a warning.
#'
#' **`EM-SVD` algorithm**
#'
#' The `EM-SVD` algorithm impute the missing entries using a low-rank Singular
#' Value Decomposition approximation estimated by the Expectation-Maximization
#' algorithm. The algorithm works as follows (Troyanskaya et al., 2001).
#' 1. Initialize all `NA` values to the column means.
#' 2. Compute the first `naxis` terms of the SVD of the completed matrix
#' 3. Replace the previously missing values with their approximations from the SVD
#' 4. The root mean square error of the predicted values (`RMSE_p`) is
#' calculated with the two lasts iteration steps. If `RMSE_p > tol`, the
#' steps 2 through 3 are repeated. Declare convergence if `RMSE_p < tol`.
#' If `max_iter` is achieved without convergence, the algorithm will stop
#' with a warning.
#'
#' **`colmeans` algorithm**
#'
#' The `colmeans` algorithm simply impute the missing entires using the
#' column mean of the respective entire. Thus, there is no iteractive process.
#'
#'
#' @md
#'
#' @param .data A matrix to impute the missing entries. Frequently a two-way
#' table of genotype means in each environment.
#' @param naxis The rank of the Singular Value Approximation. Defaults to `1`.
#' @param algorithm The algorithm to impute missing values. Defaults to
#' `"EM-SVD"`. Other possible values are `"EM-AMMI"` and
#' `"colmeans"`. See **Details** section.
#' @param tol The convergence tolerance for the algorithm.
#' @param max_iter The maximum number of steps to take. If `max_iter` is
#' achieved without convergence, the algorithm will stop with a warning.
#' @param simplified Valid argument when `algorithm = "EM-AMMI"`. IF
#' `FALSE` (default), the current effects of rows and columns change from
#' iteration to iteration. If `TRUE`, the general mean and effects of
#' rows and columns are computed in the first iteration only, and in next
#' iterations uses these values.
#' @param verbose Logical argument. If `verbose = FALSE` the code will run
#' silently.
#'
#' @return An object of class `imv` with the following values:
#'
#' * **.data** The imputed matrix
#' * **pc_ss** The sum of squares representing variation explained by the
#' principal components
#' * **iter** The final number of iterations.
#' * **Final_RMSE** The maximum change of the estimated values for missing cells in the last step of iteration.
#' * **final_axis** The final number of principal component axis.
#' * **convergence** Logical value indicating whether the modern converged.
#' @export
#'
#' @references
#' Gauch, H. G., & Zobel, R. W. (1990). Imputing missing yield trial data.
#' Theoretical and Applied Genetics, 79(6), 753-761.
#' \doi{10.1007/BF00224240}
#'
#' Troyanskaya, O., Cantor, M., Sherlock, G., Brown, P., Hastie, T., Tibshirani,
#' R., . Altman, R. B. (2001). Missing value estimation methods for DNA
#' microarrays. Bioinformatics, 17(6), 520-525.
#'
#'
#' @examples
#' \donttest{
#' library(metan)
#' mat <- (1:20) %*% t(1:10)
#' mat
#' # 10% of missing values at random
#' miss_mat <- random_na(mat, prop = 10)
#' miss_mat
#' mod <- impute_missing_val(miss_mat)
#' mod$.data
#' }
impute_missing_val <- function(.data,
naxis = 1,
algorithm = "EM-SVD",
tol = 1e-10,
max_iter = 1000,
simplified = FALSE,
verbose = TRUE){
if(!has_na(.data)){
stop("No missing values found in data.")
}
if(!algorithm %in% c("EM-AMMI", "EM-SVD", "colmeans")){
stop("The algorithm must be of of 'EM-AMMI', 'EM-SVD', and 'colmeans")
}
if(algorithm == "EM-AMMI"){
.data <- as.matrix(.data)
missing <- matrix(1, nrow(.data), ncol(.data))
missing[is.na(.data)] <- 0
max_ipc <- min(c(rowSums(missing), colSums(missing))) - 1
axis_used <- ifelse(max_ipc < naxis, max_ipc, naxis)
data_in <- .data
data_mean <- mean(c(.data), na.rm = TRUE)
row_m <- matrix(rowMeans(.data, na.rm = TRUE), nrow(.data),ncol(.data))
col_m <- t(matrix(colMeans(.data,na.rm = TRUE), ncol(.data),nrow(.data)))
estimated <- (-data_mean) + row_m + col_m
data_in[is.na(data_in)] <- estimated[is.na(.data)]
iter <- 1
new_data <- data_in
change <- tol + 1
while((change > tol) & (iter < max_iter)){
if (iter == 1 | !simplified){
grand_mean <- mean(new_data)
int_mat <- new_data - grand_mean
int_mat <- scale(int_mat, scale = FALSE)
col_center <- attr(int_mat,"scaled:center")
int_mat <- t(scale(t(int_mat), scale = FALSE))
row_center <- attr(int_mat,"scaled:center")
} else {
int_mat <- new_data - grand_mean - row_eff - col_eff
}
if (axis_used >= 1){
SVD <- La.svd(int_mat)
diag_l <- diag(SVD$d, nrow = axis_used)
d <- SVD$d[1:axis_used]
u <- SVD$u[,1:axis_used]
v <- SVD$v[1:axis_used,]
adj_val <- u %*% diag_l %*% v
} else {
adj_val <- 0
}
if(iter == 1 | !simplified){
row_eff <- matrix(row_center, nrow(.data), ncol(.data))
col_eff <- t(matrix(col_center,ncol(.data), nrow(.data)))
}
new_data2 <- new_data
new_data2[is.na(.data)] <- (grand_mean + row_eff + col_eff + adj_val)[is.na(.data)]
change <- sqrt(mean((new_data2 - new_data)[is.na(.data)]^2))
iter <- iter + 1
new_data <- new_data2
}
}
if(algorithm == "EM-SVD"){
data_in <- as.matrix(replace_na(data.frame(.data), replacement = "colmean"))
max_ipc <- min(nrow(data_in), ncol(data_in)) - 1
axis_used <- ifelse(max_ipc < naxis, max_ipc, naxis)
iter <- 1
new_data <- data_in
change <- tol + 1
while((change > tol) & (iter < max_iter)){
SVD <- La.svd(new_data)
diag_l <- diag(SVD$d, nrow = axis_used)
d <- SVD$d[1:axis_used]
u <- SVD$u[,1:axis_used]
v <- SVD$v[1:axis_used,]
adj_val <- u %*% diag_l %*% v
new_data2 <- new_data
new_data2[is.na(.data)] <- adj_val[is.na(.data)]
change <- sqrt(mean((new_data2 - new_data)[is.na(.data)]^2))
iter <- iter + 1
new_data <- new_data2
}
}
if(algorithm == "colmeans"){
new_data <- as.matrix(replace_na(data.frame(.data), replacement = "colmean"))
pc_ss <- NULL
iter <- NULL
final_tol <- NULL
final_naxis <- NULL
converged <- NULL
change <- NULL
axis_used <- NULL
}
if(algorithm %in% c("EM-AMMI", 'EM-SVD')){
pc_ss <- d^2
converged <- ifelse(change <= tol, TRUE, FALSE)
if (axis_used < naxis){
converged = FALSE
}
if (axis_used == 0){
SVD <- list(d = 0)
}
if(verbose == TRUE){
cat("----------------------------------------------\n")
cat("Convergence information\n")
cat("----------------------------------------------\n")
cat("Number of iterations:", iter)
cat("\nFinal RMSE:", change)
cat("\nNumber of axis:", axis_used)
cat("\nConvergence:", converged)
cat("\n----------------------------------------------\n")
}
if(!converged){
warning("Maximum number of iterations achieved without convergence.\nTo increase the number of iterations use 'max_iter'.", call. = FALSE)
}
}
return(list(
.data = as.matrix(new_data),
pc_ss = pc_ss,
iter = iter,
Final_RMSE = change,
final_naxis = axis_used,
convergence = converged) %>% set_class("imv"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.