Nothing
#' @eval document_LSimpute("gene")
#'
#' @details
#'
#' Bo et al. (2004) seem to have chosen `min_common_obs = 5`. However, they did
#' not document this behavior. This value emerged from inspecting
#' imputation results from the original jar-file, which is provided by Bo et
#' al. (2004).
#'
#' If there are less than `min_common_obs` observed values in a row and at least
#' one observed value, the mean of the observed row values is imputed. If no
#' value is observed in a row, the observed column means are imputed for the
#' missing row values. This is the only known difference between this function
#' and the original one from Bo et al. (2004). The original function would not
#' impute such a row and return a dataset with missing values in this row. There
#' is one more case that needs a special treatment: If no suitable row can be
#' found to impute a row, the mean of the observed values is imputed, too. If
#' `verbose = TRUE`, a message will be given for the encountered instances of
#' the described special cases. If `verbose = FALSE`, the function will deal
#' with theses cases silently.
#'
#'
#' @param k Number of most correlated genes used for the imputation of a gene.
#' @param eps Used in the calculation of the weights (Bo et al. (2004) used
#' `eps = 1e-6`).
#' @param min_common_obs A row can only take part in the imputation of another
#' row, if both rows share at least `min_common_obs` columns with no missing
#' values.
#' @param return_r_max Logical; normally, this should be `FALSE`. `TRUE` is
#' used inside of `impute_LS_adaptive()` to speed up some computations.
#' @param verbose Should messages be given for special cases (see details)?
#'
#' @return If `return_r_max = TRUE`, a list with the imputed dataset and r_max.
#' @md
impute_LS_gene <- function(ds, k = 10, eps = 1e-6, min_common_obs = 5,
return_r_max = FALSE, verbose = FALSE) {
## Check some arguments -----------------------------------------------------
if (k >= nrow(ds)) {
stop("k must be smaller as nrow(ds)")
}
if (min_common_obs < 3) {
stop(
"min_common_obs should be bigger as 2 to allow calculations for ",
"correlations and regression models"
)
}
## Define some variables for later ------------------------------------------
if (return_r_max) {
r_max_matrix <- matrix(NA, nrow = nrow(ds), ncol = ncol(ds))
}
ds_mat_original <- as.matrix(ds)
ds_imp <- ds_mat_original
M <- is.na(ds)
rows_imp_with_colMeans <- integer(0)
rows_imp_with_rowMeans_min_obs <- integer(0)
rows_imp_with_rowMeans_no_suitable <- integer(0)
## Impute row by row --------------------------------------------------------
for (i in seq_len(nrow(ds))) {
M_i <- M[i, ]
if (any(M_i)) { # only impute, if any missing value in row i
## Check for enough observations in row i -------------------------------
if (all(M_i)) { # all values in row i are missing
# Bo et al. do not impute in this case, they return ds with NA values!
ds_imp[i, ] <- colMeans(ds, na.rm = TRUE)
rows_imp_with_colMeans <- c(rows_imp_with_colMeans, i)
} else if (sum(!M_i) < min_common_obs) { # less than min_common_obs observed values in row i
# Bo et al. impute all values in these rows with the mean of the observed row values
# (source: try and error with original jar-file, see test-file)
ds_imp[i, M_i] <- mean(ds[i, !M_i])
rows_imp_with_rowMeans_min_obs <- c(rows_imp_with_rowMeans_min_obs, i)
} else {
## Enough observations in row i ------------------------------------
# At least min_common_obs (>= 3) observed values in row i
# -> proceed with "normal" LSimpute_gene
# Possible candidate rows (common obs!) for imputing and their correlation
rows_candidates <- find_rows_candidates(ds_mat_original, i, M, M_i, min_common_obs)
# Check if at least one row candidate is found
if (nrow(rows_candidates) == 0) { # no rows_candidates found -> impute mean
# This condition may crashes the jar-file from Bo et al. (2004)
ds_imp[i, M_i] <- mean(ds[i, ], na.rm = TRUE)
rows_imp_with_rowMeans_no_suitable <- c(rows_imp_with_rowMeans_no_suitable, i)
} else { # at least one candidate row -> proceed with "normal" LSimpute_gene
# Save calculated regression coefficients (later)
betas <- matrix(NA_real_, nrow = nrow(rows_candidates), ncol = 2)
# Idea for betas: colnames(betas) <- c("beta_0", "beta_1")
## Impute value by value in row i -----------------------------------
for (j_ind in which(M_i)) {
# find the k suitable rows for imputation and save their index from rows_candidates
suitable_logic <- !M[rows_candidates$row_index, j_ind] # No NA in j_ind -> suitable
suitable_index <- which(suitable_logic)
suitable_index <- suitable_index[seq_len(min(k, length(suitable_index)))]
if (length(suitable_index) == 0) { # no suitable row found -> impute mean
# This condition may crashes the jar-file from Bo et al. (2004)
ds_imp[i, M_i] <- mean(ds[i, ], na.rm = TRUE)
rows_imp_with_rowMeans_no_suitable <- c(rows_imp_with_rowMeans_no_suitable, i)
} else { # everything fine -> proceed with "normal" LSimpute_gene
if (return_r_max) { # save highest correlation
r_max_matrix[i, j_ind] <- rows_candidates$similarity[suitable_index[1]]
}
## Calculate imputation values for all k neighbors --------------
# Prior to that: check, if any regression coefficient is unknown
row_indices <- rows_candidates$row_index[suitable_index]
for (j in seq_along(row_indices)) { # check for unknown regression parameters
if (is.na(betas[suitable_index[j], 1])) { # calculate regression parameters, if not already known
common_observed <- !(M_i | M[row_indices[j], ]) # this will be at least min_common_obs (>= 3) TRUEs, (requirement for suitable) -> regression possible
betas[suitable_index[j], ] <- calc_lm_coefs_simple_reg(
ds_mat_original[i, common_observed],
ds_mat_original[row_indices[j], common_observed]
)
}
}
# Calculate the imputation values
y <- betas[suitable_index, 1] + betas[suitable_index, 2] * ds[row_indices, j_ind]
## Calculate weights ---------------------------------------------
similarities_sq <- rows_candidates$similarity[suitable_index]^2
w <- (similarities_sq /
(1 - similarities_sq + eps))^2
w <- w / sum(w)
## Calculate final imputation value -------------------------------
ds_imp[i, j_ind] <- sum(w * y)
}
}
}
}
}
}
## Messages, if verbose = TRUE ----------------------------------------------
if (verbose) {
if (length(rows_imp_with_colMeans) > 0) {
message(
"No observed value in row(s) ",
paste(rows_imp_with_colMeans, collapse = ", "), ". ",
"These rows were imputed with column means.",
appendLF = FALSE
)
}
if (length(rows_imp_with_rowMeans_min_obs) > 0) {
message(
"Not enough observed values in row(s) ",
paste(rows_imp_with_rowMeans_min_obs, collapse = ", "), ". ",
"These rows were imputed with osbserved row means.",
appendLF = FALSE
)
}
if (length(rows_imp_with_rowMeans_no_suitable) > 0) {
message(
"No suitable row for the imputation of (parts of) row(s) ",
paste(rows_imp_with_rowMeans_no_suitable, collapse = ", "), " found. ",
"These rows (or parts of them) were imputed with osbserved row means.",
appendLF = FALSE
)
}
}
## Return value -------------------------------------------------------------
ds <- assign_imputed_values(ds, ds_imp, M)
if (return_r_max) {
return(list(imp = ds, r_max = r_max_matrix))
} else {
return(ds)
}
}
## Helpers for LSimpute_gene --------------------------------------------------
calc_common_obs <- function(M, M_i) {
nr_vars <- ncol(M)
nr_vars - colSums(t(M) | M_i)
}
#' Simple linear regression
#'
#' @param y numeric vector
#' @param x numeric vector
#'
#' @return a vector: first element beta_0, second beta_1
#' @noRd
calc_lm_coefs_simple_reg <- function(y, x) {
sum_x <- sum(x)
sum_y <- sum(y)
sum_xy <- sum(x * y)
sum_x_sq <- sum(x^2)
n <- length(x)
beta_1 <- (sum_xy - sum_x * sum_y / n) / (sum_x_sq - sum_x^2 / n)
beta_0 <- sum_y / n - beta_1 * sum_x / n
c(beta_0, beta_1)
}
calc_similarity <- function(ds, y) {
similarity_i <- abs(stats::cor(t(ds), y, use = "pairwise.complete.obs"))
as.vector(similarity_i)
}
#' Find candidates genes/rows for the imputation of a row i
#'
#' @param ds whole dataset
#' @param i index of a row of ds, for which candidates should be found
#' @param M missing data indicator matrix
#' @param M_i missing data indicator vector of row i
#' @param min_common_obs documented in impute_LS_gene()
#'
#' @return a data frame with two columns. First column: indices of candidates;
#' second column: similarity (= abs(correlation)) between row i and candidate rows
#' data frame is ordered (decreasing) by similarity
#' @noRd
find_rows_candidates <- function(ds, i, M = is.na(ds), M_i = M[i, ], min_common_obs) {
rows_candidates_index <- which(calc_common_obs(M, M_i) >= min_common_obs & seq_len(nrow(ds)) != i)
if (length(rows_candidates_index) == 0) { # no rows_candidates found
return(data.frame(row_index = integer(0)))
}
# Calculate correlation
similarity_i <- calc_similarity(ds[rows_candidates_index, ], ds[i, ])
# Order rows by similarity
orderd_index <- order(similarity_i, decreasing = TRUE)
data.frame(
row_index = rows_candidates_index[orderd_index],
similarity = similarity_i[orderd_index]
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.