# https://github.com/pcastellanoescuder/POMA/blob/master/R/PomaImpute.R
# https://github.com/WandeRum/MVI-evaluation/blob/master/Impute_wrapper.R
#' @title Imputation methods on missing value
#'
#' @description
#' This function offers different methods to impute missing values in data.
#'
#' @references Armitage, E. G., Godzien, J., Alonso‐Herranz, V., López‐Gonzálvez,
#' Á., & Barbas, C. (2015). Missing value imputation strategies for metabolomics
#' data. Electrophoresis, 36(24), 3050-3060.
#'
#' @author Created by Pol Castellano-Escuder;
#' Modified by Hua Zou (12/02/2022 Shenzhen China)
#'
#' @param object (Required). a [`phyloseq::phyloseq-class`] or
#' [`SummarizedExperiment::SummarizedExperiment`] object.
#' @param level (Optional). character. Summarization
#' level (from \code{rank_names(pseq)}, default: NULL).
#' @param group (Required). character. group for determining missing values.
#' @param ZerosAsNA (Optional). logical. zeros in the data are missing
#' values (default: FALSE).
#' @param RemoveNA (Optional). logical. those features with more than selected
#' cutoff missing values in each group have to be removed (default: TRUE).
#' @param cutoff (Optional). numeric. percentage of missing values allowed in
#' each group. If one of the groups have less missing values than selected
#' cutoff value, these feature will not be removed.
#' @param method (Optional). character. Imputation method. Options are:
#' * "none": all missing values will be replaced by zero.
#' * "LOD": specific Limit Of Detection which provides by user.
#' * "half_min": half minimal values across samples except zero.
#' * "median": median values across samples except zero.
#' * "mean": mean values across samples except zero.
#' * "min": minimal values across samples except zero.
#' * "knn": k-nearest neighbors samples.
#' * "rf": nonparametric missing value imputation using Random Forest.
#' * "global_mean": a normal distribution with a mean that is down-shifted from
#' the sample mean and a standard deviation that is a
#' fraction of the standard deviation of the sample distribution.
#' * "svd": missing values imputation based Singular value decomposition.
#' * "QRILC": missing values imputation based quantile regression.
#' (default: "none").
#' @param LOD (Optional). Numeric. limit of detection (default: NULL).
#' @param knum (Optional). Numeric. Number of neighbors to be used in the
#' imputation (default=10).
#'
#' @usage impute_abundance(
#' object,
#' level = c(NULL, "Kingdom", "Phylum", "Class",
#' "Order", "Family", "Genus",
#' "Species", "Strain", "unique"),
#' group,
#' ZerosAsNA = FALSE,
#' RemoveNA = TRUE,
#' cutoff = 20,
#' method = c("none", "LOD", "half_min", "median",
#' "mean", "min", "knn", "rf",
#' "global_mean", "svd", "QRILC"),
#' LOD = NULL,
#' knum = 10)
#'
#' @export
#'
#' @return A [`phyloseq::phyloseq-class`] or
#' [`SummarizedExperiment::SummarizedExperiment`] object with cleaned data.
#'
#' @importFrom dplyr %>%
#' @importFrom SummarizedExperiment colData assay
#'
#' @examples
#'
#' \dontrun{
#' # phyloseq object
#' data("Zeybel_2022_gut")
#' impute_abundance(
#' Zeybel_2022_gut,
#' level = "Phylum",
#' group = "LiverFatClass",
#' ZerosAsNA = TRUE,
#' RemoveNA = TRUE,
#' cutoff = 20,
#' method = "knn")
#'
#' # SummarizedExperiment object
#' data("Zeybel_2022_protein")
#' impute_abundance(
#' Zeybel_2022_protein,
#' group = "LiverFatClass",
#' ZerosAsNA = TRUE,
#' RemoveNA = TRUE,
#' cutoff = 20,
#' method = "knn")
#' }
#'
impute_abundance <- function(
object,
level = NULL,
group,
ZerosAsNA = FALSE,
RemoveNA = TRUE,
cutoff = 20,
method = c("none", "LOD", "half_min", "median",
"mean", "min", "knn", "rf",
"global_mean", "svd", "QRILC"),
LOD = NULL,
knum = 10) {
# data("Zeybel_2022_gut")
# object = Zeybel_2022_gut
# level = "Phylum"
# group = "LiverFatClass"
# ZerosAsNA = TRUE
# RemoveNA = TRUE
# cutoff = 20
# method = "QRILC"
# data("Zeybel_2022_protein")
# object = Zeybel_2022_protein
# level = NULL
# group = "LiverFatClass"
# ZerosAsNA = TRUE
# RemoveNA = TRUE
# cutoff = 20
# method = "knn"
# knum = 10
if (base::missing(object)) {
stop("object argument is empty!")
}
if (all(!methods::is(object, "phyloseq"),
!methods::is(object, "SummarizedExperiment"))) {
stop("object is not either a phyloseq or SummarizedExperiment object.")
}
method <- match.arg(
method, c("none", "LOD", "half_min", "median",
"mean", "min", "knn", "rf",
"global_mean", "svd", "QRILC")
)
# if (!(method %in% c("none", "LOD", "half_min", "median",
# "mean", "min", "knn", "rf",
# "global_mean", "svd", "QRILC"))) {
# stop("Incorrect value for method argument!")
# }
if (base::missing(method)) {
message("method argument is empty! KNN will be used")
}
# profile: row->samples; col->features
if (all(!is.null(object), inherits(object, "phyloseq"))) {
# phyloseq object
ps <- check_sample_names(object = object)
# taxa level
if (!is.null(level)) {
ps <- aggregate_taxa(x = ps, level = level)
} else {
ps <- ps
}
## sample table & profile table
sam_tab <- phyloseq::sample_data(ps) %>%
data.frame() %>%
tibble::rownames_to_column("TempRowNames")
if (phyloseq::taxa_are_rows(ps)) {
prf_tab <- phyloseq::otu_table(phyloseq::t(ps)) %>%
data.frame()
} else {
prf_tab <- phyloseq::otu_table(ps) %>%
data.frame()
}
} else if (all(!is.null(object), inherits(object, "SummarizedExperiment"))) {
# sample table & profile table
sam_tab <- SummarizedExperiment::colData(object) %>%
data.frame() %>%
tibble::rownames_to_column("TempRowNames")
## samples->Row; features->Column
prf_tab <- SummarizedExperiment::assay(object) %>%
as.data.frame() %>%
t()
}
group_index <- which(colnames(sam_tab) == group)
samples_groups <- sam_tab[, group_index]
to_imp_data <- prf_tab %>% as.matrix()
if (ZerosAsNA) {
to_imp_data[to_imp_data == 0] <- NA
to_imp_data <- data.frame(cbind(Group = samples_groups, to_imp_data))
colnames(to_imp_data)[2:ncol(to_imp_data)] <- colnames(prf_tab)
} else {
to_imp_data <- data.frame(cbind(Group = samples_groups, to_imp_data))
colnames(to_imp_data)[2:ncol(to_imp_data)] <- colnames(prf_tab)
}
percent_na <- sum(is.na(to_imp_data))
if (percent_na == 0) {
print("No missing values detected in your data")
if (method != "none") {
method <- "none"
}
}
if (isTRUE(RemoveNA)) {
count_NA <- stats::aggregate(
. ~ Group,
data = to_imp_data,
function(x) { 100 * (sum(is.na(x)) / (sum(is.na(x)) + sum(!is.na(x))) ) },
na.action = NULL)
count_NA <- count_NA %>%
dplyr::select(-Group)
correct_names <- names(count_NA)
supress <- unlist(as.data.frame(lapply(count_NA, function(x) all(x > cutoff))))
names(supress) <- correct_names
correct_names <- names(supress[supress == "FALSE"])
depurdata <- to_imp_data[, 2:ncol(to_imp_data)][!supress]
samplename <- rownames(depurdata)
depurdata <- sapply(depurdata, function(x) as.numeric(as.character(x)))
rownames(depurdata) <- samplename
} else {
depurdata <- to_imp_data[, 2:ncol(to_imp_data)]
samplename <- rownames(depurdata)
depurdata <- sapply(depurdata, function(x) as.numeric(as.character(x)))
correct_names <- colnames(prf_tab)
rownames(depurdata) <- samplename
}
# Row->feature;Col->sample
if (method == "none") {
depurdata[is.na(depurdata)] <- 0
} else if (method == "LOD") {
if (is.null(LOD)) {
message("No LOD provided, regard one-tenth mininal value as LOD")
depurdata_withoutNA <- depurdata[!is.na(depurdata)]
LOD <- min(depurdata_withoutNA[depurdata_withoutNA != 0]) / 10
}
depurdata[is.na(depurdata)] <- LOD
depurdata[depurdata == 0] <- LOD
} else if (method == "half_min") {
depurdata <- apply(depurdata, 2, function(x) {
if(is.numeric(x)) ifelse(is.na(x), min(x, na.rm = TRUE)/2, x) else x})
} else if (method == "median") {
depurdata <- apply(depurdata, 2, function(x) {
if(is.numeric(x)) ifelse(is.na(x), median(x, na.rm = TRUE), x) else x})
} else if (method == "mean") {
depurdata <- apply(depurdata, 2, function(x) {
if(is.numeric(x)) ifelse(is.na(x), mean(x, na.rm = TRUE), x) else x})
} else if (method == "min") {
depurdata <- apply(depurdata, 2, function(x) {
if(is.numeric(x)) ifelse(is.na(x), min(x, na.rm = TRUE), x) else x})
} else if (method == "knn") {
depurdata <- t(depurdata)
datai <- impute::impute.knn(
depurdata, # Row->features; Column->samples
k = knum,
rowmax = 0.5,
colmax = 0.8,
maxp = 1500)
depurdata <- t(datai$data)
} else if (method == "rf") {
# depurdata <- data.frame(group = samples_groups, depurdata)
# depurdata <- randomForest::rfImpute(group ~ ., depurdata)
# depurdata <- depurdata %>%
# dplyr::select(-group)
fit <- missForest::missForest(t(depurdata))
depurdata <- fit$ximp %>%
t()
} else if (method == "global_mean") {
depurdata <- .GlobalMean(object = t(depurdata)) %>%
t()
} else if (method == "svd") {
depurdata <- .SVD_wrapper(depurdata)
} else if (method == "QRILC") {
fit <- log(t(depurdata)) %>%
imputeLCMD::impute.QRILC() #%>%
#extract2(1) %>% exp
depurdata <- t(fit[[1]])
}
colnames(depurdata) <- correct_names
rownames(depurdata) <- rownames(prf_tab)
if (methods::is(object, "phyloseq")) {
res <- ps
phyloseq::otu_table(res) <- phyloseq::otu_table(t(depurdata),
taxa_are_rows = TRUE)
}
if (methods::is(object, "SummarizedExperiment")) {
if (ncol(depurdata) != ncol(prf_tab)) {
rdata <- SummarizedExperiment::rowData(object)
cdata <- SummarizedExperiment::colData(object)
if (length(object@metadata) == 0) {
mdata <- NULL
} else {
mdata <- object@metadata
}
res <- import_SE(object = t(depurdata),
rowdata = rdata,
coldata = cdata,
metadata = mdata)
# res <- base::subset(res, colnames(prf_tab) %in% colnames(depurdata))
} else {
res <- object
SummarizedExperiment::assay(res) <- t(depurdata)
}
}
return(res)
}
#' Global Standard imputation
#'
#' Global Standard imputation is used the Global mean and standard as the
#' cutoff to impute the missing value. We calculate the mean for
#' all non missing values of that variable then replace missing
#' value with mean.
#'
#' @keywords internal
#' @noRd
#'
#' @param object, Object; a [`matrix`](Row->Features; Column->Samples).
#'
.GlobalMean <- function(
object,
width = 0.3,
downshift = 1.8) {
# Row->Features; Column->Samples
if (length(object[is.na(object)]) > 1) {
for (i in 1:ncol(object)) {
temp <- object[, i]
temp_sd <- width * sd(temp, na.rm = TRUE)
temp_mean <- mean(temp, na.rm = TRUE) - downshift * sd(temp, na.rm = TRUE)
n_missing <- sum(is.na(temp))
object[is.na(temp), i] <- rnorm(n_missing, mean = temp_mean, sd = temp_sd)
object_imputed <- object
}
} else {
object_imputed <- object
message("No NAs were found\n")
}
return(object_imputed)
}
#' @keywords internal
#' @noRd
.SVD_wrapper <- function(
object,
K = 5) {
data_sc_res <- .scale_recover(object, method = "scale")
data_sc <- data_sc_res[[1]]
data_sc_param <- data_sc_res[[2]]
result <- data_sc %>%
imputeLCMD::impute.wrapper.SVD(., K = K) %>%
.scale_recover(., method = "recover", param_df = data_sc_param) #%>%
#extract2(1)
res <- result[[1]]
return(res)
}
# Scale and recover -------------
#' @keywords internal
#' @noRd
.scale_recover <- function(
object,
method = "scale",
param_df = NULL) {
results <- list()
data_res <- object
if (!is.null(param_df)) {
if (method == "scale") {
data_res[] <- scale(object, center=param_df$mean, scale=param_df$std)
} else if (method=="recover") {
data_res[] <- t(t(object) * param_df$std + param_df$mean)
}
} else {
if (method == "scale") {
param_df <- data.frame(mean=apply(object, 2, function(x) mean(x, na.rm=T)),
std=apply(object, 2, function(x) sd(x, na.rm=T)))
data_res[] <- scale(object, center=param_df$mean, scale=param_df$std)
} else {stop("no param_df found for recover...")}
}
results[[1]] <- data_res
results[[2]] <- param_df
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.