#' Mark specified values as missing
#'
#' Replaces all values in the exprs part that equal the specified value with NA.
#' For example, vendor software might use 0 or 1 to signal a missing value,
#' which is not understood by R.
#'
#' @param object a MetaboSet object
#' @param value the value to be converted to NA
#'
#' @return MetaboSet object as the one supplied, with missing values correctly set to NA
#'
#' @examples
#' nas_marked <- mark_nas(merged_sample, value = 0)
#'
#' @export
mark_nas <- function(object, value) {
ex <- exprs(object)
ex[ex == value] <- NA
exprs(object) <- ex
object
}
#' Replace NA values in pheno data columns with "QC"
#'
#' Loops through specified columns in pheno data and replaces all NA values with "QC". Also transforms
#' the column to factor and sets "QC" as the last level.
#'
#' @param data a data frame such as pheno_data returned by \code{\link{read_from_excel}}
#' @param cols the columns to fix
#'
#' @return a data frame with the column modified
#'
#' @examples
#' # Remove QC labels first
#' pheno_data <- pData(merged_sample)
#' pheno_data$Group[pheno_data$Group == "QC"] <- NA
#' head(pheno_data$Group, 20)
#' pheno_data <- mark_qcs(pheno_data, cols = "Group")
#' head(pheno_data$Group, 20)
#'
#' @export
mark_qcs <- function(data, cols) {
for (col in cols) {
data[col] <- as.character(data[, col])
data[is.na(data[, col]), col] <- "QC"
data[col] <- factor(data[, col])
good_levels <- levels(data[, col])
good_levels <- c(good_levels[good_levels != "QC"], "QC")
data[col] <- factor(as.character(data[, col]), levels = good_levels)
}
data
}
#' Drop QC samples
#'
#' @param object a MetaboSet object
#'
#' @return MetaboSet object as the one supplied, without QC samples
#'
#' @examples
#' dim(merged_sample)
#' noqc <- drop_qcs(merged_sample)
#' dim(noqc)
#'
#' @importFrom Biobase pData "pData<-"
#'
#' @export
drop_qcs <- function(object) {
object <- object[, object$QC != "QC"]
pData(object) <- droplevels(pData(object))
object
}
#' Drop flagged features
#'
#' Removes all features that have been flagged by quality control functions.
#' Only features that do not have a flag (Flag == NA) are retained.
#'
#' @param object a MetaboSet object
#' @param all_features logical, should all features be retained? Mainly used by internal functions
#'
#' @examples
#' dim(merged_sample)
#' flagged <- flag_quality(merged_sample)
#' noflags <- drop_flagged(flagged)
#' dim(noflags)
#'
#' @export
drop_flagged <- function(object, all_features = FALSE) {
if (!all_features) {
object <- object[is.na(flag(object)), ]
}
object
}
#' Partially replace exprs field with new values
#'
#' Replaces a subset of data in exprs of an object by new values.
#' Used after an operation such as imputation is computed only for a subset of features or samples
#' such as only good-quality features that have not been flagged.
#' This function is mainly used internally, but can be useful.
#'
#' @param object a MetaboSet object
#' @param y matrix containing new values to be merged into exprs
#'
#' @return a MetaboSet object with the new exprs values
#'
#' @examples
#' ex_set <- merged_sample[1:5, 1:5]
#' exprs(ex_set)
#' # Create a matrix of replacment values for rows 1, 3, 5 and columns 1, 3, 4
#' replacement <- matrix(1:9, ncol = 3,
#' dimnames = list(featureNames(ex_set)[c(1, 3, 5)],
#' sampleNames(ex_set)[c(1, 3, 4)]))
#' replacement
#' merged <- merge_exprs(ex_set, replacement)
#' exprs(merged)
#'
#' @export
merge_exprs <- function(object, y) {
# Colnames and rownames should be found in the object
if (!all(colnames(y) %in% colnames(exprs(object))) | is.null(colnames(y))) {
stop("Column names of y do not match column names of exprs(object)")
}
if(!all(rownames(y) %in% rownames(exprs(object))) | is.null(rownames(y))) {
stop("Row names of y do not match row names of exprs(object)")
}
exprs_tmp <- exprs(object)
exprs_tmp[rownames(y), colnames(y)] <- y
exprs(object) <- exprs_tmp
object
}
#' Impute missing values using random forest
#'
#' Impute the missing values in the exprs part of the object using a
#' random forest. The estimated error in the imputation is logged.
#' It is recommended to set the seed number for reproducibility
#' (it is called random forest for a reason).
#' This a wrapper around \code{missForest::missForest}.
#' Use parallelize = "variables" to run in parallel for faster testing.
#' NOTE: running in parallel prevents user from setting a seed number.
#' \strong{CITATION:} When using this function, cite the \code{missForest} package
#'
#' @param object a MetaboSet object
#' @param all_features logical, should all features be used? If FALSE (the default), flagged features are removed before imputation.
#' @param ... passed to MissForest function
#'
#' @return MetaboSet object as the one supplied, with missing values imputed.
#'
#' @examples
#' missing <- mark_nas(example_set, 0)
#' set.seed(38)
#' imputed <- impute_rf(missing)
#'
#' @seealso \code{\link[missForest]{missForest}} for detail about the algorithm and the parameters
#'
#' @export
impute_rf <- function(object, all_features = FALSE, ...) {
if (!requireNamespace("missForest", quietly = TRUE)) {
stop("Package \"missForest\" needed for this function to work. Please install it.",
call. = FALSE)
}
# Start log
log_text(paste("\nStarting random forest imputation at", Sys.time()))
# Drop flagged features
dropped <- drop_flagged(object, all_features)
if (!requireNamespace("missForest", quietly = TRUE)) {
stop("missForest package not found")
}
# Impute missing values
mf <- missForest::missForest(xmis = t(exprs(dropped)), ...)
imputed <- t(mf$ximp)
# Log imputation error
log_text(paste0("Out-of-bag error in random forest imputation: ",
round(mf$OOBerror, digits = 3), "\n"))
# Assign imputed data to the droppped
rownames(imputed) <- rownames(exprs(dropped))
colnames(imputed) <- colnames(exprs(dropped))
# Attach imputed abundances to object
object <- merge_exprs(object, imputed)
log_text(paste("Random forest imputation finished at", Sys.time()))
object
}
#' Simple imputation
#'
#' Impute missing values using a simple imputation strategy. All missing values
#' of a feature are imputed with the same value. It is possible
#' to only impute features with a large number of missing values this way. This can be useful
#' for using this function before random forest imputation to speed things up.
#' The imputation strategies available are:
#'
#' \itemize{
#' \item a numeric value: impute all missing values in all features with the same value, e.g. 1
#' \item "min": impute missing values of a feature with the minimum observed value of that feature
#' \item "half_min": impute missing values of a feature with half the minimum observed value of that feature
#' \item "small_random": impute missing values of a feature with random numbers between 0 and the
#' minimum of that feature (uniform distribution, remember to set the seed number!).
#' }
#'
#' @param object a MetaboSet object
#' @param value the value used for imputation, either a numeric or one of "min", "half_min", "small_random",
#' see above
#' @param na_limit only impute features with the proportion of NAs over this limit. For example, if
#' \code{na_limit = 0.5}, only features with at least half of the values missing are imputed.
#'
#' @examples
#' missing <- mark_nas(merged_sample, 0)
#' imputed <- impute_simple(missing, value = "min")
#'
#' @export
impute_simple <- function(object, value, na_limit = 0) {
imp <- exprs(object)
nas <- apply(imp, 1, prop_na)
imp <- imp[nas > na_limit, ]
if (nrow(imp) == 0) {
warning("none of the features satisfy the NA limit, returning the original object")
return(object)
}
# Replace all missing values with the given constant
if (is.numeric(value)) {
imp[is.na(imp)] <- value
} else if (value == "min") {
imp <- t(apply(imp, 1, function(x){
x[is.na(x)] <- finite_min(x)
x
}))
} else if (value == "half_min") {
imp <- t(apply(imp, 1, function(x){
x[is.na(x)] <- finite_min(x) / 2
x
}))
} else if (value == "small_random") {
imp <- t(apply(imp, 1, function(x){
x[is.na(x)] <- runif(n = sum(is.na(x)), min = 0, max = finite_min(x))
x
}))
} else {
stop('value should be a numeric value or one of "min", "half_min", "small_random"')
}
obj <- merge_exprs(object, imp)
obj
}
#' Inverse-rank normalization
#'
#' Applies inverse rank normalization to all features to approximate
#' a normal distribution.
#'
#' @param object a MetaboSet object
#'
#' @return MetaboSet object as the one supplied, with normalized features
#'
#' @examples
#' normalized <- inverse_normalize(merged_sample)
#'
#' @export
inverse_normalize <- function(object) {
exprs(object) <- exprs(object) %>%
apply(1, function(x) {
qnorm((rank(x, na.last="keep")-0.5) / sum(!is.na(x)))}) %>%
t()
object
}
#' A report of flagged features
#'
#' Computes the number of features at each stage of flagging for each mode.
#'
#' @param object a MetaboSet object
#'
#' @return data frame with the number of features at each stage of flagging
#'
#' @examples
#' flagged <- merged_sample %>%
#' mark_nas(0) %>%
#' flag_detection() %>%
#' flag_quality()
#' flag_report(flagged)
#'
#' @export
flag_report <- function(object) {
splits <- sort(unique(fData(object)$Split))
report <- data.frame()
flag(object)[is.na(flag(object))] <- "Kept"
for (split in splits) {
tmp <- object[fData(object)$Split == split, ]
report_row <- flag(tmp) %>% table %>% as.matrix() %>% t()
report_row <- data.frame(Split = split, report_row)
if (is.null(report_row$Kept)) {
report_row$Kept <- 0
}
report_row$Total <- nrow(tmp)
report_row$Flagged <- report_row$Total - report_row$Kept
report <- suppressWarnings(dplyr::bind_rows(report, report_row))
}
report
}
# ---------- Logarithms ----------
#' Logarithm
#'
#' Log-transforms the exprs part of a MetaboSet object. Shortcust log2 and log10 also implemented.
#' For more information, see \code{\link{log}}
#'
#' @export
setMethod("log", "MetaboSet", function(x, base = exp(1)) {
exprs(x) <- log(exprs(x), base = base)
x
})
#' @rdname log-MetaboSet-method
#' @export
setMethod("log2", "MetaboSet", function(x) {
exprs(x) <- log2(exprs(x))
x
})
#' @rdname log-MetaboSet-method
#' @export
setMethod("log10", "MetaboSet", function(x) {
exprs(x) <- log10(exprs(x))
x
})
# scale
setGeneric("scale")
#' Scale exprs data
#'
#' Applies the base R function scale to transposed exprs matrix. See ?scale for details
#' @export
setMethod("scale", "MetaboSet", function(x, center = TRUE, scale = TRUE) {
exprs(x) <- t(scale(t(exprs(x)), center = center, scale = scale))
x
})
#' Exponential function
#'
#' Apply the exponential function to feature abundances (exprs)
#'
#' @param object a MetaboSet object
#' @param base base of the exponential
#'
#' @return a MetaboSet object with altered feature abundances
#'
#' @export
setGeneric("exponential", signature = "object",
function(object, base = exp(1)) standardGeneric("exponential"))
#' @export
setMethod("exponential", c(object = "MetaboSet"),
function(object, base = exp(1)) {
exprs(object) <- base^exprs(object)
object
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.