#' Multivariate outlier detection
#'
#' Detect multivariante outliers using Mahalanobis distance using mean and covariance estimated either with standard or robust methods.
#'
#' @param data matrix of data
#' @param robust use robust covariance method, defaults to \code{FALSE}
#' @param ... arguments passed to \code{MASS::cov.rob()}
#'
#' @details
#' The distance follow a chisq distrubtion under the null with standard method for mean and covariance. It is approximate if the robust method is used. So use \code{qchisq(p = 0.999 , df = k)} to get cutoff to keep 99.9\% of samples under the null for data with \code{k=2} columns.
#'
#' @return \code{data.frame} storing chisq and z-score for each entry indicating deviation from the mean. The z-score is computed by evaluating the p-value of chisq statistic and converting it into a z-score
#'
#' @examples
#' data <- matrix(rnorm(200), 100, 2)
#'
#' res <- outlier(data)
#'
#' res[1:4,]
#' @importFrom MASS cov.rob
#' @importFrom stats cov mahalanobis pchisq prcomp qnorm
#' @export
outlier <- function(data, robust = FALSE, ...) {
if (robust) {
res <- cov.rob(data, ...)
mu <- res$center
C <- res$cov
} else {
# means
mu <- colMeans2(data, useNames=FALSE)
# covariance
C <- cov(data)
}
# Mahalanobis distance
d = mahalanobis(data, mu, C)
# P-values based on d ~ chisq(k)
pOut <- pchisq(d, ncol(data), lower.tail=FALSE)
z <- qnorm(pOut/2, lower.tail=FALSE)
df = data.frame(chisq = d, z = z, pValue = pOut)
rownames(df) = rownames(data)
df
}
#' Outlier analysis for each assay
#'
#' Compute outlier score for each sample in each assay using \code{outlier()} run on the top principal components. Mahalanobis distance is used for outlier detect and multivariate normal assumption is used to compute p-values
#'
#' @param object \code{dreamletProcessedData} from \code{processAssays()}
#' @param assays assays / cell types to analyze
#' @param nPC number of PCs to uses for outlier score with \code{outlier()}
#' @param robust use robust covariance method, defaults to \code{FALSE}
#' @param ... arguments passed to \code{MASS::cov.rob()}
#'
#' @return
#' \itemize{
#' \item{\code{ID}:}{sample identifier}
#' \item{\code{assay}:}{specify assay}
#' \item{\code{PCs}:}{principal components}
#' \item{\code{chisq}:}{mahalanobis distance that is distributed as chisq(k) k = nPC if the data is multivariate gaussian}
#' \item{\code{z}:}{z-score corresponding to the chisq distance}
#' }
#'
#' @examples
#' library(muscat)
#' library(SingleCellExperiment)
#'
#' data(example_sce)
#'
#' # create pseudobulk for each sample and cell cluster
#' pb <- aggregateToPseudoBulk(example_sce,
#' assay = "counts",
#' cluster_id = "cluster_id",
#' sample_id = "sample_id",
#' verbose = FALSE
#' )
#'
#' # voom-style normalization
#' res.proc <- processAssays(pb, ~group_id)
#'
#' # Compute PCs and outlier scores
#' outlierByAssay( res.proc, c("B cells", "CD14+ Monocytes"))
#
#' @importFrom irlba prcomp_irlba
#' @seealso \code{outlier()}
#' @export
outlierByAssay = function(object, assays = names(object), nPC=2, robust = FALSE, ...){
stopifnot(is(object, "list"))
df = lapply(assays, function(id){
# get normalized expression
# Y <- assay(object, id)$E
if( is(object[[id]], "EList") ){
Y <- object[[id]]$E
}else{
Y <- object[[id]]
}
# PCA
# dcmp <- prcomp(scale(t(Y)))
if( nPC < min(dim(Y))/2 ){
dcmp = prcomp_irlba(t(Y), n = nPC, center=TRUE, scale.=TRUE)
}else{
dcmp = prcomp(t(Y), center=TRUE, scale.=TRUE)
}
# outlier analysis on first 2 PCs
# there are already scaled by the variance
U = dcmp$x[,seq(nPC),drop=FALSE]
# d = dcmp$sdev[seq(nPC)]
# evalute outliers on scaled PCs
df_outlier <- outlier(U, robust = robust, ...)
data.frame(ID = colnames(Y),
assay = id,
# return scaled PCs, same as U %*% diag(1/d)
scale(U),
df_outlier) %>%
as_tibble
})
bind_rows(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.