#' calculate median correlations
#'
#' Given a correlation matrix and optionally the sample class information,
#' calculates the median correlations of each sample to all other samples in
#' the same class. May be useful for determining outliers.
#'
#' @param cor_matrix the sample - sample correlations
#' @param sample_classes the sample classes as a character or factor
#' @param between_classes should the between class correlations be evaluated?
#'
#' @return data.frame
#' @export
#'
#' @details The data.frame may have 5 columns, first three are always present,
#' the second two come up if \code{between_classes = TRUE}:
#' \describe{
#' \item{med_cor}{the median correlation with other samples}
#' \item{sample_id}{the sample id, either the rowname or an index}
#' \item{sample_class}{the class of the sample. If not provided, set to "C1"}
#' \item{compare_class}{the class of the other sample}
#' \item{plot_class}{\code{sample_class::compare_class} for easy grouping}
#' }
#'
median_correlations <- function(cor_matrix, sample_classes = NULL, between_classes = FALSE){
stopifnot(nrow(cor_matrix) == ncol(cor_matrix))
n_sample <- nrow(cor_matrix)
if (is.null(sample_classes)) {
use_classes <- factor(rep("C1", n_sample))
} else if (is.factor(sample_classes)) {
use_classes <- sample_classes
} else {
sample_classes <- factor(sample_classes)
use_classes <- sample_classes
}
if (is.null(rownames(cor_matrix))) {
stopifnot(rownames(cor_matrix) == colnames(cor_matrix))
sample_id <- paste0("S", seq(1, n_sample))
rownames(cor_matrix) <- colnames(cor_matrix) <- sample_id
} else {
sample_id <- rownames(cor_matrix)
}
split_classes <- split(sample_id, use_classes)
names(use_classes) <- sample_id
sample_median_cor <- lapply(sample_id, function(in_sample){
#message(in_sample)
sample_loc <- which(colnames(cor_matrix) %in% in_sample)
sample_cor <- cor_matrix[sample_loc, -sample_loc]
cor_by_class <- lapply(split_classes, function(class_ids){
#message(class_ids[1])
class_samples <- setdiff(class_ids, in_sample)
if (length(class_samples) > 0) {
med_cor = median(sample_cor[class_samples])
} else {
med_cor = NA
}
data.frame(sample_id = in_sample,
med_cor = med_cor,
sample_class = as.character(use_classes[in_sample]),
compare_class = as.character(unique(use_classes[class_ids])),
stringsAsFactors = FALSE)
})
cor_by_class <- do.call(rbind, cor_by_class)
})
sample_median_cor <- do.call(rbind, sample_median_cor)
if (between_classes) {
sample_median_cor$plot_class <- paste0(sample_median_cor$sample_class, "::", sample_median_cor$compare_class)
} else {
sample_median_cor <- sample_median_cor[(sample_median_cor$sample_class == sample_median_cor$compare_class), ]
sample_median_cor$compare_class <- NULL
}
rownames(sample_median_cor) <- NULL
sample_median_cor
}
#' calculate median class correlations
#'
#' Given a correlation matrix the sample class information,
#' calculates the median correlations of the samples within
#' the class and between classes.
#'
#' @param cor_matrix the sample - sample correlations
#' @param sample_classes the sample classes as a character or factor
#'
#' @return matrix
#' @export
#'
#'
median_class_correlations <- function(cor_matrix, sample_classes = NULL){
stopifnot(nrow(cor_matrix) == ncol(cor_matrix))
n_sample <- nrow(cor_matrix)
if (is.null(sample_classes)) {
stop("You didn't provide sample classes to work with.")
}
if (is.null(rownames(cor_matrix))) {
stopifnot(rownames(cor_matrix) == colnames(cor_matrix))
sample_id <- paste0("S", seq(1, n_sample))
rownames(cor_matrix) <- colnames(cor_matrix) <- sample_id
} else {
sample_id <- rownames(cor_matrix)
}
names(sample_classes) = rownames(cor_matrix)
sample_classes = sort(sample_classes)
all_comp = combn(names(sample_classes), 2)
comp_df = data.frame(s1 = all_comp[1, ],
s2 = all_comp[2, ],
class1 = sample_classes[all_comp[1, ]],
class2 = sample_classes[all_comp[2, ]],
cor = NA)
for (irow in seq_len(nrow(comp_df))) {
comp_df[irow, "cor"] = cor_matrix[comp_df[irow, "s1"],
comp_df[irow, "s2"]]
}
med_df = comp_df %>%
dplyr::group_by(class1, class2) %>%
dplyr::summarise(median = median(cor), .groups = "keep") %>%
dplyr::ungroup() %>%
data.frame()
out_classes = unique(sample_classes)
out_median = matrix(NA, nrow = length(out_classes),
ncol = length(out_classes))
rownames(out_median) = colnames(out_median) = out_classes
for (irow in seq_len(nrow(med_df))) {
class1 = med_df[irow, "class1"]
class2 = med_df[irow, "class2"]
out_median[class1, class2] = med_df[irow, "median"]
}
out_median
}
# calculates if something is an outlier
.calc_outlier <- function(data, n_trim, n_sd, remove_missing){
outlier_data <- apply(data, 1, function(x){
is_bad <- is.infinite(x) | is.na(x) | is.nan(x)
if (length(remove_missing) > 0) {
all_bad <- is_bad | (x %in% remove_missing)
} else {
all_bad <- is_bad
}
y <- x[!all_bad]
n_y <- length(y)
y <- sort(y)
y_start <- n_trim + 1
y_end <- n_y - (n_trim + 1)
if ((y_end <= y_start) || (y_end <= 0)) {
y_start <- 1
y_end <- n_y
}
y <- y[y_start:y_end]
n_y <- length(y)
if (n_y >= 3) {
y_mean <- mean(y)
y_sd <- sd(y)
y_lo <- y_mean - (n_sd * y_sd)
y_hi <- y_mean + (n_sd * y_sd)
x_out <- !((x >= y_lo) & (x <= y_hi))
# set those things that were outliers & bad in beginning to FALSE so
# we don't overestimate the outlier proportion
x_out[!(x_out & !all_bad)] <- FALSE
} else {
x_out <- rep(FALSE, length(x))
}
x_out
})
return(outlier_data)
}
#' fraction of outliers
#'
#' Calculates the fraction of entries in each sample that are more than \code{X}
#' standard deviations from the trimmed mean. See Details.
#'
#' @param data the data matrix (samples are columns, rows are features)
#' @param sample_classes the sample classes
#' @param n_trim how many features to trim at each end (default is 3)
#' @param n_sd how many SD before treated as outlier (default is 5)
#' @param remove_missing what missing values be removed before calculating? (default is NA)
#'
#' @details Based on the Gerlinski paper \href{https://dx.doi.org/10.1093/bioinformatics/btv425}{link}
#' for each feature (in a sample class), take the range across all the samples,
#' remove the \code{n_trim} lowest and highest values, and calculate the \code{mean}
#' and \code{sd}, and the actual upper and lower ranges of \code{n_sd} from the
#' \code{mean}. For each sample and feature, determine if \emph{within} or \emph{outside}
#' that limit. Fraction is reported as the number of features outside the range.
#'
#' Returns a \code{data.frame} with:
#' \describe{
#' \item{sample_id}{the sample id, \code{rownames} are used if available, otherwise
#' this is an index}
#' \item{sample_class}{the class of the sample if \code{sample_classes} were provided,
#' otherwise given a default of "C1"}
#' \item{frac}{the actual outlier fraction calculated for that sample}
#' }
#'
#' @export
#' @return data.frame
outlier_fraction <- function(data, sample_classes = NULL, n_trim = 3,
n_sd = 5, remove_missing = NA){
n_sample <- ncol(data)
if (is.null(sample_classes)) {
use_classes <- factor(rep("C1", n_sample))
} else if (is.factor(sample_classes)) {
use_classes <- sample_classes
} else {
sample_classes <- factor(sample_classes)
use_classes <- sample_classes
}
split_classes <- split(seq(1, n_sample), use_classes)
if (is.null(colnames(data))) {
sample_names <- paste0("S", seq(1, n_sample))
} else {
sample_names <- colnames(data)
}
frac_outlier_class <- lapply(names(split_classes), function(class_name){
class_index <- split_classes[[class_name]]
is_outlier <- .calc_outlier(data[, class_index, drop = FALSE], n_trim, n_sd, remove_missing)
if (nrow(is_outlier) != nrow(data)) {
is_outlier = t(is_outlier)
}
frac_outlier <- colSums(is_outlier) / nrow(data)
data.frame(sample_id = sample_names[class_index], sample_class = class_name, frac = frac_outlier)
})
frac_outlier <- do.call(rbind, frac_outlier_class)
frac_outlier
}
#' determine outliers
#'
#' @param median_correlations median correlations
#' @param outlier_fraction outlier fractions
#' @param cor_weight how much weight for the correlation score?
#' @param frac_weight how much weight for the outlier fraction?
#' @param only_high should only things at the low end of score be removed?
#'
#' @details For outlier sample detection, one should
#' first generate median correlations using
#' `median_correlations`, and outlier fractions using
#' `outlier_fraction`. If you only have one or the other,
#' than you should use named arguments to only pass the one
#' or the other.
#'
#' Alternatively, you can change the weighting used for median correlations
#' or outlier fraction, including setting them to 0.
#'
#' @export
#' @return data.frame
determine_outliers = function(median_correlations = NULL, outlier_fraction = NULL,
cor_weight = 1, frac_weight = 1, only_high = TRUE){
if (!is.null(median_correlations) && !is.null(outlier_fraction)) {
full_data = dplyr::left_join(median_correlations, outlier_fraction, by = "sample_id", suffix = c(".cor", ".frac"))
if ((nrow(full_data) != nrow(median_correlations)) || (nrow(full_data) != nrow(outlier_fraction))) {
stop("Samples in median_correlations and outlier_fraction don't match up!")
}
if (!all(full_data$sample_class.cor == full_data$sample_class.frac)) {
stop("The sample classes (sample_class) from median_correlations and outlier_fraction are NOT all the same!")
}
full_data$sample_class = full_data$sample_class.cor
full_data$sample_class.cor = NULL
full_data$sample_class.frac = NULL
}
if (is.null(outlier_fraction) && !is.null(median_correlations)) {
full_data = median_correlations
full_data$frac = 0
}
if (!is.null(outlier_fraction) && is.null(median_correlations)) {
full_data = outlier_fraction
full_data$med_cor = 0
}
data_score = (cor_weight * log(1 - full_data$med_cor)) + (frac_weight * log1p(full_data$frac))
names(data_score) = full_data$sample_id
split_score = split(data_score, full_data$sample_class)
out_each = purrr::map(split_score, function(in_scores){
score_out = boxplot.stats(in_scores)$out
names(in_scores)[in_scores %in% score_out]
})
all_out = unlist(out_each)
full_data$score = data_score
full_data$outlier = FALSE
full_data$outlier[full_data$sample_id %in% all_out] = TRUE
if (only_high) {
split_data = split(full_data, full_data$sample_class)
full_data = purrr::map(split_data, \(in_data){
mean_score = mean(in_data$score)
wrong_side = in_data |>
dplyr::filter(score < mean_score, outlier) |>
dplyr::pull(sample_id)
in_data$outlier[in_data$sample_id %in% wrong_side] = FALSE
in_data
}) |>
dplyr::bind_rows()
}
full_data
}
#' grp_cor_data
#'
#' Example data used for demonstrating \code{median correlation}. A list with
#' 2 named entries:
#'
#' \describe{
#' \item{data}{a data matrix with 100 rows and 20 columns}
#' \item{class}{a character vector of 20 entries denoting classes}
#' }
#'
#' The data comes from two groups of samples, where there is ~0.85 correlation
#' within each group, and ~0.53 correlation between groups.
#'
#' @format List with 2 entries, data and class
#' @source Robert M Flight
"grp_cor_data"
#' grp_exp_data
#'
#' Example data that requires log-transformation before doing PCA or other QC.
#' A list with 2 named entries:
#'
#' \describe{
#' \item{data}{a data matrix with 1000 rows and 20 columns}
#' \item{class}{a character vector of 20 entries denoting classes}
#' }
#'
#' The data comes from two groups of samples, where there is ~0.80 correlation
#' within each group, and ~0.38 correlation between groups.
#'
#' @format List with 2 entries, data and class
#' @source Robert M Flight
"grp_exp_data"
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.