#' Calculate the default features used to predict interactions in PrInCE
#'
#' Calculate the six features that are used to discriminate interacting and
#' non-interacting protein pairs based on co-elution profiles in PrInCE,
#' namely: raw Pearson R value, cleaned Pearson R value, raw Pearson P-value,
#' Euclidean distance, co-peak, and co-apex. Optionally, one or more of these
#' can be disabled.
#'
#' @param profile_matrix a numeric matrix of co-elution profiles, with proteins
#' in rows, or a \code{\linkS4class{MSnSet}} object
#' @param gaussians a list of Gaussian mixture models fit to the profile matrix
#' by \code{link{build_gaussians}}
#' @param min_pairs minimum number of overlapping fractions between any given
#' protein pair to consider a potential interaction
#' @param pearson_R_raw if true, include the Pearson correlation (R) between
#' raw profiles as a feature
#' @param pearson_R_cleaned if true, include the Pearson correlation (R) between
#' cleaned profiles as a feature
#' @param pearson_P if true, include the P-value of the Pearson correlation
#' between raw profiles as a feature
#' @param euclidean_distance if true, include the Euclidean distance between
#' cleaned profiles as a feature
#' @param co_peak if true, include the 'co-peak score' (that is, the distance,
#' in fractions, between the single highest value of each profile) as a feature
#' @param co_apex if true, include the 'co-apex score' (that is, the minimum
#' Euclidean distance between any pair of fit Gaussians) as a feature
#' @param max_euclidean_quantile very high Euclidean distance values are trimmed
#' to avoid numerical precision issues; values above this quantile will be
#' replaced with the value at this quantile (default: \code{0.9})
#'
#' @return a data frame containing the calculated features for all possible
#' protein pairs
#'
#' @importFrom stats cor dist
#' @importFrom MSnbase exprs
#' @importFrom methods is
#' @importFrom Hmisc rcorr
#' @importFrom purrr map map_int
#' @importFrom dplyr filter
#'
#' @export
calculate_features <- function(profile_matrix, gaussians,
min_pairs = 0,
pearson_R_raw = TRUE,
pearson_R_cleaned = TRUE,
pearson_P = TRUE,
euclidean_distance = TRUE,
co_peak = TRUE,
co_apex = TRUE,
n_pairs = FALSE,
max_euclidean_quantile = 0.9
) {
if (is(profile_matrix, "MSnSet")) {
profile_matrix <- exprs(profile_matrix)
}
# replace missing values with near-zero noise
cleaned <- clean_profiles(profile_matrix, impute_NA = FALSE, smooth = FALSE,
noise_floor = 0.05)
proteins <- rownames(cleaned)
n_proteins <- length(proteins)
# create features list
feature_matrices <- list()
## cor_R_raw, cor_R_cleaned, cor_P, eucl, co_peak, CA)
# calculate Pearson correlation and P-value distances
pairs <- crossprod(t(!is.na(profile_matrix)))
if (pearson_R_raw) {
cor_R_raw <- suppressWarnings(
1 - cor(t(profile_matrix), use = 'pairwise.complete.obs'))
feature_matrices[["cor_R_raw"]] <- cor_R_raw
}
if (pearson_R_cleaned) {
cor_R_cleaned <- 1 - cor(t(cleaned))
feature_matrices[["cor_R_cleaned"]] <- cor_R_cleaned
}
if (pearson_P) {
cor_P <- suppressWarnings(
rcorr(t(profile_matrix))$P)
feature_matrices[["cor_P"]] <- cor_P
}
# calculate Euclidean distance
if (euclidean_distance) {
eucl <- as.matrix(dist(cleaned, method = 'euclidean'))
feature_matrices[["euclidean_distance"]] <- eucl
}
# calculate co-peak distance
if (co_peak) {
maxes <- apply(cleaned, 1, which.max)
co_peak <- as.matrix(dist(maxes))
feature_matrices[["co_peak"]] <- co_peak
}
# calculate co-apex (Gaussian) score
if (co_apex) {
CA <- co_apex(gaussians, proteins)
feature_matrices[["co_apex"]] <- CA
}
# add number of pairs to the feature matrix list
feature_matrices[["n_pairs"]] <- pairs
## make sure all dimensions are identical
if (!all(map_int(feature_matrices, nrow) == n_proteins) |
!all(map_int(feature_matrices, ncol) == n_proteins))
stop("at least one feature matrix did not have correct dimensions")
if (length(feature_matrices) == 0)
stop("no features were calculated")
first <- feature_matrices[[1]]
tri <- upper.tri(first)
idxs <- which(tri, arr.ind = TRUE)
dat <- data.frame(protein_A = rownames(first)[idxs[, 1]],
protein_B = rownames(first)[idxs[, 2]],
stringsAsFactors = FALSE)
dat <- cbind(dat, map(feature_matrices, ~ .[tri]))
## cap Euclidean distance at the 99.5th %ile
if ("euclidean_distance" %in% colnames(dat)) {
vec = dat$euclidean_distance
threshold = quantile(vec, probs = max_euclidean_quantile, na.rm = T)
dat$euclidean_distance[vec >= threshold] <- threshold
}
# filter features based on n_pairs
dat <- filter(dat, n_pairs >= min_pairs)
# remove n_pairs from the feature list
if (!n_pairs) {
dat$n_pairs <- NULL
}
return(dat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.