#' Calculate and merge features for a list of CF-MS matrices
#'
#' Score each protein pair in a series of CF-MS experiments, using a measure of
#' association of choice, then merge the calculated features into a single
#' data frame that contains scores for each pair across all of the input
#' experiments. This data frame can then be provided to a classifier as input,
#' alongside a set of 'gold-standard' interacting pairs, in order to score
#' interactions in a manner that integrates data from multiple CF-MS replicates.
#' Note that the final data frame may contain missing values; to replace them,
#' use the \link{impute_missing_features} function.
#'
#' @param mats a list of CF-MS matrices, with proteins in rows and fractions in
#'   columns
#' @param metric the measure of association to use in scoring protein pairs;
#'   must match one of the choices supplied by \code{metrics()}
#'
#' @return a data frame containing features for all protein pairs across all
#'   replicates; the pair is identified by the \code{protein_A} and
#'   \code{protein_B} columns. The value is returned visibly.
#'
#' @importFrom purrr map
#' @importFrom tidyr drop_na
#' @importFrom reshape2 melt
#' @importFrom dplyr filter
#' @importFrom magrittr %>%
#'
#' @export
calculate_features <- function(mats, metric = metrics()) {
  metric <- match.arg(metric)

  # first, score the pairs within each replicate
  pairs <- map(mats, score_pairs, metric = metric)

  # then reshape each score matrix into a long data frame with one row per
  # scored pair: rows with missing values are dropped, and only rows where
  # protein_A sorts before protein_B are kept (this removes self-pairs and
  # the mirrored copy of each pair)
  to_pair_df <- function(scores) {
    melt(scores,
         varnames = c('protein_A', 'protein_B'),
         value.name = metric,
         as.is = TRUE) %>%
      drop_na() %>%
      filter(protein_A < protein_B)
  }
  feature_dfs <- map(pairs, to_pair_df)

  # finally, merge the per-replicate features; this call is the last
  # expression (not an assignment) so that the result is returned visibly
  merge_features(feature_dfs)
}
#' Merge features across multiple replicates
#'
#' Combine the features extracted from several replicates into one data frame
#' suitable as classifier input. Protein pairs that were not scored in every
#' replicate yield missing values in the merged data frame; use the
#' \link{impute_missing_features} function to replace them.
#'
#' @param feature_dfs a list of feature data frames, each of which has protein
#'   pairs in the first two columns. Alternatively, a list made up entirely of
#'   square matrices can be provided, in which case every matrix is first
#'   coerced into a feature data frame.
#'
#' @return a data frame containing features for all protein pairs across all
#'   replicates
#'
#' @importFrom purrr map map_lgl
#' @importFrom reshape2 melt
#' @importFrom dplyr full_join filter
#' @importFrom tidyr drop_na
#' @importFrom tester is_square_matrix
#'
#' @export
merge_features <- function(feature_dfs) {
  # reshape one square matrix into a long pairwise data frame, dropping rows
  # with missing values and keeping only rows with protein_A < protein_B
  as_pair_df <- function(mat) {
    long <- melt(mat,
                 varnames = c('protein_A', 'protein_B'),
                 value.name = 'feature',
                 as.is = TRUE)
    filter(drop_na(long), protein_A < protein_B)
  }

  # the conversion is only applied when every input is a square matrix
  if (all(map_lgl(feature_dfs, tester::is_square_matrix))) {
    feature_dfs <- map(feature_dfs, as_pair_df)
  }

  # fold the list with successive full joins, keyed on the first two columns
  # of the data frame accumulated so far
  join_on_pair <- function(merged, nxt) {
    pair_cols <- colnames(merged)[c(1, 2)]
    full_join(merged, nxt, by = pair_cols)
  }
  Reduce(join_on_pair, feature_dfs)
}
#' Impute missing features with the median, plus or minus some random noise
#'
#' Replace missing data within each numeric column of a data frame with
#' the column median, plus or minus some random noise, in order to train
#' classifiers that do not readily ignore missing data (e.g. random forests or
#' support vector machines). Infinite values are treated as missing and are
#' replaced as well. Non-numeric columns are left untouched.
#'
#' @param dat the feature data frame for which to replace missing data
#' @param noise_pct the standard deviation of the random normal
#'   distribution from which to draw added noise, expressed as a
#'   fraction of the standard deviation of the non-missing values in each
#'   column (the default of 0.05 corresponds to 5\%)
#'
#' @return a data frame with missing values in each numeric column replaced
#'   by the column median, plus or minus some random noise. If a column has
#'   fewer than two finite values, its standard deviation is undefined and the
#'   median is used without noise; a column with no finite values at all
#'   cannot be imputed and is left as \code{NA}.
#'
#' @importFrom stats rnorm median sd
#'
#' @export
impute_missing_features <- function(dat, noise_pct = 0.05) {
  for (col_name in colnames(dat)) {
    column <- dat[[col_name]]
    if (!is.numeric(column)) {
      next
    }

    # NA, NaN and +/-Inf are all treated as missing; blank them out so they
    # do not contribute to the median or the standard deviation
    missing <- !is.finite(column)
    column[missing] <- NA

    centre <- median(column, na.rm = TRUE)
    spread <- sd(column, na.rm = TRUE)
    # sd() is NA when fewer than two finite values remain; without this
    # fallback the NA would propagate and nothing would be imputed
    if (is.na(spread)) {
      spread <- 0
    }

    dat[[col_name]][missing] <-
      centre + rnorm(sum(missing)) * spread * noise_pct
  }
  dat
}
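# NOTE: the two lines below are residue from a web page, not R code; they are
# kept as comments so that the file parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.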