#' Aggregate peptide abundances to protein abundances
#'
#' Similar to the openMS module ProteinQuantifier, this function provides different
#' methods to aggregate peptide intensities to their parent proteins. It is mainly
#' intended for the use with (raw) Diffacto results, a table of peptide intensities
#' and covariation scores (weights) that can be used to filter peptides before aggregating
#' them up to protein abundances.
#'
#' @importFrom dplyr %>%
#' @importFrom dplyr mutate
#' @importFrom dplyr summarise
#' @importFrom dplyr .data
#' @importFrom dplyr group_by
#' @importFrom dplyr all_of
#' @importFrom dplyr across
#' @importFrom tidyr separate_rows
#' @importFrom tidyr pivot_longer
#' @importFrom tidyr pivot_wider
#' @importFrom rlang .data
#'
#' @param data the input data frame
#' @param sample_cols (character) columns to be used for peptide aggregation
#' @param protein_col (character) column containing unique protein IDs/names
#' @param peptide_col (character) column containing unique peptide IDs/sequences
#' @param n_protein_col (character) column containing number of proteins annotated for this peptide.
#' THis column indicates ambiguous peptides whose abundance are shared between n proteins.
#' @param split_ambiguous (logical) if those protein groups should be split into individual proteins or not
#' @param split_char (character) character by which to split protein groups
#' @param weight_col (character) the column containing weights or covariance scores
#' @param weight_threshold (numeric) covariance score (weight) cutoff, Diffacto's default is 0.5
#' @param method (character) aggregation method, one of ('sum', 'weightedsum', 'mean', 'weightedmean', 'wgeomean').
#' The default is 'sum'
#'
#' @return a data frame with aggregated protein intensities, one protein at a row
#'
#' @examples
#' # load additional dependencies
#' library(dplyr)
#' library(tidyr)
#'
#' # generate data frame
#' df <- data.frame(
#' protein = c("A", "B", "C", "C/D", "C/D/E", "E", "F", "G"),
#' n_protein = c(1,1,1,2,3,1,1,1),
#' weight = rep(1,8),
#' peptide = letters[1:8],
#' ab1 = sample(1:100, 8),
#' ab2 = sample(1:100, 8),
#' ab3 = sample(1:100, 8)
#' )
#'
#' aggregate_pep(
#' data = df,
#' sample_cols = c("ab1", "ab2", "ab3"),
#' protein_col = "protein",
#' peptide_col = "peptide",
#' n_protein_col = "n_protein",
#' split_ambiguous = TRUE,
#' split_char = "/",
#' method = "sum"
#' )
#'
#' @export
aggregate_pep <- function(
data,
sample_cols,
protein_col,
peptide_col,
n_protein_col = NULL,
split_ambiguous = FALSE,
split_char = NULL,
weight_col = NULL,
weight_threshold = 0.5,
method = "sum"
) {
# methods for peptide aggregation to proteins
methods = list(
sum = function(x, weight = NULL) sum(x, na.rm = TRUE),
weightedsum = function(x, weight) sum(x*weight, na.rm = TRUE),
mean = function(x, weight = NULL) mean(x, na.rm = TRUE),
weightedmean = function(x, weight) weighted.mean(x, weight, na.rm = TRUE),
wgeomean = function(x, weight) exp(weighted.mean(log(x), weight, na.rm = TRUE))
)
# optional preprocessing: should abundance of ambiguous peptides
# be shared among two or more proteins? The default is to remove ambiguous
# peptides (openMS ProteinQuantifier). But a simple way to include them
# is to evenly disitribute ambiguous petide abundance between proteins.
# More elaborate algorithms exist in the literature, but this problem usually
# plays a minor roll in microbes (fewer homologs, no splicing variants)
if (split_ambiguous) {
if (is.character(split_char) & is.character(n_protein_col)) {
data <- data %>%
separate_rows({{protein_col}}, sep = split_char) %>%
mutate(across(sample_cols, function(x) x / .data[[n_protein_col]]))
} else {
stop("arguments 'split_char' and 'n_protein_col' must be character")
}
}
# filter peptides by weight threshold
if (!is.null(weight_col)) {
data <- filter(data, .data[[weight_col]] >= weight_threshold)
}
# add dummy column with unity weights if not supplied
if (is.null(weight_col)) {
weight_col = "weight"
data <- mutate(data, weight = 1)
}
print(sample_cols)
# rearrange data frame to long format, in order to summarise
data %>% pivot_longer(all_of(sample_cols), names_to = "sample", values_to = "intensity") %>%
# group by protein ID and sample
group_by(sample, .data[[protein_col]]) %>%
# summarise by applying method of choice
summarise(
n_peptides = length(unique(.data[[peptide_col]])),
intensity = do.call(methods[[method]], list(get("intensity"), get(weight_col)))
) %>%
# spread samples on columns again
pivot_wider(names_from = "sample", values_from = "intensity") %>%
# change order to peptide abundances as last
select(all_of(c(protein_col, "n_peptides", sample_cols)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.