#' Input validation for eBayes/DEqMS/MS-EmpiRe functions
#'
#' @param eset protein/peptide log2 intensity matrix stored as a Biobase ExpressionSet. Must describe protein_id and sample_id in metadata
#' @param model_matrix a `stats::model.matrix()` result that is supplied to `limma::lmFit()`
#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`. In MS-DAP workflows this is typically "condition"
validate_eset_fitdata = function(eset, model_matrix, model_matrix_result_prop) {
if(length(eset) != 1 || !"ExpressionSet" %in% class(eset)) {
append_log("parameter eset must be an ExpressionSet", type = "error")
}
f = Biobase::fData(eset)
if(!is.data.frame(f) || !"protein_id" %in% colnames(f) || anyNA(f$protein_id)) {
append_log('parameter eset must be an ExpressionSet with non-NA "protein_id" as a featuredata (i.e. Biobase::fData(eset)$protein_id)', type = "error")
}
p = Biobase::pData(eset)
if(!is.data.frame(p) || !"sample_id" %in% colnames(p) || anyNA(p$sample_id)) {
append_log('parameter eset must be an ExpressionSet with non-NA "sample_id" as a phenotypic data (i.e. Biobase::pData(eset)$sample_id)', type = "error")
}
if(!is.matrix(model_matrix)) {
append_log("parameter model_matrix must be a matrix", type = "error")
}
if(nrow(model_matrix) != nrow(p)) {
append_log("parameters eset and model_matrix must describe the same number of samples (i.e. number of columns/samples in ExpressionSet must match the number of rows in model_matrix)", type = "error")
}
if(length(model_matrix_result_prop) != 1 || !is.character(model_matrix_result_prop) ||
model_matrix_result_prop == "" || !model_matrix_result_prop %in% colnames(model_matrix)) {
append_log("parameter model_matrix_result_prop must be a single string, matching any of the column names in the model_matrix parameter", type = "error")
}
}
#' helper function to double-check we don't submit an ExpressionSet with too few values to DEA
#'
#' @param eset a Biobase ExpressionSet
#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`. In MS-DAP workflows this is typically "condition"
#' @param is_peptide boolean indicating peptide-level (FALSE = protein-level)
#' @param method_name label for printing error messages
#' @param min_values minimum number of data points in each condition. Alternatively, use `options(msdap.dea.minvalue = 5)`
subset_protein_eset_for_dea = function(eset, model_matrix_result_prop, is_peptide, method_name, min_values = NULL) {
# use min_values parameter if set
if(!is.null(min_values)) {
if(length(min_values) != 1 || !is.numeric(min_values) || !is.finite(min_values) || min_values < 2) {
append_log("parameter min_values must be NULL or a single integer value >= 2", type = "error")
}
} else {
# otherwise, use global option (if set)
min_values_global = getOption("msdap.dea.minvalue")
if(!is.null(min_values_global)) {
min_values = min_values_global
if(length(min_values) != 1 || !is.numeric(min_values) || !is.finite(min_values) || min_values < 2) {
append_log("options setting for 'msdap.dea.minvalue' must be NULL or a single integer value >= 2", type = "error")
}
}
}
# default to 2
if(is.null(min_values)) {
min_values = 2
}
min_values = as.integer(ceiling(min_values))
x = Biobase::exprs(eset)
ptmp = Biobase::pData(eset)
stopifnot("subset_protein_eset_for_dea invalid eset * model_matrix_result_prop combination" = model_matrix_result_prop %in% colnames(ptmp))
x_regression_var_value = unlist(ptmp[,model_matrix_result_prop]) # model_matrix_result_prop is typically "condition"
cols1 = x_regression_var_value == x_regression_var_value[1] # columns condition 1
cols2 = !cols1 # columns condition 2
# test for both NA and zeros
# (compatible with eBayes/DEqMS/MSqRob ExpressionSets that have log2 intensities,
# and with MS-EmpiRe dataset with plain intensities sans log transformation that contain zeros for missing values)
rows = rowSums(is.finite(x[,cols1,drop=FALSE]) & x[,cols1,drop=FALSE] > 0) >= min(min_values, sum(cols1)) &
rowSums(is.finite(x[,cols2,drop=FALSE]) & x[,cols2,drop=FALSE] > 0) >= min(min_values, sum(cols2))
if(sum(rows) < 10) {
append_log(sprintf("less than 10 %s have a non-zero value in both conditions: too little input data for DEA", ifelse(is_peptide, "peptides", "proteins")), type = "error")
}
# if there are any rows to remove, create a subset of the ExpressionSet + show warning (i.e. this should never occur during typical MS-DAP workflow that includes filtering)
if(any(!rows)) {
ftmp = Biobase::fData(eset)[rows,]
y = x[rows,,drop=FALSE]
eset = Biobase::ExpressionSet(
assayData = y,
featureData = Biobase::annotatedDataFrameFrom(y, byrow = T),
protocolData = Biobase::annotatedDataFrameFrom(y, byrow = F)
)
Biobase::pData(eset) = ptmp
Biobase::fData(eset) = ftmp
append_log(sprintf("DEA requires at least %d values per experimental condition; %d / %d %s were removed prior to applying %s\n(in typical MS-DAP workflows one would apply peptide-filtering upstream, e.g. with parameters in the analysis_quickstart() function)",
min_values, sum(!rows), length(rows), ifelse(is_peptide, "peptides", "proteins"), method_name), type = "warning")
}
return(eset)
}
#' Apply limma::eBayes() function to a protein-level ExpressionSet
#'
#' ref; PMID:25605792
#' ref; https://bioconductor.org/packages/release/bioc/html/limma.html
#'
#' @examples \dontrun{
#' # Minimal example:
#' # Let `m` be a log2 protein intensity matrix matrix where
#' # columns are samples and rows proteins.
#' # Let `eset` be a Biobase::ExpressionSet that contains `m`.
#' # Let `m_groups` be a character vector that describes the
#' # sample group/condition for each column in `m`.
#' # e.g. m_groups = c("WT", "WT", "WT", "KO", "KO", "KO")
#' model_matrix = stats::model.matrix(~m_groups)
#' de_ebayes(eset, model_matrix, colnames(model_matrix)[2])
#' }
#' @param eset protein-level log2 intensity matrix stored as a Biobase ExpressionSet
#' @param model_matrix a `stats::model.matrix()` result that is supplied to `limma::lmFit()`
#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`. In MS-DAP workflows this is typically "condition"
#' @export
de_ebayes = function(eset, model_matrix, model_matrix_result_prop) {
start_time = Sys.time()
# will throw an error if input data is invalid
validate_eset_fitdata(eset, model_matrix, model_matrix_result_prop)
# ensure there are at least 2 valid data points per condition / group @ model_matrix_result_prop (subset the expressionset otherwise)
eset = subset_protein_eset_for_dea(eset, model_matrix_result_prop, is_peptide = FALSE, method_name = "eBayes")
x = Biobase::exprs(eset)
fit = suppressWarnings(limma::eBayes(limma::lmFit(x, model_matrix)))
# !! sort.by="none" keeps the output table aligned with input matrix
result = suppressMessages(limma::topTable(fit, number = nrow(x), coef = model_matrix_result_prop, adjust.method = "fdr", sort.by = "none", confint = TRUE))
# note; fit$coefficients and fit$stdev.unscaled matrices contain the intercept, don't use in the ES and SE computation
# eBayes effect size: Cohen's d in limma, according to Gordon Smyth https://support.bioconductor.org/p/71747/#71781
result$effectsize = fit$coefficients[,model_matrix_result_prop] / sqrt(fit$s2.post)
# eBayes standard error, according to Gordon Smyth https://support.bioconductor.org/p/70175/
result$standarderror = sqrt(fit$s2.post) * fit$stdev.unscaled[,model_matrix_result_prop]
result$standarddeviation = sqrt(fit$s2.post)
# convert from data.frame to a tibble that follows the column names/format we expect downstream
result = as_tibble(result) %>%
mutate(protein_id = rownames(x)) %>%
select(protein_id, pvalue = P.Value, qvalue = adj.P.Val, foldchange.log2 = logFC, effectsize, tstatistic = t, standarddeviation, standarderror) %>%
add_column(dea_algorithm = "ebayes")
append_log_timestamp("eBayes", start_time)
return(result)
}
#' Apply DEqMS to a protein-level ExpressionSet
#'
#' ref; PMID:32205417
#' ref; https://github.com/yafeng/DEqMS
#'
#' implementation follows example code from the vignette; https://bioconductor.org/packages/release/bioc/vignettes/DEqMS/inst/doc/DEqMS-package-vignette.html
#'
#' @param eset protein-level log2 intensity matrix stored as a Biobase ExpressionSet. Note that it must contain protein metadata column 'npep' that holds integer peptide counts
#' @param model_matrix a `stats::model.matrix()` result that is supplied to `limma::lmFit()`
#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`. In MS-DAP workflows this is typically "condition"
#' @param doplot create a QC plot?
#' @export
de_deqms = function(eset, model_matrix, model_matrix_result_prop, doplot = FALSE) {
start_time = Sys.time()
# will throw an error if input data is invalid
validate_eset_fitdata(eset, model_matrix, model_matrix_result_prop)
# ensure there are at least 2 valid data points per condition / group @ model_matrix_result_prop (subset the expressionset otherwise)
eset = subset_protein_eset_for_dea(eset, model_matrix_result_prop, is_peptide = FALSE, method_name = "DEqMS")
# additional input validation; DEqMS requires peptide counts
tmp = Biobase::fData(eset)$npep
if(length(tmp) == 0 || !all(is.finite(tmp) & is.integer(tmp) & tmp > 0)) {
append_log("parameter eset must contain a column 'npep' (i.e. Biobase::fData(eset)) with positive integer values", type = "error")
}
rm(tmp)
# eBayes fit
fit = suppressWarnings(limma::eBayes(limma::lmFit(Biobase::exprs(eset), model_matrix)))
### bugfix for DEqMS::spectraCounteBayes()
# DEqMS will Loess fit log peptide counts versus log sigma^2
# However, in some datasets/normalizations a subset of `fit$sigma` values from limma::eBayes() fit
# Log transforming sigma^0 where sigma is zero results in a non-finite value, which crashes the loess fit
# fix: replace zero sigma's with lowest non-zero sigma (easy fix without having to update DEqMS code)
# ? perhaps it's better if DEqMS would fit against eBayes' posterior estimates of sigma instead ?
fit$sigma[!is.finite(fit$sigma) | fit$sigma <= 0] = min(fit$sigma[is.finite(fit$sigma) & fit$sigma > 0])
## peptide-per-protein counts
# note that the ebayes fit results are aligned with the intensity-value-matrix we supplied upstream,
# which in turn is aligned with the metadata in the ExpressionSet object. So we can just use its npep property as-is
fit$count = Biobase::fData(eset)$npep
# apply DEqMS, then overwrite the fit object with DEqMS's
fit = suppressWarnings(DEqMS::spectraCounteBayes(fit))
if(doplot) {
suppressWarnings(DEqMS::VarianceBoxplot(fit, n=20, main = "DEqMS QC plot", xlab="#unique peptides per protein"))
}
## analogous to our de_ebayes() implementation, extract results from the fit object but grab the DEqMS specific output columns where available (reference; DEqMS::outputResult() )
# !! sort.by="none" keeps the output table aligned with input matrix
result = suppressMessages(limma::topTable(fit, number = length(fit$count), coef = model_matrix_result_prop, adjust.method = "fdr", sort.by = "none", confint = TRUE))
# indexing by name might fail in the deqms output table for some model matrices !
if(model_matrix_result_prop %in% colnames(fit$sca.p)) {
result$pvalue = fit$sca.p[,model_matrix_result_prop]
result$tstatistic = fit$sca.t[,model_matrix_result_prop]
} else {
result$pvalue = fit$sca.p[,match(model_matrix_result_prop, colnames(fit$coefficients))] # fit$sca.p might be unnamed -> use "standard" property of similar dimensions to find index
result$tstatistic = fit$sca.t[,match(model_matrix_result_prop, colnames(fit$coefficients))]
}
result$qvalue = p.adjust(result$pvalue, method = "fdr")
## fit$coefficients and fit$stdev.unscaled contain the intercept, remove from ES and SE computation
# eBayes effect size: Cohen's d in limma, according to Gordon Smyth https://support.bioconductor.org/p/71747/#71781
result$effectsize = fit$coefficients[,model_matrix_result_prop] / sqrt(fit$sca.postvar)
# eBayes standard error, according to Gordon Smyth https://support.bioconductor.org/p/70175/
result$standarderror = sqrt(fit$sca.postvar) * fit$stdev.unscaled[,model_matrix_result_prop]
result$standarddeviation = sqrt(fit$sca.postvar)
## create a result tibble that contains all columns required for downstream compatability with this pipeline; protein_id, pvalue, qvalue, foldchange.log2, dea_algorithm
result = as_tibble(result) %>%
mutate(protein_id = rownames(fit)) %>%
select(protein_id, pvalue, qvalue, foldchange.log2 = logFC, effectsize, tstatistic, standarddeviation, standarderror) %>%
add_column(dea_algorithm = "deqms")
append_log_timestamp("DEqMS", start_time)
return(result)
}
#' MS-EmpiRe implementation, a wrapper function for msEmpiRe::de.ana()
#'
#' ref; PMID:31235637
#' ref; https://github.com/zimmerlab/MS-EmpiRe
#'
#' note; MS-EmpiRe crashes when all proteins have the same number of peptides (eg; filtering topn=1 or topn=3 and minpep=3)
#'
#' TODO: proper standarderror computation
#'
#' @param eset a Biobase ExpressionSet that contains the peptide log2 intensities
#' @param model_matrix a `stats::model.matrix()` result that is supplied to `limma::lmFit()`
#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`. In MS-DAP workflows this is typically "condition"
#' @export
de_msempire = function(eset, model_matrix, model_matrix_result_prop) {
start_time = Sys.time()
# issue with MS-EmpiRe we previously reported, including code to reproduce and resolve this; https://github.com/zimmerlab/MS-EmpiRe/issues/10
# bugfix: fix the random seed to ensure MS-EmpiRe results are always the same (given the exact same input data)
set.seed(123)
# will throw an error if input data is invalid
validate_eset_fitdata(eset, model_matrix, model_matrix_result_prop)
# MS-EmpiRe will crash if there are input values with all zeros
# ensure there are at least 2 valid data points per condition / group @ model_matrix_result_prop (subset the expressionset otherwise)
eset = subset_protein_eset_for_dea(eset, model_matrix_result_prop, is_peptide = FALSE, method_name = "MS-EmpiRe")
# importantly, MS-EmpiRe expects the input data matrix to be NOT be log2 transformed
x = Biobase::exprs(eset)
x = 2^x
x[!is.finite(x)] = 0
Biobase::exprs(eset) = x
# protein identifiers are expected in column "prot.id"
Biobase::fData(eset)["prot.id"] = Biobase::fData(eset)[, "protein_id"]
# experimental condition is expected in column "condition"
# we here use the values from the design matrix @ desired output column
Biobase::pData(eset)["condition"] = unname(model_matrix[,model_matrix_result_prop])
# call msempire main function while silencing all of its output
tmp = capture.output(result <- suppressWarnings(suppressMessages(msEmpiRe::de.ana(eset))))
append_log_timestamp("MS-EmpiRe", start_time)
tibble(
protein_id = as.character(result$prot.id), # enforce type to be robust against upstream R package changes (e.g. prot.id as a factor)
pvalue = as.numeric(result$p.val),
qvalue = as.numeric(result$p.adj),
foldchange.log2 = as.numeric(result$log2FC),
# update;
# In MS-EmpiRe, 'prot.sd' is the standard deviation of 'prot.s' (which underwent shrinkage),
# not of the protein foldchange. So here we compute the effectsize based on prot.s/prot.sd analogous to
# MS-EmpiRe's protein p-value computation.
#
# However, there seems to be a bug in MS-EmpiRe where the sign of prot.s does not always agree with log2FC !
# So this simple test fails on several datasets;
# stopifnot( (result$log2FC >= 0) == (result$prot.s / result$prot.sd >= 0) )
#
# As a workaround, we compute absolute effectsize and then apply the sign of log2FC
effectsize = abs(as.numeric(result$prot.s) / as.numeric(result$prot.sd)) * ifelse(foldchange.log2 < 0, -1, 1),
#previous# effectsize = as.numeric(result$log2FC) / as.numeric(result$prot.sd),
tstatistic = NA,
# approximate standard deviation for the log2FC from the standardized effectsize
standarddeviation = abs(foldchange.log2 / effectsize),
#previous# standarddeviation = as.numeric(result$prot.sd),
standarderror = NA,
dea_algorithm = "msempire"
) %>%
# if either log2FC or p.val is invalid, remove the protein from results
filter(is.finite(foldchange.log2) & is.finite(pvalue))
}
#' Wrapper function for msqrob, using the implementation from the msqrobsum package
#'
#' note that this is the final version of this algorithm provided by its authors, and is no longer maintained; https://github.com/statOmics/msqrob
#' only difference with the GitHub version is that code included here has minor adaptions to situationally improve multithreading (i.e. speeds up computation a bit, on some systems)
#'
#' ref; PMID:26566788 PMID:32321741
#' ref; https://github.com/statOmics/msqrob
#'
#' @param eset a Biobase ExpressionSet that contains the peptide log2 intensities
#' @param model_matrix a `stats::model.matrix()` result that is supplied to `limma::lmFit()`
#' @param model_matrix_result_prop the column name in `model_matrix` that should be returned as the resulting coefficient estimated by `eBayes()`. In MS-DAP workflows this is typically "condition"
#' @param use_peptide_model if `TRUE`, apply msqrob. if `FALSE`, apply msqrobsum
#' @param random_variables optionally, an array of column names in the sample metadata table that should be used as additional regression variables
#' @export
de_msqrobsum_msqrob = function(eset, model_matrix, model_matrix_result_prop, use_peptide_model, random_variables = NULL) {
start_time = Sys.time()
# will throw an error if input data is invalid
validate_eset_fitdata(eset, model_matrix, model_matrix_result_prop)
if(length(use_peptide_model) != 1 || !use_peptide_model %in% c(TRUE, FALSE)) {
append_log("use_peptide_model parameter must be either TRUE or FALSE", type = "error")
}
algorithm_name = ifelse(use_peptide_model, "msqrob", "msqrobsum")
# ensure there are at least 2 valid data points per condition / group @ model_matrix_result_prop (subset the expressionset otherwise)
eset = subset_protein_eset_for_dea(eset, model_matrix_result_prop, is_peptide = TRUE, method_name = algorithm_name)
# just to be sure, fix random seed
set.seed(123)
#### below text quoted from MSqRobsum manual @ https://github.com/statOmics/MSqRobSum/blob/master/vignettes/msqrobsum.Rmd
# We can also use the `msqrobsum()` function to perform a MSqRob analysis on peptide intensities without first summarizing to protein summaries.
# Because a protein can have intensities from multiple peptides and the intensities belonging to 1 peptide are correlated with eachother whe have to account for this in our model.
# Previously we only had 1 protein summary per sample but now we have multiple peptide intensities per sample and these are also correlated with eachother.
# Hence our model: `expression ~ (1|condition) + (1|sample) + (1|feature)`.
# However some proteins will only have intensities from 1 peptide and the model fitting wil fail if we try to use the model above. For these proteins we should use te reduced model `expression ~ (1|condition)`. `msqrobsum()`
# The `formulas ` parameter accepts a vector of formulas. Model fitting with the first model will be attempted first but if that fails it tries the second model and so on.
msnset = MSnbase::as.MSnSet.ExpressionSet(eset)
if (use_peptide_model) {
form_string = c(sprintf("expression ~ (1 | %s) + (1 | sample_id) + (1 | peptide_id)", model_matrix_result_prop), sprintf("expression ~ (1 | %s)", model_matrix_result_prop))
} else {
form_string = sprintf("expression ~ (1 | %s)", model_matrix_result_prop)
}
# strip forbidden random variables
random_variables = setdiff(random_variables, c("peptide_id", "protein_id", "sample_id", model_matrix_result_prop))
# compose updated formula if the user provided additional random variables
if(length(random_variables) > 0) {
form_string = sapply(form_string, function(x) paste(c(x, sprintf("(1 | %s)", random_variables)), collapse=" + "), USE.NAMES = FALSE)
}
# always add y~(1|condition) without any optional random_variables as a last-resort model if all other model specifications fail (due to lack of data) -->> then convert from string to formula/expression
form_string = unique(c(form_string, sprintf("expression ~ (1 | %s)", model_matrix_result_prop)))
form_eval = sapply(form_string, function(x) eval(parse(text=x)), USE.NAMES = FALSE)
append_log(sprintf("%s linear regression formula%s; %s", algorithm_name, ifelse(length(form_string)>1, "s (these are prioritized. eg; if a model fit fails due to lack of data, the next formula is used)", ""), paste(form_string, collapse = " , ")), type = "info")
# MSqRob model as re-implemented in msqrobsum package, by original authors, and updated by us (only difference is computational performance)
if (use_peptide_model) {
result = suppressWarnings(msqrobsum(data = msnset, formulas = form_eval, group_vars = "protein_id", contrasts = model_matrix_result_prop, mode = "msqrob"))
# result = suppressWarnings(msqrobsum(data = msnset, formulas = form_eval, group_vars = "protein_id", contrasts = "condition", mode = "msqrob"))
# our version 1, without user-specified random variables; form = c(expression ~ (1 | condition) + (1 | sample_id) + (1 | peptide_id), expression ~ (1 | condition))
} else {
### first use MSqRobSum to rollup to protein level, 'robust' approach appears to be an improvement over the traditional 'sum', then apply msqrob
# line 273 @ https://github.com/statOmics/MSqRobSum/blob/97e22fddb9d6f1d3c29aafbae28c382148b9471d/vignettes/msqrobsum.Rmd#L273
protset = suppressWarnings(suppressMessages(MSnbase::combineFeatures(msnset, fun = "robust", groupBy = Biobase::fData(msnset)$protein_id)))
# line 313 @ https://github.com/statOmics/MSqRobSum/blob/97e22fddb9d6f1d3c29aafbae28c382148b9471d/vignettes/msqrobsum.Rmd#L313
result = suppressWarnings(msqrobsum(data = protset, formulas = form_eval, group_vars = "protein_id", contrasts = model_matrix_result_prop, mode = "msqrobsum"))
# result = suppressWarnings(msqrobsum(data = protset, formulas = form_eval, group_vars = "protein_id", contrasts = "condition", mode = "msqrobsum"))
## our version 1, without user-specified random variables;
#result = suppressWarnings(msqrobsum(data = protset, formulas = expression ~ (1 | condition), group_vars = "protein_id", contrasts = "condition", mode = "msqrobsum"))
}
result_unpacked = as_tibble(result) %>%
dplyr::select(protein_id, contrasts) %>%
tidyr::unnest(cols = contrasts)
result_unpacked$sigma_post = result_unpacked$se / result_unpacked$sigma_contrast
result_unpacked$effectsize = result_unpacked$logFC / result_unpacked$sigma_post
# prepare output table in our standard format
result = as_tibble(result_unpacked) %>%
mutate(protein_id = as.character(protein_id),
dea_algorithm = algorithm_name) %>%
select(protein_id, pvalue, qvalue, foldchange.log2 = logFC, effectsize, tstatistic = t, standarddeviation = sigma_post, standarderror = se, dea_algorithm)
append_log_timestamp(algorithm_name, start_time)
return(result)
}
#' MSqRob test function for internal use
#'
#' @param peptides TODO
#' @param proteins TODO
#' @param samples TODO
#' @param main_variable TODO
#' @param ordered_variables TODO
#' @param min_values minimum number of data points in each condition. Alternatively, use `options(msdap.dea.minvalue = 5)`
de_msqrob_customformula = function(peptides, proteins, samples, main_variable = "condition", ordered_variables = c("batch", "condition"), min_values = NULL) {
start_time = Sys.time()
tmp = samples |> pull(!!main_variable)
stopifnot(!anyNA(tmp) && n_distinct(tmp) >= 2)
stopifnot(main_variable %in% ordered_variables)
eset = tibble_as_eset(peptides, proteins, samples)
# ensure there are at least 2 valid data points per condition / group (subset the expressionset otherwise)
eset = subset_protein_eset_for_dea(eset, main_variable, is_peptide = TRUE, method_name = "de_msqrob_customformula", min_values = min_values)
# just to be sure, fix random seed
set.seed(123)
msnset = MSnbase::as.MSnSet.ExpressionSet(eset)
allranvar = paste(paste0("(1 | ", ordered_variables, ")"), collapse = " + ")
form_string = c(sprintf("expression ~ %s + (1 | sample_id) + (1 | peptide_id)", allranvar), sprintf("expression ~ %s", allranvar))
# always add y~(1|condition) without any optional random_variables as a last-resort model if all other model specifications fail (due to lack of data) -->> then convert from string to formula/expression
form_string = unique(c(form_string, sprintf("expression ~ (1 | %s)", main_variable)))
form_eval = sapply(form_string, function(x) eval(parse(text=x)), USE.NAMES = FALSE)
append_log(paste("de_msqrob_customformula;", paste(form_string, collapse = " , ")), type = "info")
result = suppressWarnings(msqrobsum(data = msnset, formulas = form_eval, group_vars = "protein_id", contrasts = main_variable, mode = "msqrob"))
########################################################### analogous to main function ###########################################################
result_unpacked = as_tibble(result) %>%
dplyr::select(protein_id, contrasts) %>%
tidyr::unnest(cols = contrasts)
result_unpacked$sigma_post = result_unpacked$se / result_unpacked$sigma_contrast
result_unpacked$effectsize = result_unpacked$logFC / result_unpacked$sigma_post
# prepare output table in our standard format
result = as_tibble(result_unpacked) %>%
mutate(protein_id = as.character(protein_id),
dea_algorithm = "msqrob") %>%
select(protein_id, pvalue, qvalue, foldchange.log2 = logFC, effectsize, tstatistic = t, standarddeviation = sigma_post, standarderror = se, dea_algorithm)
append_log_timestamp("msqrob", start_time)
########################################################### analogous to main function ###########################################################
return(result)
}
#' Perform linear regression with limma
#'
#' @description
#' Please refer to the vignette on custom limma models at the MS-DAP GitHub repository for a walk-through with examples.
#'
#' @param x result from `get_protein_matrix(... , include_npep = TRUE)`. Alternatively, a protein log2 intensity matrix (in this case, also provide the `npep` parameter if you want to apply DEqMS)
#' @param npep optionally, the number of peptides-per-protein for each row in input protein matrix. Only needed when providing a matrix for parameter `x` (i.e. you can leave this NULL when `x` is output from `get_protein_matrix(... , include_npep = TRUE)`)
#' @param model_matrix result from `stats::model.matrix()`
#' @param contrasts contrasts defined with `limma::makeContrasts()` that should be extracted from the linear regression model
#' @param limma_block_variable option passed to `limma::lmFit()` to fit block design. To skip (default), set NULL. Otherwise, this should be a character vector of the same length as nrow(model_matrix) so it describes a block/group per respective sample in the model/design matrix
#' @param ebayes_trend option passed to `limma::eBayes()`. Defaults to `FALSE` (same as limma default)
#' @param ebayes_robust option passed to `limma::eBayes()`. Defaults to `FALSE` (same as limma default)
#' @param deqms apply DEqMS? Defaults to `TRUE`. If `FALSE`, returns `limma::eBayes()` result as-is
#' @param return_table if `TRUE`, the default, returns a table with protein_id, log2fc, pvalue, adjusted pvalue, contrast (i.e. this is the output from MS-DAP function `limma_fit_extract_stats()`). If `FALSE`, returns the limma fit object as-is
#' @export
limma_wrapper = function(
x = protein_data,
npep = NULL,
model_matrix,
contrasts,
limma_block_variable = NULL,
ebayes_trend = FALSE,
ebayes_robust = FALSE,
deqms = TRUE,
return_table = TRUE
) {
mat = NULL
### input validation
# test that we got the proper data formats (matrix and vector)
if(!is.matrix(x)) {
if(!(is.list(x) && all(c("matrix", "npep") %in% names(x)) && is.matrix(x$matrix) &&
(is.vector(x$npep) || is.array(x$npep)) && is.numeric(x$npep) && length(x$npep) == nrow(x$matrix))) {
append_log("x parameter should either be a matrix, or a list that contains a 'matrix' and 'npep' parameter (where npep is an integer vector of same length as nrow(x$matrix)). Typically this is the output from MS-DAP function: get_protein_matrix(... , include_npep = TRUE)", type = "error")
}
if(!is.null(npep)) {
append_log("when parameter x is a list (i.e. input from function get_protein_matrix(... , include_npep = TRUE)), leave the npep parameter empty", type = "error")
}
mat = x$matrix
npep = as.vector(x$npep)
} else {
if(!((is.vector(npep) || is.array(npep)) && is.numeric(npep) && length(npep) == nrow(x))) {
append_log("when parameter x is a matrix, the npep parameter should be an integer vector of same length as nrow(x)", type = "error")
}
mat = x
npep = as.vector(npep)
}
# test that the matrix is numeric and that npep only contains finite values
if(mode(mat) != "numeric") {
append_log("matrix must be numeric (you can check with the mode() function)", type = "error")
}
if(!all(is.finite(npep) & npep > 0)) {
append_log("npep must only contain finite integers, larger than zero", type = "error")
}
# finally, check simple parameters
if(!is.matrix(model_matrix)) {
append_log("parameter model_matrix must be a matrix, i.e. output from stats::model.matrix()", type = "error")
}
if(!is.matrix(contrasts)) {
append_log("parameter contrasts must be a matrix, i.e. output from limma::makeContrasts()", type = "error")
}
if(!is.null(limma_block_variable) && (
anyNA(limma_block_variable) || !is.character(limma_block_variable) ||
any(limma_block_variable == "") || length(limma_block_variable) != nrow(model_matrix))
) {
append_log("parameter limma_block_variable must be a character vector of the same length as nrow(model_matrix) so it describes a block/group per respective sample in the model/design matrix. Empty strings and NA not allowed", type = "error")
}
if(length(ebayes_trend) != 1 || !ebayes_trend %in% c(TRUE, FALSE)) {
append_log("parameter ebayes_trend must be either TRUE or FALSE", type = "error")
}
if(length(ebayes_robust) != 1 || !ebayes_robust %in% c(TRUE, FALSE)) {
append_log("parameter ebayes_robust must be either TRUE or FALSE", type = "error")
}
if(length(deqms) != 1 || !deqms %in% c(TRUE, FALSE)) {
append_log("parameter deqms must be either TRUE or FALSE", type = "error")
}
if(length(return_table) != 1 || !return_table %in% c(TRUE, FALSE)) {
append_log("parameter return_table must be either TRUE or FALSE", type = "error")
}
### input validation
### prepare data
# enforce integer type. use ceiling so 0.1 becomes 1 (i.e. we never have zeros)
npep = as.integer(ceiling(npep))
### apply limma
# adapted from limma documentation; https://bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf , section 9.7 "Multi-level Experiments"
fit = NULL
if(is.null(limma_block_variable)) {
fit = limma::lmFit(mat, model_matrix)
} else {
corfit = limma::duplicateCorrelation(mat, model_matrix, block = limma_block_variable)
append_log(sprintf('applying a limma block design: %.3f correlation between samples within the same "block"\n', corfit$consensus), type = "info")
fit = limma::lmFit(mat, model_matrix, block = limma_block_variable, correlation = corfit$consensus)
}
fit2 = limma::contrasts.fit(fit, contrasts)
fit2 = limma::eBayes(fit2, trend = ebayes_trend, robust = ebayes_robust)
### apply DEqMS, analogous to msdap::de_deqms()
if(deqms) {
# bugfix for DEqMS::spectraCounteBayes(), see
fit2$sigma[!is.finite(fit2$sigma) | fit2$sigma <= 0] = min(fit2$sigma[is.finite(fit2$sigma) & fit2$sigma > 0])
# add peptide-per-protein counts to the limma fit object, as required for DEqMS
fit2$count = npep
fit2 = suppressWarnings(DEqMS::spectraCounteBayes(fit2))
append_log("applying limma eBayes(), followed by DEqMS (to adjust protein confidences according to their respective number of peptides)", type = "info")
} else {
append_log("returning limma eBayes() results as-is (i.e. no DEqMS post-hoc correction)", type = "info")
}
if(return_table) {
return(limma_fit_extract_stats(fit2))
} else {
return(fit2)
}
}
#' extract limma fit results
#'
#' @param fit result from `limma::lmFit()`
#' @param contrast_name name of a specific contrast(s) to extract. Default value, `NULL`, will return results from all contrasts
limma_fit_extract_stats = function(fit, contrast_name = NULL) {
all_contrasts = setdiff(colnames(fit$coefficients), c("intercept", "Intercept", "(intercept)", "(Intercept"))
# naive check for limma result
if(!is.list(fit) || !all(c("coefficients", "sigma", "s2.post", "stdev.unscaled") %in% names(fit)) ) {
append_log("fit parameter must be the result from limma::eBayes() (e.g. obtained from MS-DAP function limma_wrapper() )", type = "error")
}
if(is.null(contrast_name)) {
contrast_name = all_contrasts
} else {
if(anyNA(all_contrasts) || !is.character(contrast_name) || any(!contrast_name %in% all_contrasts)) {
append_log(paste("some values in contrast_name are not found in the provided fit parameter. Available contrasts in fit:", paste(all_contrasts, collapse = ",")), type = "error")
}
}
result = NULL
for(contr in contrast_name) {
x = tibble::as_tibble(limma::topTable(fit, number = Inf, coef = contr, adjust.method = "fdr", sort.by = "none", confint = TRUE))
x$protein_id = rownames(fit)
x$contrast = contr
x$foldchange.log2 = x$logFC
# test if the deqms R package was applied to this limma::eBayes() fit/result
is_deqms = "sca.p" %in% names(fit)
if(is_deqms) {
## DEqMS code analogous to msdap:::de_deqms()
# indexing by name might fail in the deqms output table for some model matrices !
if(contr %in% colnames(fit$sca.p)) {
x$pvalue = fit$sca.p[,contr]
} else {
x$pvalue = fit$sca.p[,match(contr, colnames(fit$coefficients))] # fit$sca.p might be unnamed -> use "standard" property of similar dimensions to find index
}
x$qvalue = p.adjust(x$pvalue, method = "fdr") # in this loop, we subsetted a specific contrast so can safely call p.adjust()
# analogous to standard ebayes fit @ below else block, but note that we use "sca" parameters added by DEqMS !
x$effectsize = fit$coefficients[,contr] / sqrt(fit$sca.postvar)
x$standarderror = sqrt(fit$sca.postvar) * fit$stdev.unscaled[,contr]
x$standarddeviation = sqrt(fit$sca.postvar)
} else {
## eBayes code analogous to msdap:::de_ebayes()
x$pvalue = x$P.Value
x$qvalue = x$adj.P.Val
x$effectsize = fit$coefficients[,contr] / sqrt(fit$s2.post)
x$standarderror = sqrt(fit$s2.post) * fit$stdev.unscaled[,contr]
x$standarddeviation = sqrt(fit$s2.post)
}
result = bind_rows(
result,
x %>% select(protein_id, contrast, foldchange.log2, effectsize, standarddeviation, standarderror, pvalue, qvalue) %>% mutate_all(unname)
)
}
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.