#' Process MS data: clean, normalize and summarize before differential analysis
#'
#' @param raw name of the raw (input) data set.
#' @param logTrans base of logarithm transformation: 2 (default) or 10.
#' @param normalization normalization to remove systematic bias between MS runs.
#' There are three different normalizations supported:
#' 'equalizeMedians' (default) represents constant normalization (equalizing the medians)
#' based on reference signals is performed.
#' 'quantile' represents quantile normalization based on reference signals
#' 'globalStandards' represents normalization with global standards proteins.
#' If FALSE, no normalization is performed.
#' @param nameStandards optional vector of global standard peptide names.
#' Required only for normalization with global standard peptides.
#' @param featureSubset "all" (default) uses all features that the data set has.
#' "top3" uses top 3 features which have highest average of log-intensity across runs.
#' "topN" uses top N features which has highest average of log-intensity across runs.
#' It needs the input for n_top_feature option.
#' "highQuality" flags uninformative feature and outliers.
#' @param remove_uninformative_feature_outlier optional. Only required if
#' featureSubset = "highQuality". TRUE allows to remove
#' 1) noisy features (flagged in the column feature_quality with "Uninformative"),
#' 2) outliers (flagged in the column, is_outlier with TRUE,
#' before run-level summarization. FALSE (default) uses all features and intensities
#' for run-level summarization.
#' @param min_feature_count optional. Only required if featureSubset = "highQuality".
#' Defines a minimum number of informative features a protein needs to be considered
#' in the feature selection algorithm.
#' @param n_top_feature optional. Only required if featureSubset = 'topN'.
#' It that case, it specifies number of top features that will be used.
#' Default is 3, which means to use top 3 features.
#' @param summaryMethod "TMP" (default) means Tukey's median polish,
#' which is robust estimation method. "linear" uses linear mixed model.
#' @param equalFeatureVar only for summaryMethod = "linear". default is TRUE.
#' Logical variable for whether the model should account for heterogeneous variation
#' among intensities from different features. Default is TRUE, which assume equal
#' variance among intensities from features. FALSE means that we cannot assume equal
#' variance among intensities from features, then we will account for heterogeneous
#' variation from different features.
#' @param censoredInt Missing values are censored or at random.
#' 'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
#' '0' uses zero intensities as censored intensity.
#' In this case, NA intensities are missing at random.
#' The output from Skyline should use '0'.
#' Null assumes that all NA intensites are randomly missing.
#' @param MBimpute only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
#' TRUE (default) imputes 'NA' or '0' (depending on censoredInt option)
#' by Accelated failure model. FALSE uses the values assigned by cutoffCensored.
#' @param remove50missing only for summaryMethod = "TMP". TRUE removes the runs
#' which have more than 50\% missing values. FALSE is default.
#' @param maxQuantileforCensored Maximum quantile for deciding censored missing values, default is 0.999
#' @param fix_missing Optional, same as the `fix_missing` parameter in MSstatsConvert::MSstatsBalancedDesign function
#' @param numberOfCores Number of cores for parallel processing. When > 1,
#' a logfile named `MSstats_dataProcess_log_progress.log` is created to
#' track progress. Only works for Linux & Mac OS. Default is 1.
#' @inheritParams .documentFunction
#'
#' @importFrom utils sessionInfo
#' @importFrom data.table as.data.table
#'
#' @export
#'
#' @examples
#' # Consider a raw data (i.e. SRMRawData) for a label-based SRM experiment from a yeast study
#' # with ten time points (T1-T10) of interests and three biological replicates.
#' # It is a time course experiment. The goal is to detect protein abundance changes
#' # across time points.
#' head(SRMRawData)
#' # Log2 transformation and normalization are applied (default)
#' QuantData<-dataProcess(SRMRawData, use_log_file = FALSE)
#' head(QuantData$FeatureLevelData)
#' # Log10 transformation and normalization are applied
#' QuantData1<-dataProcess(SRMRawData, logTrans=10, use_log_file = FALSE)
#' head(QuantData1$FeatureLevelData)
#' # Log2 transformation and no normalization are applied
#' QuantData2<-dataProcess(SRMRawData,normalization=FALSE, use_log_file = FALSE)
#' head(QuantData2$FeatureLevelData)
#'
dataProcess = function(
raw, logTrans = 2, normalization = "equalizeMedians", nameStandards = NULL,
featureSubset = "all", remove_uninformative_feature_outlier = FALSE,
min_feature_count = 2, n_top_feature = 3, summaryMethod = "TMP",
equalFeatureVar = TRUE, censoredInt = "NA", MBimpute = TRUE,
remove50missing = FALSE, fix_missing = NULL, maxQuantileforCensored = 0.999,
use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL,
numberOfCores = 1
) {
MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
log_file_path,
base = "MSstats_dataProcess_log_")
getOption("MSstatsLog")("INFO", "MSstats - dataProcess function")
.checkDataProcessParams(
logTrans, normalization, nameStandards,
list(method = featureSubset, n_top = n_top_feature,
remove_uninformative = remove_uninformative_feature_outlier),
list(method = summaryMethod, equal_var = equalFeatureVar),
list(symbol = censoredInt, MB = MBimpute))
peptides_dict = makePeptidesDictionary(as.data.table(unclass(raw)), normalization)
input = MSstatsPrepareForDataProcess(raw, logTrans, fix_missing)
input = MSstatsNormalize(input, normalization, peptides_dict, nameStandards)
input = MSstatsMergeFractions(input)
input = MSstatsHandleMissing(input, summaryMethod, MBimpute,
censoredInt, maxQuantileforCensored)
input = MSstatsSelectFeatures(input, featureSubset, n_top_feature,
min_feature_count)
.logDatasetInformation(input)
getOption("MSstatsLog")("INFO",
"== Start the summarization per subplot...")
getOption("MSstatsMsg")("INFO",
" == Start the summarization per subplot...")
processed = getProcessed(input)
input = MSstatsPrepareForSummarization(input, summaryMethod, MBimpute, censoredInt,
remove_uninformative_feature_outlier)
summarized = tryCatch(MSstatsSummarizeWithMultipleCores(input, summaryMethod,
MBimpute, censoredInt,
remove50missing, equalFeatureVar,
numberOfCores),
error = function(e) {
print(e)
NULL
})
getOption("MSstatsLog")("INFO",
"== Summarization is done.")
getOption("MSstatsMsg")("INFO",
" == Summarization is done.")
output = MSstatsSummarizationOutput(input, summarized, processed,
summaryMethod, MBimpute, censoredInt)
output
}
#' Feature-level data summarization with multiple cores
#'
#' @param input feature-level data processed by dataProcess subfunctions
#' @param method summarization method: "linear" or "TMP"
#' @param equal_variance only for summaryMethod = "linear". Default is TRUE.
#' Logical variable for whether the model should account for heterogeneous variation
#' among intensities from different features. Default is TRUE, which assume equal
#' variance among intensities from features. FALSE means that we cannot assume
#' equal variance among intensities from features, then we will account for
#' heterogeneous variation from different features.
#' @param censored_symbol Missing values are censored or at random.
#' 'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
#' '0' uses zero intensities as censored intensity.
#' In this case, NA intensities are missing at random.
#' The output from Skyline should use '0'.
#' Null assumes that all NA intensites are randomly missing.
#' @param remove50missing only for summaryMethod = "TMP". TRUE removes the runs
#' which have more than 50\% missing values. FALSE is default.
#' @param impute only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
#' TRUE (default) imputes 'NA' or '0' (depending on censoredInt option) by Accelated failure model.
#' FALSE uses the values assigned by cutoffCensored
#' @param numberOfCores Number of cores for parallel processing. When > 1,
#' a logfile named `MSstats_dataProcess_log_progress.log` is created to
#' track progress. Only works for Linux & Mac OS. Default is 1.
#'
#' @importFrom parallel makeCluster parLapply stopCluster clusterExport
#'
#' @return list of length one with run-level data.
#'
MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_symbol,
remove50missing, equal_variance, numberOfCores = 1) {
if (numberOfCores > 1) {
protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
num_proteins = length(protein_indices)
function_environment = environment()
cl = parallel::makeCluster(numberOfCores)
parallel::clusterExport(cl, c("MSstatsSummarizeSingleTMP",
"MSstatsSummarizeSingleLinear",
"input", "impute", "censored_symbol",
"remove50missing", "protein_indices",
"equal_variance"),
envir = function_environment)
cat(paste0("Number of proteins to process: ", num_proteins),
sep = "\n", file = "MSstats_dataProcess_log_progress.log")
if (method == "TMP") {
summarized_results = parallel::parLapply(cl, seq_len(num_proteins), function(i) {
if (i %% 100 == 0) {
cat("Finished processing an additional 100 proteins",
sep = "\n", file = "MSstats_dataProcess_log_progress.log", append = TRUE)
}
single_protein = input[protein_indices[[i]],]
MSstatsSummarizeSingleTMP(
single_protein, impute, censored_symbol, remove50missing)
})
} else {
summarized_results = parallel::parLapply(cl, seq_len(num_proteins), function(i) {
if (i %% 100 == 0) {
cat("Finished processing an additional 100 proteins",
sep = "\n", file = "MSstats_dataProcess_log_progress.log", append = TRUE)
}
single_protein = input[protein_indices[[i]],]
MSstatsSummarizeSingleLinear(single_protein, equal_variance)
})
}
parallel::stopCluster(cl)
return(summarized_results)
} else {
input_split = split(input, input$PROTEIN)
return(MSstatsSummarize(input_split, method, impute, censored_symbol,
remove50missing, equal_variance))
}
}
#' Feature-level data summarization
#'
#' @param proteins_list list of processed feature-level data
#' @param method summarization method: "linear" or "TMP"
#' @param equal_variance only for summaryMethod = "linear". Default is TRUE.
#' Logical variable for whether the model should account for heterogeneous variation
#' among intensities from different features. Default is TRUE, which assume equal
#' variance among intensities from features. FALSE means that we cannot assume
#' equal variance among intensities from features, then we will account for
#' heterogeneous variation from different features.
#' @param censored_symbol Missing values are censored or at random.
#' 'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
#' '0' uses zero intensities as censored intensity.
#' In this case, NA intensities are missing at random.
#' The output from Skyline should use '0'.
#' Null assumes that all NA intensites are randomly missing.
#' @param remove50missing only for summaryMethod = "TMP". TRUE removes the runs
#' which have more than 50\% missing values. FALSE is default.
#' @param impute only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
#' TRUE (default) imputes 'NA' or '0' (depending on censoredInt option) by Accelated failure model.
#' FALSE uses the values assigned by cutoffCensored
#'
#' @importFrom data.table uniqueN
#' @importFrom utils setTxtProgressBar
#'
#' @return list of length one with run-level data.
#'
#' @export
#'
#' @examples
#' raw = DDARawData
#' method = "TMP"
#' cens = "NA"
#' impute = TRUE
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
#' input = MSstatsMergeFractions(input)
#' input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
#' input = MSstatsSelectFeatures(input, "all")
#' processed = getProcessed(input)
#' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
#' input_split = split(input, input$PROTEIN)
#' summarized = MSstatsSummarize(input_split, method, impute, cens, FALSE, TRUE)
#' length(summarized) # list of summarization outputs for each protein
#' head(summarized[[1]][[1]]) # run-level summary
#'
MSstatsSummarize = function(proteins_list, method, impute, censored_symbol,
remove50missing, equal_variance) {
num_proteins = length(proteins_list)
summarized_results = vector("list", num_proteins)
if (method == "TMP") {
pb = utils::txtProgressBar(min = 0, max = num_proteins, style = 3)
for (protein_id in seq_len(num_proteins)) {
single_protein = proteins_list[[protein_id]]
summarized_results[[protein_id]] = MSstatsSummarizeSingleTMP(
single_protein, impute, censored_symbol, remove50missing)
setTxtProgressBar(pb, protein_id)
}
close(pb)
} else {
pb = utils::txtProgressBar(min = 0, max = num_proteins, style = 3)
for (protein_id in seq_len(num_proteins)) {
single_protein = proteins_list[[protein_id]]
summarized_result = MSstatsSummarizeSingleLinear(single_protein,
equal_variance)
summarized_results[[protein_id]] = summarized_result
setTxtProgressBar(pb, protein_id)
}
close(pb)
}
summarized_results
}
#' Linear model-based summarization for a single protein
#'
#' @param single_protein feature-level data for a single protein
#' @param equal_variances if TRUE, observation are assumed to be homoskedastic
#'
#' @return list with protein-level data
#'
#' @importFrom stats xtabs
#'
#' @export
#'
#' @examples
#' raw = DDARawData
#' method = "linear"
#' cens = NULL
#' impute = FALSE
#' # currently, MSstats only supports MBimpute = FALSE for linear summarization
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
#' input = MSstatsMergeFractions(input)
#' input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
#' input = MSstatsSelectFeatures(input, "all")
#' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
#' input_split = split(input, input$PROTEIN)
#' single_protein_summary = MSstatsSummarizeSingleLinear(input_split[[1]])
#' head(single_protein_summary[[1]])
#'
MSstatsSummarizeSingleLinear = function(single_protein, equal_variances = TRUE) {
ABUNDANCE = RUN = FEATURE = PROTEIN = LogIntensities = NULL
label = data.table::uniqueN(single_protein$LABEL) > 1
single_protein = single_protein[!is.na(ABUNDANCE)]
single_protein[, RUN := factor(RUN)]
single_protein[, FEATURE := factor(FEATURE)]
counts = xtabs(~ RUN + FEATURE,
data = unique(single_protein[, list(FEATURE, RUN)]))
counts = as.matrix(counts)
is_single_feature = .checkSingleFeature(single_protein)
fit = try(.fitLinearModel(single_protein, is_single_feature, is_labeled = label,
equal_variances), silent = TRUE)
if (inherits(fit, "try-error")) {
msg = paste("*** error : can't fit the model for ", unique(single_protein$PROTEIN))
getOption("MSstatsLog")("WARN", msg)
getOption("MSstatsMsg")("WARN", msg)
result = NULL
} else {
cf = summary(fit)$coefficients[, 1]
result = unique(single_protein[, list(Protein = PROTEIN, RUN = RUN)])
log_intensities = get_linear_summary(single_protein, cf,
counts, label)
result[, LogIntensities := log_intensities]
}
list(result)
}
#' Tukey Median Polish summarization for a single protein
#'
#' @param single_protein feature-level data for a single protein
#' @inheritParams MSstatsSummarize
#'
#' @return list of two data.tables: one with fitted survival model,
#' the other with protein-level data
#'
#' @importFrom stats predict
#'
#' @export
#'
#' @examples
#' raw = DDARawData
#' method = "TMP"
#' cens = "NA"
#' impute = TRUE
#' # currently, MSstats only supports MBimpute = FALSE for linear summarization
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
#' input = MSstatsMergeFractions(input)
#' input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
#' input = MSstatsSelectFeatures(input, "all")
#' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
#' input_split = split(input, input$PROTEIN)
#' single_protein_summary = MSstatsSummarizeSingleTMP(input_split[[1]],
#' impute, cens, FALSE)
#' head(single_protein_summary[[1]])
#'
MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
remove50missing) {
newABUNDANCE = n_obs = n_obs_run = RUN = FEATURE = LABEL = NULL
predicted = censored = NULL
cols = intersect(colnames(single_protein), c("newABUNDANCE", "cen", "RUN",
"FEATURE", "ref"))
single_protein = single_protein[(n_obs > 1 & !is.na(n_obs)) &
(n_obs_run > 0 & !is.na(n_obs_run))]
if (nrow(single_protein) == 0) {
return(list(NULL, NULL))
}
single_protein[, RUN := factor(RUN)]
single_protein[, FEATURE := factor(FEATURE)]
if (impute & any(single_protein[["censored"]])) {
survival_fit = .fitSurvival(single_protein[LABEL == "L", cols,
with = FALSE])
single_protein[, predicted := predict(survival_fit,
newdata = .SD)]
single_protein[, predicted := ifelse(censored & (LABEL == "L"), predicted, NA)]
single_protein[, newABUNDANCE := ifelse(censored & LABEL == "L",
predicted, newABUNDANCE)]
survival = single_protein[, c(cols, "predicted"), with = FALSE]
} else {
survival = single_protein[, cols, with = FALSE]
survival[, predicted := NA]
}
single_protein = .isSummarizable(single_protein, remove50missing)
if (is.null(single_protein)) {
return(list(NULL, NULL))
} else {
single_protein = single_protein[!is.na(newABUNDANCE), ]
is_labeled = nlevels(single_protein$LABEL) > 1
result = .runTukey(single_protein, is_labeled, censored_symbol,
remove50missing)
}
list(result, survival)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.