#' Filter compounds from mass spectrometry data
#'
#' Filters mass spectrometry data using a set of criteria, described in
#' \code{Details}. Returns an object of classes \code{\link{msDat}} and
#' \code{filterMS}.
#'
#' @param msObj An object class \code{\link{msDat}}. Note that this includes
#' objects created by the functions \code{binMS} and \code{msDat}.
#'
#' @param region A vector either of mode character or mode numeric. If numeric
#' then the entries should provide the indices for the region of interest in
#' the mass spectrometry data provided as the argument for \code{msObj}. If
#' character then the entries should uniquely specify the region of interest
#' through partial string matching (see criterion 1, 4).
#'
#' @param border Either a character string \code{"all"}, or a character string
#' \code{"none"}, or a length-1 or length-2 numeric value specifying the
#' number of fractions to either side of the region of interest to comprise
#' the bordering region. If a single numeric value, then this is the number
#' of fractions to each side of the region of interest; if it is two values,
#' then the first value is the number of fractions to the left, and the
#' second value is the number of fractions to the right. If there are not
#' enough fractions in either direction to completely span the number of
#' specified fractions, then all of the available fractions to the side in
#' question are considered to be part of the bordering region (see criterion
#' 2).
#'
#' @param bord_ratio A single nonnegative numeric value. A value of 0 will not
#' admit any compounds, while a value greater than 1 will admit all
#' compounds (see criterion 2).
#'
#' @param min_inten A single numeric value. A value less than the minimum mass
#' spectrometry value in the data will admit all compounds (see criterion
#' 4).
#'
#' @param max_chg A single numeric value specifying the maximum charge which a
#' compound may exhibit (see criterion 5)
#'
#' @details Attempts to filter out candidate compounds via subject-matter
#' knowledge, with the goal of removing spurious noise from downstream
#' models. The criteria for the downstream inclusion of a candidate
#' compound is listed below.
#'
#' \enumerate{
#'
#' \item The m/z intensity maximum must fall inside the range of the
#' bioactivity region of interest
#'
#' \item The ratio of the m/z intensity of a species in the areas bordering
#' the region of interest and the species maximum intensity must be less
#' than \code{bord_ratio}. When there is no bordering area then it is
#' taken to mean that all observations satisfy this criterion.
#'
#' \item The immediately right adjacent fraction to its maximum intensity
#' fraction for a species must have a non-zero abundance. In the case
#' of ties for the maximum, it is the fraction immediately to the right
#' of the rightmost maximum fraction which cannot have zero abundance.
#' When the fraction with maximum intensity is the rightmost fraction in
#' the data for an observation, then it is taken to mean that the
#' observation satisfies this criterion.
#'
#' \item At least 1 fraction in the region of interest must have intensity
#' greater than \code{min_inten}
#'
#' \item Compound charge state must be less than or equal to \code{max_chg}
#'
#' }
#'
#' @return Returns an object of class \code{filterMS} which inherits from
#' \code{msDat}. This object is a \code{list} with elements described
#' below. The class is equipped with a \code{print}, \code{summary}, and
#' \code{extractMS} function.
#'
#' \describe{
#'
#' \item{\code{msDatObj}}{ An object of class \code{\link{msDat}} such that
#' the encapsulated mass spectrometry data corresponds to each of the
#' candidate compounds that satisfed each of the criteria. If no
#' criteria are satisfied then \code{NULL} is returned. }
#'
#' \item{\code{cmp_by_crit}}{ A list containing \code{data.frame}s, one for
#' each criterion. Each row (if any) in one of the
#' sub-\code{data.frame}s contains the mass-to-charge and charge
#' information for a candidate compound that satisfies the criterion
#' represented by the \code{data.frame}; all of the compounds that
#' satisfied the criterion are included in the data. The
#' \code{data.frame}s are named \code{c1}, ..., \code{c5}, etc
#' corresponding to criterion 1, ..., criterion 5. }
#'
#' \item{\code{summ_info}}{ A list containing information pertaining to the
#' filtering process; for use by the summary function. }
#'
#' }
#'
#' @examples
#'
#' # Load mass spectrometry data
#' data(mass_spec)
#'
#' # Convert mass_spec from a data.frame to an msDat object
#' ms <- msDat(mass_spec = mass_spec,
#' mtoz = "m/z",
#' charge = "Charge",
#' ms_inten = c(paste0("_", 11:43), "_47"))
#'
#' # Filter out potential candidate compounds
#' filter_out <- filterMS(msObj = ms,
#' region = paste0("VO_", 17:25),
#' border = "all",
#' bord_ratio = 0.01,
#' min_inten = 1000,
#' max_chg = 7)
#'
#' # print, summary function
#' filter_out
#' summary(filter_out)
#'
#' # Extract filtered mass spectrometry data as a matrix or msDat object
#' filter_matr <- extractMS(msObj = filter_out, type = "matrix")
#' filter_msDat <- extractMS(msObj = filter_out, type = "matrix")
#'
#' @export
filterMS <- function(msObj, region, border="all", bord_ratio=0.05, min_inten=1000, max_chg=7L) {
# Check validity of arguments
filterMS_check_valid(msObj, region, border, bord_ratio, min_inten, max_chg)
# Number of criterion
nCrit <- 5L
# Create pointers to mass spec variables for convenience
msDatObj <- extractMS(msObj, "msDat")
ms <- msDatObj$ms
mtoz <- msDatObj$mtoz
chg <- msDatObj$chg
# Dimensions of mass spectrometry data
ms_nr <- NROW(ms)
ms_nc <- NCOL(ms)
# Create region index variable
regIdx <- extract_idx(ms, region, TRUE)
# Create border index, i.e. the indices that surround the region of interest
borIdx <- filterMS_border_idx(border, regIdx, ms_nc)
# All indices in either region or border
allIdx <- union(regIdx, borIdx)
lastIdx <- max(allIdx)
# maxIdx: the column index per row (i.e. per observation) of the fraction
# containing the maximum intensity level. The rightmost column index is
# chosen in the case of ties. Note that we only consider columns in either
# the region of interest or the bordering region.
maxIdx <- apply(ms[, allIdx, drop=FALSE], 1, function(x) {
last_in_allIdx <- utils::tail(which(x == max(x)), 1L)
allIdx[last_in_allIdx]
})
# Evaluate criteria predicates. Note that all() returns true when there are
# no values as happens for criteria 2 when borIdx is integer(0), which is
# the desired behavior.
critBool <- data.frame( array(dim=c(ms_nr, nCrit)) )
row_seq <- seq_len(ms_nr)
critBool[, 1L] <- maxIdx %in% regIdx
critBool[, 2L] <- sapply(row_seq, function(i) all(ms[i, borIdx] < bord_ratio * ms[i, maxIdx[i]]))
critBool[, 3L] <- sapply(row_seq, function(i) (ms[i, min(maxIdx[i] + 1L, lastIdx)] > 0))
critBool[, 4L] <- sapply(row_seq, function(i) any(ms[i, regIdx] > min_inten))
critBool[, 5L] <- (chg <= max_chg)
# Create a vector of indices which satisfy every criterion
keepIdx <- which( Reduce("&", critBool) )
# Extract mass spec fraction names
ms_nm <- colnames(ms)
if (is.null(ms_nm)) {
ms_nm <- as.character(seq_len(ms_nc))
}
# Create filtered mass spectrometry data
if (length(keepIdx) > 0) {
msDatObj <- msDat(ms[keepIdx, , drop=FALSE], mtoz[keepIdx], chg[keepIdx])
}
else {
msDatObj <- NULL
warning("There are no compounds that met all of the criteria\n", call.=FALSE)
}
# Create mass-to-charge and charge datasets for each criterion
cmp_by_cr <- stats::setNames(vector("list", nCrit), paste0("c", seq_len(nCrit)))
for (j in seq_len(nCrit)) {
thisKeep <- critBool[, j]
# note: data.frame can handle the case when thisKeep has length 0
cmp_by_cr[[j]] <- data.frame(mtoz = mtoz[thisKeep],
chg = chg[thisKeep] )
}
# Construct return object
outObj <- list( msDatObj = msDatObj,
cmp_by_cr = cmp_by_cr,
summ_info = list(orig_dim = c(ms_nr, ms_nc),
reg_nm = ms_nm[regIdx],
bor_nm = ms_nm[borIdx],
border = border,
bord_ratio = bord_ratio,
min_inten = min_inten,
max_chg = max_chg ) )
structure(outObj, class=c("filterMS", "msDat"))
}
#' Basic information for class \code{filterMS}
#'
#' Displays the number of candidate compounds left in the data after filtering
#'
#' @param x An object of class \code{\link{filterMS}}
#'
#' @param ... Arguments passed to dot-dot-dot are ignored
#'
#' @export
print.filterMS <- function(x, ...) {
if (is.null(x$msDatObj)) {
cat("An object of class \"filterMS\"; no observations ",
"satisfied all of the inclusion criteria.\n", sep="")
}
else {
cat("An object of class \"filterMS\" with ", format(NROW(x$msDatObj$ms), big.mark=","),
" compounds and ", NCOL(x$msDatObj$ms), " fractions.\n", sep="")
}
cat("Use summary to see more details regarding the filtering process.\n",
"Use extractMS to extract the filtered mass spectrometry data\n\n", sep="")
}
#' Overview of the filtering process
#'
#' Prints a description of the filtering process. Displays arguments chosen for
#' the \code{filterMS} constructor, how many candidate compounds were chosen for
#' each criterion, and how many candidate compounds were chosen overall.
#'
#' @param object An object of class \code{\link{filterMS}}
#'
#' @param ... Arguments passed to dot-dot-dot are ignored
#'
#' @export
summary.filterMS <- function(object, ...) {
cat(format(object), sep="")
}
# Return a vector of strings supplying the output for summary.filterMS
format.filterMS <- function(x, ...) {
# Add pointers to summ_info variables for convenience
orig_dim <- x$summ_info$orig_dim
reg_nm <- x$summ_info$reg_nm
bor_nm <- x$summ_info$bor_nm
border <- x$summ_info$border
bord_ratio <- x$summ_info$bord_ratio
min_inten <- x$summ_info$min_inten
max_chg <- x$summ_info$max_chg
ncmp_by_cr <- sapply(x$cmp_by_cr, nrow)
msDatObj <- x$msDatObj
# A bar (i.e. ----) to place underneath the region of interest header
roi_bar <- paste0( rep("-", 53 + nchar(length(reg_nm))), collapse="" )
# The region of interest names concatenated together
roi_nm_cat <- paste0(" ", reg_nm, "\n", collapse="")
# Bordering region specification
if (identical(border, "all")) {
border_spec <- "\"all\""
} else if (identical(border, "none")) {
border_spec <- "\"none\""
} else if (identical(length(border), 1L)) {
border_spec <- paste0("each having length ", border, ":")
} else {
border_spec <- paste0("having lengths ", border[1L], " and ", border[2L], ":")
}
# A bar (i.e. ----) to place underneath the bordering region header
bor_bar <- paste0( rep("-", 40 + nchar(border_spec)), collapse="" )
# The bordering region names concatenated together
if (identical(length(bor_nm), 0L)) {
bor_nm_cat <- " * no fraction names to show *\n"
} else if (length(bor_nm) > 10) {
bor_nm_cat <- " * fraction names omitted for brevity *\n"
} else {
bor_nm_cat <- paste0(" ", bor_nm, "\n", collapse="")
}
# Filtering criteria strings with uniform width
filt_crit <- format_float( c(min_inten, max_chg, bord_ratio) )
names(filt_crit) <- c("min_inten", "max_chg", "bord_ratio")
# Prior dimension strings with uniform width
orig_dim_str <- format_int(orig_dim)
names(orig_dim_str) <- c("compounds", "fractions")
# Individual filtering criterion remaining m/z levels
nlevels_fmt <- format(orig_dim[1L], big.mark=",")
ncrit_remain <- format_int(ncmp_by_cr)
names(ncrit_remain) <- paste0("crit", 1:5)
# A bar (i.e. ----) to place underneath the ind. crit. section
ncrit_bar <- paste0( rep("-", 77 + nchar(nlevels_fmt)), collapse="" )
# Final number of remaining levels after all filtering
nfinal <- ifelse(is.null(msDatObj), 0, NROW(msDatObj))
nfinal_str <- formatC(nfinal, format="d", big.mark=",")
# Begin string construction ----------------------------
# The region of interest column names
region_of_interest <- sprintf(
paste0("The region of interest was specified as (%d fractions):\n",
"%s\n",
"%s\n"),
length(reg_nm),
roi_bar,
roi_nm_cat)
# The bordering region column names
bordering_region <- sprintf(
paste0("The bordering regions were specified as %s\n",
"%s\n",
"%s",
"\n"),
border_spec,
bor_bar,
bor_nm_cat)
# The filtering criteria specification
filtering_criteria <- sprintf(
paste0("The filtering criteria was specified as:\n",
"----------------------------------------\n",
" minimum intensity: %s\n",
" maximum charge: %s\n",
" bordering region ratio: %s\n",
"\n"),
filt_crit["min_inten"],
filt_crit["max_chg"],
filt_crit["bord_ratio"])
# Dimension of data prior to filtering
prior_dimen <- sprintf(
paste0("The mass spectrometry data prior to filtering had:\n",
"--------------------------------------------------\n",
" %s compounds\n",
" %s fractions\n",
"\n"),
orig_dim_str["compounds"],
orig_dim_str["fractions"])
# Remaining compounds after individual criterion filtering
filtering_remaining <- sprintf( paste0(
"Individually, each criterion reduced the %s m/z levels to the following number:\n",
"%s\n",
" criterion 1: %s (fraction with max. abundance is in region of interest)\n",
" criterion 2: %s (fractions in bordering region have < %s%% of max. abundance)\n",
" criterion 3: %s (nonzero abundance in right adjacent fraction to max.)\n",
" criterion 4: %s (at least 1 intensity > %s in region of interest)\n",
" criterion 5: %s (must have charge <= %d)\n",
"\n"),
nlevels_fmt,
ncrit_bar,
ncrit_remain["crit1"],
ncrit_remain["crit2"],
format(round(100 * bord_ratio), big.mark=","),
ncrit_remain["crit3"],
ncrit_remain["crit4"],
format(min_inten, big.mark=","),
ncrit_remain["crit5"],
max_chg)
# Number of compounds that satisfy all filtering criterion
filtering_final_amt <- sprintf(
paste0("The total number of candidate compounds was reduced to:\n",
"-------------------------------------------------------\n",
" %s\n",
"\n"),
nfinal_str)
c(newl = "\n",
regn = region_of_interest,
bord = bordering_region,
fcri = filtering_criteria,
prir = prior_dimen,
frem = filtering_remaining,
ffin = filtering_final_amt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.