R/merge_processed_spectra.R
In maldipickr: Dereplicate and Cherry-Pick Mass Spectrometry Spectra

Documented in merge_processed_spectra

# WARNING - Generated by {fusen} from dev/dereplicate-spectra.Rmd: do not edit by hand

#' Merge multiple processed spectra and peaks
#'
#' Aggregate multiple processed spectra, their associated peaks and metadata into a feature matrix and a concatenated metadata table.
#'
#' @param processed_spectra A [list] of the processed spectra and associated peaks and metadata in two possible formats:
#' * A list of **in-memory objects** (named `spectra`, `peaks`, `metadata`) produced by [process_spectra]. Named lists will have names dropped, see Note.
#' * `r lifecycle::badge('deprecated')` A list of **paths** to RDS files produced by [process_spectra] when using the `rds_prefix` option.
#' @param remove_peakless_spectra A logical indicating whether to discard the spectra without detected peaks.
#' @param interpolate_missing A logical indicating if intensity values for missing peaks should be interpolated from the processed spectra signal or left NA which would then be converted to 0.
#'
#' @return A *n*×*p* matrix, with *n* spectra as rows and *p* features as columns that are the peaks found in all the processed spectra.
#'
#' @note When aggregating multiple runs of processed spectra, if a named list is
#' provided, note that the names will be dropped, to prevent further downstream
#' issues when these names were being appended to the rownames of the matrix
#' thus preventing downstream metadata merge.
#'
#' @seealso [process_spectra], the "Value" section in [`MALDIquant::intensityMatrix`](https://rdrr.io/cran/MALDIquant/man/intensityMatrix-functions.html)
#' @export
#' @examples
#' # Get an example directory of six Bruker MALDI Biotyper spectra
#' directory_biotyper_spectra <- system.file(
#'   "toy-species-spectra",
#'   package = "maldipickr"
#' )
#' # Import the six spectra
#' spectra_list <- import_biotyper_spectra(directory_biotyper_spectra)
#' # Transform the spectra signals according to Strejcek et al. (2018)
#' processed <- process_spectra(spectra_list)
#' # Merge the spectra to produce the feature matrix
#' fm <- merge_processed_spectra(list(processed))
#' # The feature matrix has 6 spectra as rows and
#' #  35 peaks as columns
#' dim(fm)
#' # Notice the difference when the interpolation is turned off
#' fm_no_interpolation <- merge_processed_spectra(
#'   list(processed),
#'   interpolate_missing = FALSE
#' )
#' sum(fm == 0) # 0
#' sum(fm_no_interpolation == 0) # 68
#'
#' # Multiple runs can be aggregated using list()
#' # Merge the spectra to produce the feature matrix
#' fm_all <- merge_processed_spectra(list(processed, processed, processed))
#' # The feature matrix has 3×6=18 spectra as rows and
#' #  35 peaks as columns
#' dim(fm_all)
#'
#' # If using a list, names will be dropped and are not propagated to the matrix.
#' \dontrun{
#' fm_all <- merge_processed_spectra(
#'  list("A" = processed, "B" = processed, "C" = processed))
#' any(grepl("A|B|C", rownames(fm_all))) # FALSE
#'  }
#' 
merge_processed_spectra <- function(processed_spectra, remove_peakless_spectra = TRUE, interpolate_missing = TRUE) {
  if (any(
    is.null(processed_spectra),
    !is.list(processed_spectra),
    length(processed_spectra) == 0
  )
  ) {
    stop(
      "Either 'processed_spectra' is not a list or it is an empty list."
    )
  }
  # Determine the type of input with a sneak-peek at the first element
  if (typeof(processed_spectra[[1]]) == "character") {
    if (is_a_rds_list(processed_spectra)) {
      lifecycle::deprecate_warn(
        when = "1.2.9000",
      what = "merge_processed_spectra(processed_spectra = 'must be an in-memory object')",
      details = "Ability to read processed spectra from RDS files will be dropped in next release."
      )
      processed <- lapply(processed_spectra, readRDS)
    }
  } else {
    processed <- processed_spectra
  }

  # Names at the upper level causes problems when aggregating multiple runs by
  #  being appended to the rownames of matrix thus preventing downstream metadata
  #  merge.
  if(!is.null(names(processed))){
    processed <- unname(processed)
  }
  stopifnot(is_a_processed_spectra_list(processed))

  peakless <- list()
  # List the spectra with no peaks detected and remove them
  # Initially using rownames of dataframe, now using dplyr functions for consistency
  if (remove_peakless_spectra) {
    peakless <- lapply( # Extract the metadata object from all element of the list
      processed, `[[`, "metadata"
    ) %>%
      dplyr::bind_rows(.id = "list_identifier") %>%
      dplyr::filter(.data$peaks == 0) %>%
      dplyr::pull(.data$name)
    if (length(peakless) > 0) {
      warning(
        "No peaks were detected in the following spectra, so they will be removed\n",
        paste(peakless, collapse = "\n"), "\n"
      )
    }
  }

  # 7. Bin peaks
  peaks <- lapply( # Extract the peaks object from all element of the list
    processed, `[[`, "peaks"
  ) %>% unlist()
  names_spectra <- names(peaks)
  if (remove_peakless_spectra & length(peakless) > 0) {
    peaks <- peaks[-which(names(peaks) %in% peakless)]
  }
  peaks <- MALDIquant::binPeaks(peaks, tolerance = .002, method = "strict")

  # 8. Feature matrix construction (peaks as columns and spectra as rows)
  if (interpolate_missing) {
    # This is the default in the Strejcek et al. (2018) procedure
    spectra_list <- lapply( # Extract the spectrum object from all element of the list
      processed, `[[`, "spectra"
    ) %>% unlist()

    if (remove_peakless_spectra & length(peakless) > 0) {
      spectra_list <- spectra_list[-which(names(spectra_list) %in% peakless)]
    }
    featureMatrix <- MALDIquant::intensityMatrix(peaks, spectra_list)
  } else {
    featureMatrix <- MALDIquant::intensityMatrix(peaks)
    featureMatrix[is.na(featureMatrix)] <- 0
  }
  # Adding the correct rownames fto the matrix
  if (remove_peakless_spectra & length(peakless) > 0) {
    rownames(featureMatrix) <- names_spectra[-which(names_spectra %in% peakless)]
  } else {
    rownames(featureMatrix) <- names_spectra
  }
  return(featureMatrix)
}