R/library_merger.R

Defines functions library_merger

Documented in library_merger

#' Extracting or generating representative scans for each compound and create a new library
#'
#' The function picks or generates a single MS1 and MS2 scan for each compound ID in the spectral library.
#'
#' @param library A list generated by the function library_generator() or the name of mgf spectral library file
#' @param method  A character of method for grouping library by compound IDs.
#' \itemize{
#'   \item{consensus:}{ Consensus spectra were generated by merging spectra (MS1 and MS2) of the same compound ID. All peaks were kept, similar peaks were aligned.}
#'   \item{common_peaks:}{ Consensus spectra were generated by merging spectra (MS1 and MS2) of the same compound ID. Peaks detected in all spectra were kept and averaged.}
#'   \item{max_tic:}{ For the same compound ID, the spectrum with higher TIC was extracted and kept.}
#'   \item{max_nb_peaks:}{ For the same compound ID, the spectrum with more peaks was kept.}
#'   \item{most_recent:}{ For the same compound ID, the most recent record was kept.}
#'  }
#' @param consensus_window  m/z window (in ppm) for spectra alignment, only used when method = "consensus" or "common_peaks". To generate consensus spectra, mass peaks in different spectra within the mass window were aligned by averaging their mass values and intensities. The metadata was kept for the spectrum with higher TIC.
#' @param output_library Name of the output library, the file extension must be mgf
#'
#' @return The same as library_generator()
#'
#' @author Youzhong Liu, \email{Youzhong.Liu@uantwerpen.be}
#'
#' @examples
#'
#' data(DRUG_THERMO_LIBRARY)
#'
#' # Withholding the scan (MS1 and MS) with highest TIC for each ID:
#'
#' library2_1=library_merger(library2, method="max_tic", output_library="library_V2_filtered.mgf")
#'
#' # Generating consensus spectra for all scans (MS1 and MS) of the same ID. The scan with highest TIC is used for metadata:
#'
#' library2_2=library_merger(library2, method="consensus", consensus_window = 20, output_library="library_V2_consensus.mgf")
#'
#' @export
#'
#' @importFrom MSnbase fData readMgfData
#' @importFrom tools file_ext
#' @importFrom utils write.table

library_merger<-function(library, method = c("consensus","common_peaks","max_tic","max_nb_peaks","most_recent"),
                          consensus_window = 10, output_library=""){

  options(stringsAsFactors = FALSE)
  options(warn=-1)

  if (missing(library)){
    stop("Please provide the output of library_generator() or a .mgf file as input library!")}

  if (is.character(library)){
    if (file_ext(library)!="mgf"){
      stop("The file extension of your input library must be mgf!")
    }}

  if (is.list(library)){
    if (length(library)==2 & "complete" %in% names(library)){
      library = library$complete
    }
    if (length(library)!=2 || (!is.list(library$sp)) || !is.data.frame(library$metadata)){
      stop("Please make sure your input library is a valid output of library_generator()!")
    }}

  if (!(method  %in% c("consensus","common_peaks","max_tic","max_nb_peaks","most_recent"))){
    stop("The library processing method does not exist!")
   }

  #####################################
  ### Reading from spectral library:
  #####################################

  if (is.character(library)){ # If input is a mgf file name
    library=readMGF2(library)}

  metadata = library$metadata
  spectrum_list = library$sp

  new_spectrum_list = list()
  new_meta_data = c()
  NN = 0

  #######################
  ### Combine MS1 scan:
  #######################

  index1=which(metadata$MSLEVEL==1) # All MS1 scan

  if (length(index1)>0){

    metadata1= metadata[index1,]
    spectrum_list1 = spectrum_list[index1]
    peak_num1 = sapply(spectrum_list1,nrow) # Number of peaks in each scan
    ID_list1=unique(metadata1$ID)

    for (ID in ID_list1){

      selected_rows = which(metadata1$ID==ID)

      sub_metadata = metadata1[selected_rows,]
      sub_peak_num = peak_num1[selected_rows]
      sub_spectrum_list = spectrum_list1[selected_rows]

      # Representative scans in the old library:

      if (method=="max_nb_peaks"){
        wm = which.max(sub_peak_num)
      } else if (method=="most_recent"){
        wm = which.max(sub_metadata$SCANS)
      } else {
        wm = which.max(sub_metadata$TIC)}

      new_meta_data = rbind.data.frame(new_meta_data,sub_metadata[wm,]) # Update metadata
      NN = NN+1

      # Append spectra list if no need for spectra merge
      if (method %in% c("max_tic","max_nb_peaks","most_recent")
          || length(selected_rows)==1){
        new_spectrum_list[[NN]]=sub_spectrum_list[[wm]]}

      # Calculate consensus spectra if multiple spectra present and we want to merge them
      if (method == "consensus" & length(selected_rows)>1){
        spectrum_averaged = average_spectrum(sub_spectrum_list, consensus_window, clean = F)
        new_spectrum_list[[NN]] = spectrum_averaged
      }

      # Calculate commomn peak spectra
      if (method =="common_peaks" & length(selected_rows)>1){
        spectrum_averaged = average_spectrum(sub_spectrum_list, consensus_window, clean = T)
        new_spectrum_list[[NN]] = spectrum_averaged
      }
    }}

  #######################
  ### Consensus MS2 scan:
  #######################

  index2=which(metadata$MSLEVEL==2)

  if (length(index2)>0){

    metadata2=metadata[index2,]

    spectrum_list2 = spectrum_list[index2]
    peak_num2 = sapply(spectrum_list2,nrow) # Number of peaks in each scan
    ID_list2=unique(metadata2$ID)

    for (ID in ID_list2){

      selected_rows = which(metadata2$ID==ID)

      sub_metadata = metadata2[selected_rows,]
      sub_peak_num = peak_num2[selected_rows]
      sub_spectrum_list = spectrum_list2[selected_rows]

      # Representative scans in the old library:

      if (method=="max_nb_peaks"){
        wm = which.max(sub_peak_num)
      } else if (method=="most_recent"){
        wm = which.max(sub_metadata$SCANS)
      } else{
        wm = which.max(sub_metadata$TIC)}

      new_meta_data = rbind.data.frame(new_meta_data,sub_metadata[wm,]) # Update metadata
      NN = NN+1

      # Append spectra list if no need for spectra merge
      if (method %in% c("max_tic","max_nb_peaks","most_recent")
          || length(selected_rows)==1){
        new_spectrum_list[[NN]]=sub_spectrum_list[[wm]]}

      # Calculate consensus spectra if multiple spectra present and we want to merge them
      if (method == "consensus" & length(selected_rows)>1){
        spectrum_averaged = average_spectrum(sub_spectrum_list, consensus_window, clean = F)
        new_spectrum_list[[NN]] = spectrum_averaged
      }

      # Calculate commomn peak spectra
      if (method =="common_peaks" & length(selected_rows)>1){
        spectrum_averaged = average_spectrum(sub_spectrum_list, consensus_window, clean = T)
        new_spectrum_list[[NN]] = spectrum_averaged
      }
    }}

  ####################
  ### Return results:
  ####################

  library = list()
  library$sp = new_spectrum_list
  library$metadata = cbind.data.frame(new_meta_data, POST_PROCECSSING = method)

  if (output_library!=""){
    writeMGF2(library,output_library)
    write.table(library$metadata,paste0(output_library,".txt"),col.names = T,row.names=F,dec=".",sep="\t")
  }
  return(library)
}
daniellyz/MergeION documentation built on Oct. 19, 2022, 1:56 p.m.