R/library_similarity.R

Defines functions ppm_distance library_similarity

Documented in library_similarity

#' Matching query spectrum to existing library
#'
#' The function searches unknown MS/MS spectra in a spectral library
#'
#' @param library A list generated by the function library_generator() or the name of mgf spectral library file
#' @param query_spectrum  Two-column data matrix. Two columns represent m/z and intensity of query tandem spectrum
#' @param method Character. Method to compare the similarity between query spectrum and library spectra.
#' \itemize{
#'   \item{Fragment:}{ Counting only number of fragment matches}
#'   \item{Simple:}{ Conting number of fragment and neutral loss matches}
#'   \item{All:}{ Conting number of fragment, neutral loss and mass differences matches.}
#'   \item{Cosine:}{ cosine similarity score from OrgMassSpecR package.}
#' }
#' @param prec_mz Numeric. Precursor mass of query spectrum (if known). Default value is 0. Must NOT be 0 if method = "All" or use.prec = TRUE (see below)
#' @param use.prec Boolean. If set to TRUE, precursor mass is used to "pre-query" the library
#' @param ppm_search Numeric. Mass tolerance in ppm for precursor/fragment search.
#' @param relative Numeric between 0 and 100. The relative intensity threshold of the highest peak in each spectrum). Peaks in query spectrum below relative thresholds are not taken into account.
#' @param mirror.plot Boolean. True if the query-library comparison is visualized as mirror plot
#' @param png.out Boolean. True if plotted mirror spectra are exported as png images!
#'
#' @return
#' \itemize{
#'    \item{<plot>:}{ Comparing query spectrum to ordered "hits" in the spectrum library}
#'    \item{SELECTED:}{ Library object that contain found scans.}
#'    \item{ID_SELECTED:}{ IDs of found compounds.}
#'    \item{SCORES:}{ Similarity scores between query spectrum and each }
#' }
#'
#' @author Youzhong Liu, \email{Youzhong.Liu@uantwerpen.be}
#'
#' @examples
#'
#' data(DRUG_THERMO_LIBRARY)
#' dat = cbind(c(136.073,149.071,151.0991,180.047),c(1,1,20,3))
#' query = library_similarity(library2, dat, method = "Cosine", prec_mz = 0, use.prec =FALSE, ppm_search = 20, relative = 1, mirror.plot = F)
#'
#' # Query-library comparison via mirror plot:
#' library_visualizer_similarity(query$SELECTED, id= query$ID_SELECTED[1], query_spectrum = dat)
#'
#' @importFrom MSnbase fData readMgfData
#' @importFrom tools file_ext
#' @importFrom stringr str_replace_all fixed
#' @importFrom OrgMassSpecR SpectrumSimilarity
#'
#' @export

library_similarity<-function(library, query_spectrum=NULL, method = c("Fragment", "Simple", "All", "Cosine"),
              prec_mz = 0, use.prec = FALSE, ppm_search = 20, relative = 1, mirror.plot = T, png.out=F){

  options(stringsAsFactors = FALSE)
  options(warn=-1)
  SELECTED_LIBRARY = NULL

  #################
  ### Check inputs:
  #################

  if (missing(library)){
    stop("Please provide the output of library_generator() or a .mgf file as input library!")}

  if (is.character(library)){
    if (file_ext(library)!="mgf"){
      stop("The file extension of your input library must be mgf!")
    }}

  if (is.list(library)){
    if (length(library)==2 & "complete" %in% names(library)){
      library = library$complete
    }
    if (length(library)!=2 || (!is.list(library$sp)) || !is.data.frame(library$metadata)){
      stop("Please make sure your input library is a valid output of library_generator()!")
    }}

  if (is.null(query_spectrum)){
      stop("Please provide a 2 column query spectrum!")
  } else {
  if (ncol(query_spectrum)<2){
    stop("Spectrum must have 2 columns m/z and intensity!")
    }
  }

  if ((prec_mz == 0) && use.prec){
    stop("Please provide the precursor mass if it is used for library search!")
  }

  if ((prec_mz == 0) && (method == "All")){
    stop("Please provide the precursor mass if neutral loss are considered!")
  }

  if ((prec_mz == 0) && (method == "Simple")){
    stop("Please provide the precursor mass if neutral loss are considered!")
  }

  if ((prec_mz!=0) && use.prec){
    query = paste0("PEPMASS = ", prec_mz)
    library = library_manager(library, query = query, ppm_search = ppm_search)
    library = library$SELECTED # Filter library according to precursor mass
    if (nrow(library$metadata)==0){stop("No record with specified precursor mass found!")}
  }

  library = library_manager(library, query = "MSLEVEL = 1")$LEFT
  if (nrow(library$metadata)==0){stop("No MS/MS records!")}

  ###############################
  ### Preprocess query spectrum:
  ###############################

  dat = query_spectrum[,1:2, drop = FALSE]

  # Normalize, cut only masses smaller than precursor and filter background noise:

  dat[,2]=dat[,2]/max(dat[,2])*100

  if (prec_mz==0){prec_mz = 10000} # Not consider the precursor mass

  selected = which((dat[,1] < prec_mz+1)
                   & (dat[,2]>relative)
                   & ppm_distance(dat[,2], prec_mz)> ppm_search) # Remove non-precursor masses

  if (length(selected)>0){
    dat = dat[selected,,drop = FALSE]
    #dat = data.matrix(matrix(dat,ncol=2))
  } else {
    stop("Spectrum not valid!")
  }
  dat_mdiff =  unique(unlist(dat[,1]))

  #####################################
  ### Reading from spectral library:
  #####################################

  if (is.character(library)){ # If input is a mgf file name
    library=readMGF2(library)
  }

  metadata = library$metadata
  spectrum_list = library$sp

  NS = nrow(metadata)
  nloss_list = lapply(1:NS, function(i) as.numeric(metadata$PEPMASS[i])-spectrum_list[[i]][,1])
  mdiff_list = lapply(1:NS, function(i) unique(unlist(dist(spectrum_list[[i]][,1]))))

  #print(nloss_list)
  #nloss_list = lapply(nloss_list, function(x) x[x>0])

  ###########################
  ### Run spectra search:
  ###########################

  nb_matched_scores = rep(0,NS)
  sim_scores = rep(0,NS)

  for (i in 1:NS){


    if (method == "Fragment"){
      dist_spec = sapply(spectrum_list[[i]][,1],ppm_distance, dat[,1])
      dist_spec[lower.tri(dist_spec)] = 10000000 # Remove redundant matches
      sim_scores[i] = sum(dist_spec<=ppm_search)
     }

    if (method == "Simple"){
      dist_spec = sapply(spectrum_list[[i]][,1], ppm_distance, dat[,1])
      dist_spec[lower.tri(dist_spec)] = 10000000 # Remove redundant matches
      dist_loss = sapply(nloss_list[[i]], ppm_distance, prec_mz - dat[,1])
      dist_loss[lower.tri(dist_loss)] = 10000000 # Remove redundant matches
      sim_scores[i] = sum(dist_spec<=ppm_search) + sum(dist_loss<=ppm_search)
    }

    if (method == "All"){
      dist_spec = sapply(spectrum_list[[i]][,1], ppm_distance, dat[,1])
      dist_spec[lower.tri(dist_spec)] = 10000000 # Remove redundant matches
      dist_loss = sapply(nloss_list[[i]], ppm_distance, prec_mz - dat[,1])
      dist_loss[lower.tri(dist_loss)] = 10000000 # Remove redundant matches
      dist_mdiff = sapply(mdiff_list[[i]], ppm_distance, dat_mdiff)
      dist_mdiff[lower.tri(dist_mdiff)] = 10000000 # Remove redundant matches
      sim_scores[i] = sum(dist_spec<=ppm_search) + sum(dist_loss<=ppm_search) + sum(dist_mdiff<=ppm_search)
    }

    if (method == "Cosine"){
      abs_search = ppm_search/1000000*median(dat[,1]) # Da window for cosine spectra search
      sim_scores[i] = SpectrumSimilarity(dat, spectrum_list[[i]], t = abs_search)
    }
    graphics.off()
  }

  ###########################
  ### Plot and output results:
  ###########################

  # Top scores and output:

  indexes = order(sim_scores,decreasing=T)
  indexes = intersect(indexes,which(sim_scores>0))

  if (length(indexes>0)){
    SELECTED_LIBRARY = list()
    SELECTED_LIBRARY$sp = library$sp[indexes]
    SELECTED_LIBRARY$metadata = library$metadata[indexes,]
    sim_scores = sim_scores[indexes]

  # Plot:
    if (mirror.plot){
      for (i in 1:length(indexes)){

        if (png.out){png(paste0("SIM_SCAN_", SELECTED_LIBRARY$metadata$SCANS[i],".png"), width = 700, height = 480)}
        bottom.label = paste0("Library spectrum of ID = ", SELECTED_LIBRARY$metadata$ID[i])
        xlim = c(min(dat[,1])*0.8, max(dat[,1])*1.2)
        SpectrumSimilarity(dat, SELECTED_LIBRARY$sp[[i]], top.label="Query spectrum",bottom.label= bottom.label,xlim=xlim, print.graphic = T)

        if (png.out){dev.off()}
      }
    }
  }

  return(list(SELECTED = SELECTED_LIBRARY,
         ID_SELECTED = unique(SELECTED_LIBRARY$metadata$ID),
         SCORES = sim_scores))
}

############################
### Internal functions:
###########################

ppm_distance<-function(x,y){
  return(abs((x-y)/y*1000000))
}
daniellyz/MergeION documentation built on Oct. 19, 2022, 1:56 p.m.