R/library_manager.R

Defines functions ppm_distance library_manager

Documented in library_manager

#' Searching and managing the spectral library
#'
#' The function queries and selects or removes scans from the spectral library that satisfy user-defined conditions (query metadata)
#'
#' @details
#' All spaces are removed from each query condition before it is evaluated, so
#' \code{"IONMODE = Positive"} and \code{"IONMODE=Positive"} are equivalent
#' (and metadata values that themselves contain spaces cannot be matched).
#' Each condition is split at its first \code{"="}, so values containing
#' \code{"="} (e.g. SMILES codes) are matched in full. A condition that cannot
#' be evaluated (no \code{"="}, non-numeric PEPMASS/RT value, or a left-hand
#' side that is not exactly one metadata column) matches no scan. Empty or
#' \code{NA} conditions are ignored; when no condition remains, every scan is
#' selected. Selected scans are returned in library order.
#'
#' @param library A list generated by the function library_generator() or the name of mgf spectral library file
#' @param query  Vector of characters. Vector of conditions used for querying the library. e.g. c("IONMODE=Positive","PEPMASS=325.19"). The left-hand side must match with the metadata items of the searched library.
#' @param logical Character. "AND" for selecting scans that satisfy all conditions, "OR" when selecting records that satisfy at least one condition
#' @param ppm_search Numeric. Mass tolerance in ppm. Only used when searching by precursor mass "PEPMASS=..."
#' @param rt_search Numeric. Retention time tolerance in seconds (although rt in the query and metadata are in min). Only used when searching by retention time "RT=..."
#' @return
#' \itemize{
#'  \item{SELECTED:}{ Library object that only contains selected scans}
#'  \item{ID_SELECTED:}{ IDs of selected scans}
#'  \item{LEFT:}{ Library object that only contains unselected scans}
#'  \item{ID_LEFT:}{ IDs of unselected scans}
#' }
#'
#' @examples
#'
#' data(DRUG_THERMO_LIBRARY)
#'
#' # Search library using query command lines:
#' query = library_manager(library2,query=c("IONMODE=Positive","RT=1.2"), logical="AND", rt_search=6)
#'
#' # Create a new library from query:
#' new_library1 = query$SELECTED
#'
#' # Summary of found compounds:
#' library_reporter(new_library1)
#'
#' # Remove scans from current library according to query:
#' new_library2 = query$LEFT
#'
#' # Add another filter:
#' query = library_manager(new_library1,query=c("IONMODE=Positive","MSLEVEL=2","RT=1.2"))
#' new_library3 = query$SELECTED
#'
#' @export
#'
#' @importFrom MSnbase fData readMgfData
#' @importFrom tools file_ext
#' @importFrom stringr str_replace_all fixed
#'
library_manager <- function(library, query = "", logical = c("AND", "OR"),
                            ppm_search = 20, rt_search = 12) {

  # These settings are only needed while the function runs (e.g. to silence
  # coercion warnings from as.numeric); restore the caller's options on exit
  # instead of changing them for the rest of the session.
  old_options <- options(stringsAsFactors = FALSE, warn = -1)
  on.exit(options(old_options), add = TRUE)

  #################
  ### Check inputs:
  #################

  if (missing(library)) {
    stop("Please provide the output of library_generator() or a .mgf file as input library!")
  }

  if (is.character(library)) {
    if (tools::file_ext(library) != "mgf") {
      stop("The file extension of your input library must be mgf!")
    }
  }

  if (is.list(library)) {
    # Accept the two-element wrapper that holds the library under "complete"
    if (length(library) == 2 && "complete" %in% names(library)) {
      library <- library$complete
    }
    if (length(library) != 2 || (!is.list(library$sp)) || !is.data.frame(library$metadata)) {
      stop("Please make sure your input library is a valid output of library_generator()!")
    }
  }

  logical <- match.arg(logical, choices = c("AND", "OR"), several.ok = FALSE)

  if (!is.character(query)) {
    stop("Query expression is not valid!")
  }

  #####################################
  ### Reading from spectral library:
  #####################################

  if (is.character(library)) { # If input is a mgf file name
    library <- readMGF2(library)
  }

  metadata <- library$metadata
  spectrum_list <- library$sp

  prec_mz <- as.numeric(metadata$PEPMASS)
  prec_rt <- as.numeric(metadata$RT)

  ###########################
  ### Run query expressions:
  ###########################

  # Returns the scan indexes satisfying one whitespace-free "FIELD=VALUE"
  # condition; integer(0) when nothing matches or the condition is invalid.
  match_condition <- function(condition) {
    # Split at the FIRST "=" only, so values such as "C=O" stay intact
    sep_pos <- regexpr("=", condition, fixed = TRUE)[1]
    if (sep_pos < 1) {
      return(integer(0))
    }
    field <- substr(condition, 1, sep_pos - 1)
    value <- substr(condition, sep_pos + 1, nchar(condition))

    if (field == "PEPMASS") {
      # Precursor mass search within ppm_search
      target_mass <- as.numeric(value)
      if (is.na(target_mass)) {
        return(integer(0))
      }
      return(which(ppm_distance(target_mass, prec_mz) <= ppm_search))
    }

    if (field == "RT") {
      # Query and metadata RT are in minutes, the tolerance is in seconds
      target_rt <- as.numeric(value)
      if (is.na(target_rt)) {
        return(integer(0))
      }
      return(which(abs(target_rt * 60 - prec_rt * 60) <= rt_search))
    }

    # Any other field: exact match against the metadata column of that name
    cid <- which(colnames(metadata) == field)
    if (length(cid) != 1) {
      return(integer(0))
    }
    which(metadata[[cid]] == value)
  }

  # Remove white space, then drop empty/NA conditions
  conditions <- gsub(" ", "", query, fixed = TRUE)
  conditions <- conditions[!is.na(conditions) & nzchar(conditions)]

  if (length(conditions) == 0) {
    # No condition to apply: nothing is filtered out
    selected <- seq_along(spectrum_list)
  } else {
    # Keep every condition's result, including empty ones, so that "AND"
    # with an unmatched condition correctly yields an empty selection.
    matches <- lapply(conditions, match_condition)
    combine <- if (logical == "AND") intersect else union
    selected <- sort(Reduce(combine, matches))
  }

  # Output results:

  left <- setdiff(seq_along(spectrum_list), selected)

  selected_library <- left_library <- library
  selected_library$sp <- library$sp[selected]
  selected_library$metadata <- library$metadata[selected, , drop = FALSE]
  left_library$sp <- library$sp[left]
  left_library$metadata <- library$metadata[left, , drop = FALSE]

  list(SELECTED = selected_library,
       ID_SELECTED = unique(selected_library$metadata$ID),
       LEFT = left_library,
       ID_LEFT = unique(left_library$metadata$ID))
}

############################
### Internal functions:
###########################

# Absolute deviation of x from y, expressed in parts per million of y.
# Vectorised over both arguments through ordinary R arithmetic.
ppm_distance <- function(x, y) {
  relative_dev <- (x - y) / y
  abs(relative_dev * 1000000)
}
daniellyz/MergeION documentation built on Oct. 19, 2022, 1:56 p.m.