R/process_SmartMS2.R

Defines functions decimalplaces denoise_ms2_spectrum separated_peaks2 ppm_distance process_SmartMS2

Documented in process_SmartMS2

#' Read and combine targeted MS2 scans from one LC-MS/MS file using default MergeION algorithm
#'
#' Function used by library_generator to detect MS2 scans
#' 
#' @importFrom BiocGenerics basename
#' @importFrom MSnbase readMSData rtime tic fData readMgfData precursorMz polarity chromatogram
#' @importFrom pracma findpeaks trapz
#' @importFrom mzR openMSfile header peaks
#'
#' @export
#' 
process_SmartMS2<-function(mzdatafiles = NULL, ref = NULL, 
      rt_search = 10, rt_gap = 30, ppm_search = 10, mz_search = 0.01,
      baseline= 1000, relative = 5, max_peaks = 200, normalized=T){

 options(stringsAsFactors = F)
 options(warn=-1)
 
 if ("FILENAME" %in% colnames(ref)){
   valid = c(which(basename(ref$FILENAME) == basename(mzdatafiles)),  which(ref$FILENAME =="N/A"))
   ref = ref[valid,,drop=FALSE]
 }

 ### Initialize variables

 new_MS2_meta_data = c() # Compounds detected in the ms data
 MS2_scan_list = list() # List of spectrum2 objects
 scan_number = c() # Which scan number in the raw chromatogram is found
 new_PEP_mass = c() # The real mass in samples
 mass_dev = c() # Mass deviations in ppm
 spectrum_list = list() # List of spectra to save
 NNN=0 # Numerator

 #####################
 ### Load MS2 Scans###
 #####################
 
 MS2_Janssen <- try(readMSData(mzdatafiles, msLevel = 2, verbose = FALSE, mode = "inMemory",  centroided = T),silent=T)
 
 if (class(MS2_Janssen)=="try-error"){MS2_Janssen=NULL}
 
 if (nrow(ref)==0){MS2_Janssen=NULL}
 
 if (!is.null(MS2_Janssen)){ # If data contains MS2 scan

   print("Processing MS2 scans of data file ...")
   
   ### Extract useful informations from raw data:

   NS = length(MS2_Janssen) # Total number of scans
   MS2_prec_mz = precursorMz(MS2_Janssen) # Label precursor mass
   targets =  unique(MS2_prec_mz) # All targeted masses
   MS2_prec_rt = rtime(MS2_Janssen) # In second
   MS2_tic = as.numeric(MSnbase::tic(MS2_Janssen))

   ### Filter ref because not all targeted m/z exists or fragmented in the sample! Important for not to search whatever

   dev_targets = sapply(as.numeric(ref$PEPMASS),function(x) min(abs(x-targets)))
   valid = which(dev_targets <= 1) #  Find targeted metadata in experimental file!! - faster!

   ref = ref[valid,,drop=FALSE]
   prec_theo= as.numeric(ref$PEPMASS)
   prec_rt=as.numeric(ref$RT)*60 # Allow N/A
   
   ####################################################
   ### Go through each ref item to find adequate scan##
   ####################################################
   
   if (nrow(ref)>0){

      for (i in 1:nrow(ref)){
        
        # 1. Define a wide search range based on targeted precursor mass
        
        scan_range = which(abs(MS2_prec_mz-prec_theo[i])<1 & MS2_tic>baseline) 
        if (!is.na(prec_rt[i])){
            time_range = which(MS2_prec_rt >= prec_rt[i] - rt_search & MS2_prec_rt <= prec_rt[i] + rt_search)
            scan_range = intersect(scan_range,time_range)
        }
        
        scan_range = sort(unique(as.numeric(scan_range)))
        
        # 2. Refine the search range based on exact precursor mass or mass detected in actual spectrum
        
        if (length(scan_range)>0){
          
          temp_scan_range = c() # Validated scan number
          temp_mz0 = c()
          
          for (k in scan_range){
            
            checked = 0 # The scan validation state
            Frag_data = MS2_Janssen[[k]]
            
            ppm_error = ppm_distance(MS2_prec_mz[k], prec_theo[i])
            abs_error = abs(MS2_prec_mz[k]-prec_theo[i])
            
            # Easy situation if scans are labeled correctly:
            
            if (ppm_error<=ppm_search || abs_error<=mz_search){
              checked = 1
              mz0 = MS2_prec_mz[k]
            }
          
            # If the scan is not correctly labeled, check in details actual spectrum:
            
            if (checked==0  & length(Frag_data@mz)>1){
              ppm_dis = ppm_distance(Frag_data@mz,prec_theo[i])
              mz_dev = abs(Frag_data@mz - prec_theo[i])
              prec_ind = which.min(ppm_dis)
              prec_int = Frag_data@intensity[prec_ind] # The intensity of precursor ion

              if (prec_int>baseline*2 & (ppm_dis[prec_ind]<=ppm_search || mz_dev[prec_ind]<=mz_search)){
                checked = 1
                mz0 = Frag_data@mz[prec_ind]
              } # The precursor mass must be higher than baseline
            }
            
            # Add to filters scans  
              
            if (checked==1){
              temp_scan_range = c(temp_scan_range, k)
              temp_mz0 = c(temp_mz0, mz0)
            }
          }
          
          scan_range = temp_scan_range
          scan_mz0 = temp_mz0
          
      ### 3. Select "best" scan and calculate deviation
    
          if (length(scan_range)>0){
            
            scan_rts = MS2_prec_rt[scan_range]
            scan_tics = MS2_tic[scan_range]
            ind_features = separated_peaks2(scan_range, scan_rts, scan_tics, rt_gap)

            valid_k = sapply(ind_features, function(x) x[1]) # Highest TIC scan

            NV = length(valid_k) # >1 if isomers present
            mz = scan_mz0[match(valid_k,scan_range)] # Precursor m/z of valid scan
            dev_ppm= round(sapply(mz, function(x) min(ppm_distance(x,prec_theo[i]))),2)

            scan_number = c(scan_number,valid_k)  # Save scan numbers
            new_PEP_mass = c(new_PEP_mass, mz)
            mass_dev = c(mass_dev,dev_ppm)
        
          ### 4. Calculate smart ion append spectra and metadata:
      
            for (vvv in 1:NV){
              
                ind_feature = as.numeric(ind_features[[vvv]])
                tmp_scans = MS2_Janssen[ind_feature]

                scan_best_tic = MS2_Janssen[[ind_feature[as.numeric(which.max(MSnbase::tic(tmp_scans)))]]]
                scan_best_tic = cbind(scan_best_tic@mz, scan_best_tic@intensity)

                scan_most_peaks = MS2_Janssen[[ind_feature[as.numeric(which.max(mzR::peaksCount(tmp_scans)))]]]
                scan_most_peaks = cbind(scan_most_peaks@mz, scan_most_peaks@intensity)

                scan_least_peaks = MS2_Janssen[[ind_feature[as.numeric(which.min(mzR::peaksCount(tmp_scans)))]]]
                scan_least_peaks = cbind(scan_least_peaks@mz, scan_least_peaks@intensity)

                scan_all = list(scan_best_tic, scan_most_peaks, scan_least_peaks)
                
                scan_merged = average_spectrum(scan_all, mz_window = mz_search*2)
                scan_merged = scan_merged$new_spectrum
                
                MS2_scan_list[[NNN+vvv]] = scan_merged
                new_MS2_meta_data = rbind(new_MS2_meta_data,ref[i,,drop=FALSE])
            }
            
       NNN=NNN+NV
      }
    }
  } # End of screening all reference masses
  
  #####################   
  ### Create metadata##
  #####################
    
  if (NNN>0){ 
    
    new_MS2_meta_data[,"PEPMASS"] = round(as.numeric(new_PEP_mass),5)
    new_MS2_meta_data[,"RT"] = round(MS2_prec_rt[scan_number]/60,2)  # minutes
    new_MS2_meta_data[,"FILENAME"] = rep(basename(mzdatafiles),NNN)
    new_MS2_meta_data[,"MSLEVEL"] = rep(2,NNN)
    new_MS2_meta_data[,"TIC"] = MS2_tic[scan_number]
    new_MS2_meta_data[,"PEPMASS_DEV"] = mass_dev
    new_MS2_meta_data[,"SCAN_NUMBER"] = scan_number
    
    #################################
    ### Process and collect spectra##
    #################################
  
    included = c() # Not filtered
    n0=0
      
    for (i in 1:NNN){
        
       sp0 = MS2_scan_list[[i]]
       
       if (!is.null(sp0)){
         
          if (nrow(sp0)>2){
            sp1 = denoise_ms2_spectrum(sp0, new_MS2_meta_data$PEPMASS[i], max_peaks, relative, normalized)
        
            if (!is.null(sp1)){
          
            if (nrow(sp1)>1){
              included = c(included, i)
              n0 = n0 + 1
              spectrum_list[[n0]]=sp1
        }}}}
    }
    
    new_MS2_meta_data = new_MS2_meta_data[included,,drop=FALSE]
    id_kept = unique(new_MS2_meta_data$ID)
    ref = ref[match(id_kept,ref$ID),,drop=FALSE]
   } 
   }}

  if (!is.null(new_MS2_meta_data)){
    if (nrow(new_MS2_meta_data)==0){  
      print(paste0("No MS2 scan in the data file ",mzdatafiles," matches with metadata!"))
  }} else { print(paste0("No MS2 scan in the data file ",mzdatafiles," matches with metadata!"))}

  return(list(sp=spectrum_list,metadata=new_MS2_meta_data,ref_MS2=ref))
}

###########################
### Internal functions:####
###########################

# ppm error calculation:

ppm_distance<-function(x,y){
  x = as.numeric(x)
  y = as.numeric(y)
  if (y>100){
    ppm = abs((x-y))/y*1000000
  } else {
    ppm = abs(x-y)
    ppm[ppm<0.01]=0
  }
  return(ppm)
}

# Find indexes from "ranges": separated isomer peaks according to rt and intensity

separated_peaks2<-function(ranges, rts, tics, rt_gap){

  # ranges: scan or peak number
  NR = length(ranges)
  ind_features = list()
  kf = 0
  
 # screen:
  if (NR>1){
    tmp = data.matrix(cbind(ranges,rts,tics))
    tmp = tmp[order(tmp[,3],decreasing=T),] # Order the tmp by intensity
    while (nrow(tmp)>0){
      kf = kf +1
      feature_rt = tmp[1,2]
      valid = which(abs(tmp[,2]-feature_rt)<=rt_gap)
      ind_features[[kf]] = tmp[valid, 1]
      tmp  = tmp[-valid,,drop = FALSE]
  }} else {ind_features[[1]] = ranges}
  
  # check:
  return(ind_features)
}

# Keep top peaks

denoise_ms2_spectrum<-function(sp, mz0, max_peak, min_relative, normalized = T){
  
  denoised_spectrum = matrix(c(0,0),1,2)
  
  if (nrow(sp)>0){
    
    # Check resolution:
    
    checked = any(sapply(sp[,1], decimalplaces)>2) # At least 2 values after decimal
    
    # Filter top peaks:
    
    sp = sp[order(sp[,2], decreasing = T),,drop=FALSE]
    tops = min(max_peak, nrow(sp))  
    sp = sp[1:tops,,drop=FALSE]
    
    # Normalize to 100:
    
    sp1 = sp
    sp1[,2] = sp1[,2]/max(sp1[,2])*100
    
    # Relative Intensity filter:
    
    filter = which(sp1[,2]>=min_relative & sp1[,1]<mz0-1)
    if (normalized){sp = sp1}  
    sp = sp[filter,,drop=FALSE]
    
    # Check validity:
    
    if (nrow(sp)>0 & checked){
      sp = sp[order(sp[,1]),,drop=FALSE]
      if (normalized){sp[,2] = sp[,2]/max(sp[,2])*100}
      denoised_spectrum = sp
    }
  }
  return(denoised_spectrum)
}

decimalplaces <- function(x){
  if ((x %% 1) != 0) {
    nchar(strsplit(sub('0+$', '', as.character(x)), ".", fixed=TRUE)[[1]][[2]])
  } else {
    return(0)
  }
}
daniellyz/MergeION2 documentation built on Jan. 26, 2024, 6:24 a.m.