R/alternateAnalyze.R
In RMassBank: Workflow to process tandem MS files and build MassBank records

fragDataIndexing <- function(fragData){
  index <- vector()
  fragDataWholeMass <- floor(fragData$mass*100)
  
  k <- 0
  
  for(i in 1:length(fragDataWholeMass)){
    while(k <= fragDataWholeMass[i]){
      index[k] <- i
      k <- k + 1
    }
  }

  return(index)
}

is.sub.formula <- function(formula, targetAtomList){
  atomList <- formulastring.to.list(formula)
  if(!all(names(atomList) %in% names(targetAtomList))){
    return(FALSE)
  }
  
  for(atom in names(atomList)){
    if(atomList[[atom]] > targetAtomList[[atom]]){
      return(FALSE)
    }
  }
  return(TRUE)
}

newStep2WorkFlow <- function(w, mode="pH", 
                             confirmMode=FALSE, progressbar = "progressBarHook", 
                             settings = getOption("RMassBank"), analyzeMethod="formula", fragdataFile = NA){
  ##Load the fragment data (locally or from RMassBank package)
  fragData <- read.csv(fragdataFile,colClasses = c("character","numeric"))
  
  ##Progress bar
  nLen <- length(w@files)
  nProg <- 0
  message("msmsWorkflow: Step 2. First analysis pre recalibration")
  pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen))
  
  ##Index the fragment data (for time reasons, "which" is very slow for large matrices)
  fragDataIndex <- fragDataIndexing(fragData)
  
  
  
  w@analyzedSpecs <- lapply(w@specs, function(spec) {
    #print(spec$id)
    s <- analyzeMsMs.optimized(spec, mode=mode, detail=TRUE, run="preliminary",
                     filterSettings = settings$filterSettings,
                     spectraList = settings$spectraList, method = analyzeMethod, fragData = fragData, 
                     fragDataIndex = fragDataIndex)
    # Progress:
    nProg <<- nProg + 1
    pb <- do.call(progressbar, list(object=pb, value= nProg))
    
    return(s)
  })
  
  for(f in w@files)
    w@analyzedSpecs[[basename(as.character(f))]]$name <- basename(as.character(f))
  
  suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE)))
  return(w)
}


analyzeMsMs.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary",
                        filterSettings = getOption("RMassBank")$filterSettings,
                        spectraList = getOption("RMassBank")$spectraList, method="formula", fragData,
                        fragDataIndex)
{
  .checkMbSettings()
  
  if(msmsPeaks$foundOK == FALSE)
    return(list(foundOK = FALSE, id=msmsPeaks$id))
  
  if(method=="formula")
  {
    return(analyzeMsMs.formula.optimized(msmsPeaks, mode, detail, run, filterSettings,
                               spectraList, fragData, fragDataIndex))
  }
  else if(method == "intensity")
  {
    return(analyzeMsMs.intensity(msmsPeaks, mode, detail, run, filterSettings))
  }
}

analyzeMsMs.formula.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary",
                                filterSettings = getOption("RMassBank")$filterSettings,
                                spectraList = getOption("RMassBank")$spectraList, fragData, fragDataIndex)
{
	cut <- 0
	cut_ratio <- 0
	if(run=="preliminary")
	{
		mzColname <- "mz"
		filterMode <- "coarse"
		cut <- filterSettings$prelimCut
		if(is.na(cut))
		{
		  adductProperties <- getAdductProperties(mode, msmsPeaks@formula)
		  if(adductProperties$charge > 0) cut <- 1e4
		  if(adductProperties$charge < 0) cut <- 0
		}
		cutRatio <- filterSettings$prelimCutRatio
	} else{
		mzColname <- "mzRecal"
		filterMode <- "fine"
		cut <- filterSettings$fineCut
		cut_ratio <- filterSettings$fineCutRatio
		if(is.na(cut)) cut <- 0
	}
  
	# find whole spectrum of parent peak, so we have reasonable data to feed into
	# MolgenMsMs
	parentSpectrum <- msmsPeaks$parentPeak


	# Check whether the spectra can be fitted to the spectra list correctly!
	if(nrow(msmsPeaks$childHeaders) != length(spectraList))
	{
		warning(paste0("The spectra count of the substance ", msmsPeaks$id, " (", nrow(msmsPeaks$childHeaders), " spectra) doesn't match the provided spectra list (", 
						length(spectraList), " spectra).")
	)
    return(list(specOK=FALSE))
    
  }
  
  # here follows the Rcdk analysis
  #------------------------------------
  parentPeaks <- data.frame(mzFound=msmsPeaks$mz$mzCenter, 
                            formula=msmsPeaks$formula,
                            dppm=0,
                            x1=0,x2=0,x3=0)
  
	# get the adduct additions
	adductProperties <- getAdductProperties(mode, msmsPeaks@formula)
	allowed_additions <- adductProperties$addition
	mode.charge <- adductProperties$charge
  
  # the ppm range is two-sided here.
  # The range is slightly expanded because dppm calculation of
  # generate.formula starts from empirical mass, but dppm cal-
  # culation of the evaluation starts from theoretical mass.
  # So we don't miss the points on 'the border'.
  
  if(run=="preliminary")
    ppmlimit <- 2 * max(filterSettings$ppmLowMass, filterSettings$ppmHighMass)
  else
    ppmlimit <- 2.25 * filterSettings$ppmFine
  
  parent_formula <- add.formula(msmsPeaks$formula, allowed_additions)
  dbe_parent <- dbe(parent_formula)
  # check whether the formula is valid, i.e. has no negative or zero element numbers.
  #print(parent_formula)
  if(!is.valid.formula(parent_formula))
    return(list(specOK=FALSE))
  limits <- to.limits.rcdk(parent_formula)
  
  parentAtomList <- formulastring.to.list(parent_formula)
  
  rAtoms <- which(c(!grepl("Br",parent_formula),!grepl("N",parent_formula), !grepl("S",parent_formula),!grepl("Cl",parent_formula)))
  
  if(length(rAtoms)){
    newfragData <- fragData[-which(apply(occurrenceMatrix[,rAtoms,drop=FALSE], 1, any)),]
    newfragDataIndex <- fragDataIndexing(newfragData)
  } else{
    newfragData <- fragData
    newfragDataIndex <- fragDataIndex
  }
  
  
  # On each spectrum the following function analyzeTandemShot will be applied.
  # It takes the raw peaks matrix as argument (mz, int) and processes the spectrum by
  # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity
  # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks
  # with insufficient match accuracy or no match.
  
  analyzeTandemShot <- function(shot_mat)
  {
    shot <- as.data.frame(shot_mat)
    shot_orig <- shot
    # Filter out low intensity peaks:
    shot_lo <- shot[(shot$int < cut) | (shot$int < max(shot$int) * cut_ratio),]
    shot <- shot[(shot$int >= cut) & (shot$int > max(shot$int) * cut_ratio),]
    shot_full <- shot
    
    # Is there still anything left?
    if(nrow(shot)==0)
      return(list(specOK=FALSE))
    
    # Filter out satellite peaks:
    shot <- filterPeakSatellites(shot, filterSettings)
    shot_satellite_n <- setdiff(row.names(shot_full), row.names(shot))
    shot_satellite <- shot_full[shot_satellite_n,]
    
    # Is there still anything left?
    if(nrow(shot)==0)
      return(list(specOK=FALSE))
    
    if(max(shot$int) < as.numeric(filterSettings$specOkLimit))
      return(list(specOK=FALSE))
    # Crop to 4 digits (necessary because of the recalibrated values)
    shot[,mzColname] <- round(shot[,mzColname], 5)
    
    
    # check whether formula can be fitted into precalculated fragment data
    usePrecalcData <- is.sub.formula(parent_formula,list("C"=50,"H"=70,"N"=50,"O"=50,"S"=10,"Cl"=10,"Br"=10))
    
    
    if(usePrecalcData){
      
		# if yes, split the peaks into 2 sets: Those which can be identified using the
		# fragment data, and those which can't (by maximum fragment Data m/z - 0.5)

		precalcIndex <- which(shot[,mzColname] < (max(newfragData$mass)-0.5))
		smallShots <- shot[precalcIndex,]
		bigShots <- shot[-precalcIndex,]

		# for smaller peaks use the precalculated fragment data
		if(nrow(smallShots)){
		peakmatrixSmall <- lapply(smallShots[,mzColname],function(mass){
			mass.calc <- mass + mode.charge * .emass
			maxminMass <- ppm(mass.calc, ppmlimit, l=TRUE)
			# retrieve only possibly relevant rows of the fragment Data (use the index)
			indices <- newfragDataIndex[floor(maxminMass[2]*100)]:newfragDataIndex[ceiling(maxminMass[1]*100)]
			
			fragmentedFragmentData <- newfragData[indices,]
			
			# narrow it down further using the ppm
			fragmentedFragmentData <- fragmentedFragmentData[which(fragmentedFragmentData$mass > maxminMass[2] & fragmentedFragmentData $mass < maxminMass[1]),]
			
			# return nothing if the narrowed down data is empty at this point
			if(!nrow(fragmentedFragmentData)){
				return(t(c(mzFound=as.numeric(as.character(mass)),
						formula=NA, mzCalc=NA)))
			}
			
			
			# find out if the narrowed down fragments have a sub-formula of the parentformula
			# return the indexes of relevant formulas
			fragmentedFragmentData <- fragmentedFragmentData[which(sapply(fragmentedFragmentData$formula, function(currentFormula) is.sub.formula(currentFormula,parentAtomList))),]
			
			# return nothing if the narrowed down data is empty at this point
			if(!nrow(fragmentedFragmentData)){
			return(t(c(mzFound=as.numeric(as.character(mass)),
						formula=NA, mzCalc=NA)))
			}
			
			return(t(sapply(1:nrow(fragmentedFragmentData), function(f)
			{
			mzCalc <- fragmentedFragmentData$mass[f] - mode.charge * .emass 
			c(mzFound=as.numeric(as.character(mass)),
				formula=fragmentedFragmentData$formula[f], 
				mzCalc=mzCalc)
			})))
			
        })
      } else{
        peakmatrixSmall <- list()
      }
      
      # for bigger peaks use generate.formula from rcdk
      if(nrow(bigShots)){
        system.time(peakmatrixBig <- lapply(bigShots[,mzColname], function(mass){
          # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae
          # finally back-correct calculated masses for the charge
          mass.calc <- mass + mode.charge * .emass
          peakformula <- suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), 
                                                   limits, charge=0))
          if(length(peakformula)==0){
            return(t(c(mzFound=as.numeric(as.character(mass)),
                       formula=NA, mzCalc=NA)))
          }else{
            return(t(sapply(peakformula, function(f)
            {
              mzCalc <- f@mass - mode.charge * .emass 
              c(mzFound=as.numeric(as.character(mass)),
                formula=f@string, 
                mzCalc=mzCalc)
            })))
          }
        }))
      } else{
        peakmatrixBig <- list()
      }
      
      peakmatrix <- c(peakmatrixSmall,peakmatrixBig)
      
    } else{
			peakmatrix <- lapply(shot[,mzColname], function(mass){
			# Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae
			# finally back-correct calculated masses for the charge
			mass.calc <- mass + mode.charge * .emass
			peakformula <- suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), 
												   limits, charge=0))
			if(!is.list(peakformula) || length(peakformula)==0){
				return(t(c(mzFound=as.numeric(as.character(mass)),
						formula=NA, mzCalc=NA)))
			}else{
				return(t(sapply(peakformula, function(f)
				{
					mzCalc <- f@mass - mode.charge * .emass 
					c(mzFound=mass,
					formula=f@string, 
					mzCalc=mzCalc)
				})))
			}
        })
    }
    
    childPeaks <- as.data.frame(do.call(rbind, peakmatrix))
    childPeaks$mzFound <- as.numeric(as.character(childPeaks$mzFound))
    childPeaks$formula <- as.character(childPeaks$formula)
    childPeaks$mzCalc <- as.numeric(as.character(childPeaks$mzCalc))
    childPeaks$dppm <- (childPeaks$mzFound / childPeaks$mzCalc - 1) * 1e6
    # delete the NA data out again, because MolgenMsMs doesn't have them
    # here and they will be re-added later
    # (this is just left like this for "historical" reasons)
    childPeaks <- childPeaks[!is.na(childPeaks$formula),]
    # check if a peak was recognized (here for the first time,
    # otherwise the next command would fail)
    if(nrow(childPeaks)==0)
      return(list(specOK=FALSE))
    
    # now apply the rule-based filters to get rid of total junk:
    # dbe >= -0.5, dbe excess over mother cpd < 3
    childPeaks$dbe <- unlist(lapply(childPeaks$formula, dbe))
    #iff_rcdk_pM_eln$maxvalence <- unlist(lapply(diff_rcdk_pM_eln$formula.rcdk, maxvalence))
    childPeaks <- childPeaks[childPeaks$dbe >= filterSettings$dbeMinLimit,] 
    # & dbe < dbe_parent + 3)
    
    # check if a peak was recognized
    if(nrow(childPeaks)==0)
      return(list(specOK=FALSE))
    
    # trim mz to 5 digits
    shot[,mzColname] <- round(shot[,mzColname], 5)
    
    childPeaksInt <- merge(childPeaks, shot, by.x = "mzFound", by.y = mzColname, all.x = TRUE, all.y = FALSE )
    # find the best ppm value
    bestPpm <- aggregate(as.data.frame(childPeaksInt$dppm), list(childPeaksInt$mzFound),
                         function(dppm) dppm[[which.min(abs(dppm))]])
    colnames(bestPpm) <- c("mzFound", "dppmBest")
    childPeaksInt <- merge(childPeaksInt, bestPpm, by="mzFound", all.x=TRUE)
    # count formulas found per mass
    countFormulasTab <- xtabs( ~formula + mzFound, data=childPeaksInt)
    countFormulas <- colSums(countFormulasTab)
    childPeaksInt$formulaCount <- countFormulas[as.character(childPeaksInt$mzFound)]
    # filter results
    childPeaksFilt <- filterLowaccResults(childPeaksInt, filterMode, filterSettings)
    childPeaksGood <- childPeaksFilt[["TRUE"]]
    childPeaksBad <- childPeaksFilt[["FALSE"]]
    # count formulas within new limits
    # (the results of the "old" count stay in childPeaksInt and are returned
    # in $childPeaks)
    if(!is.null(childPeaksGood))
    {
      countFormulasTab <- xtabs( ~formula + mzFound, data=childPeaksGood)
      countFormulas <- colSums(countFormulasTab)
      childPeaksGood$formulaCount <- countFormulas[as.character(childPeaksGood$mzFound)]
    }
    
    childPeaksUnmatched <- merge(childPeaks, shot, by.x = "mzFound", by.y = mzColname, 
                                 all.x = TRUE, all.y = TRUE )
    childPeaksUnmatched$dppmBest <- NA
    childPeaksUnmatched$formulaCount <- 0
    childPeaksUnmatched$good <- FALSE
    childPeaksUnmatched <- childPeaksUnmatched[is.na(childPeaksUnmatched$mzCalc),]
    
    # return list:
    rl <- list(
      specOK = !is.null(childPeaksGood),
      parent = parentPeaks,
      childFilt = childPeaksGood,
      childRaw=shot_orig
    )
    # if "detail" is set to TRUE, return more detailed results including
    # all the deleted peaks and the stages when they were culled
    if(detail)
    {
      rl$childRawLow = shot_lo
      rl$childRawSatellite = shot_satellite
      rl$childRawOK = shot
      rl$child =childPeaksInt
      rl$childBad = childPeaksBad
      rl$childUnmatched = childPeaksUnmatched
    }
    return(rl)
  }
  shots <- lapply(msmsPeaks$peaks, analyzeTandemShot)
  #browser()
  shots <- mapply(function(shot, scan, info)
  {
    shot$scan <- scan
    shot$info <- info
    shot$header <- msmsPeaks$childHeaders[as.character(scan),]
    return(shot)
  }, shots, msmsPeaks$childScans, spectraList, SIMPLIFY=FALSE)
  
  mzranges <- t(sapply(shots, function(p) {
    if(!is.null(p$childRaw)){
      return(range(p$childRaw[,mzColname]))
    } else {
      return(c(NA,NA))
    }
  }))
  
  mzmin <- min(mzranges[,1], na.rm=TRUE)
  mzmax <- max(mzranges[,2], na.rm=TRUE)
  
  return(list(
    msmsdata=shots,
    mzrange=c(mzmin, mzmax),
    id=msmsPeaks$id,
    mode=mode,
    parentHeader = msmsPeaks$parentHeader,
    parentMs = msmsPeaks$parentPeak,
    formula = msmsPeaks$formula,
    foundOK = TRUE))
}

findPeaksInFracData <- function(mass, fracData, fracDataIndex){
  
}
Any scripts or data that you put into this service are public.
RMassBank documentation built on Nov. 8, 2020, 6:06 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
RMassBank
Workflow to process tandem MS files and build MassBank records

R/alternateAnalyze.R
In RMassBank: Workflow to process tandem MS files and build MassBank records

Defines functions findPeaksInFracData analyzeMsMs.formula.optimized analyzeMsMs.optimized newStep2WorkFlow is.sub.formula fragDataIndexing

Try the RMassBank package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

RMassBank Workflow to process tandem MS files and build MassBank records

R/alternateAnalyze.R In RMassBank: Workflow to process tandem MS files and build MassBank records

Defines functions findPeaksInFracData analyzeMsMs.formula.optimized analyzeMsMs.optimized newStep2WorkFlow is.sub.formula fragDataIndexing

Try the RMassBank package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

RMassBank
Workflow to process tandem MS files and build MassBank records

R/alternateAnalyze.R
In RMassBank: Workflow to process tandem MS files and build MassBank records