R/averaging.R

Defines functions snrFilter averageCluster rsde stde clustering performHc group1d

# msPurity R package for processing MS/MS data - Copyright (C)
#
# This file is part of msPurity.
#
# msPurity is a free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# msPurity is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with msPurity.  If not, see <https://www.gnu.org/licenses/>.

group1d <- function(v, thr){

  v <- sort(v)
  a <- ''
  cl <-  1
  c <- 1

  for(i in 1:(length(v)-1)){
    a[i] <- (1e6*(v[i+1]-v[i]))/v[i]

    if(as.numeric(a[i])<thr){
      cl[i+1] <- c
    }else{
      c <- c+1
      cl[i+1] <- c
    }

  }
  return(cl)
}

performHc <- function(mzs, ppm){

  if(length(mzs)==1){return(1)}

  mdist = stats::dist(mzs)
  averageMzPair = stats::as.dist(outer(mzs, mzs, "+")/2)
  relativeErrors = averageMzPair * 0.000001
  m_massTolerance = mdist / relativeErrors
  clh <- fastcluster::hclust(m_massTolerance)
  # cut a desired level
  cl <- stats::cutree(clh, h = ppm)
  return(cl)
}


clustering <- function(mz, ppm=1.5, clustType="hc", cores=2){
  #print("Performing clustering/grouping")
  # Sort out the cores to use

  # perform clustering
  if(clustType=="hc"){
    #print("hc clustering")
    if (cores>1) {
      clust<-parallel::makeCluster(cores)
      doSNOW::registerDoSNOW(clust)
      parallelBool = TRUE

    } else {
      parallelBool = FALSE

    }

    mzsplit <- split(mz, ceiling(seq_along(mz)/5000))

    # Perform averaging on multiple single runs (1 file) or
    # multi-core (averaging scans in 1 file)
    clustlist  <- plyr::llply(mzsplit,
                              .parallel = parallelBool,
                              performHc, # FUNCTION
                              ppm=ppm)


    for (i in 1:length(clustlist)){
      if(i>1){
        clustlist[[i]] <- clustlist[[i]]+plus
      }
      plus <- max(clustlist[[i]])
    }

    cl <- unlist(clustlist)

    #     # stop any open connections
    #     if (cores>1) {
    #       parallel::stopCluster(clust)
    #       closeAllConnections()
    #     }

    return(cl)

  }else if (clustType=="simple"){
    # order by mz
    #print("group1d")
    return(group1d(mz, ppm))
  }


}

stde <- function(x) sd(x)/sqrt(length(x))
rsde <- function(x) (sd(x)/mean(x))*100


averageCluster <- function(x, av="median", minnum=1,
                           missingV="ignore", totalScans, normTIC, sumI=FALSE){
  # Filter out any that do not match the minimum
  if(nrow(x)<minnum){
    return(NULL)
  }


  # Get rsd (if normalise flagged then use the normalised intensity)
  if(normTIC){
    rsdRes <- (sd(x$inorm)/mean(x$inorm))*100
  }else{
    rsdRes <- rsde(x$i)

  }

  # Calc average first of the mz. We don't want to mess around with
  # missing values for this stage
  if(av=="median"){
    mz <- stats::median(x$mz)
  }else{
    mz <- base::mean(x$mz)
  }

  # Zero any missing values for intensity and SNR (this is the thermo approach,
  # but might not be best approach)
  if(missingV=="zero"){
    toadd <- totalScans-nrow(x)
    if(toadd>0){
      for(i in 1:toadd){
        x <- rbind(x, c(mz,0,0,unique(x$cl)))
      }

    }
  }

  # Finally calculate the averaged SNR and intensity
  if(av=="median"){
    i <- median(x$i)
    snr <- median(x$snr)
    inorm <- median(x$inorm)
    purity <- median(x$inPurity)
    ra <- median(x$ra)
  }else{
    i <- mean(x$i)
    snr <- mean(x$snr)
    inorm <- mean(x$inorm)
    purity <- mean(x$inPurity)
    ra <- mean(x$ra)
  }

  if (sumI){
    i <- sum(x$i)
  }



  return(c("mz" = mz, "i" = i, "snr" = snr, "rsd" = rsdRes,
           "inorm" = inorm, 'count' =length(x$i), 'total'=totalScans, 'inPurity'=purity, 'ra'=ra))

}

snrFilter <- function(x, snthr, snMeth){
  # If snMethod is "precalc" then the SNR does not need to
  # be calculated required
  if (snMeth=="median"){
    x$snr <- x$i/median(x$i)
  }else if(snMeth=="mean"){
    x$snr <- x$i/mean(x$i)
  }

  x <- x[x$snr>snthr, ]

  return(x)

}
computational-metabolomics/msPurity documentation built on Sept. 8, 2023, 8:04 p.m.