R/OutlierFind.R

Defines functions outlierFind outlierFind_i

Documented in outlierFind

# service function for outlierFind to identify outliers in a matrix
# using both scoresMod and boxplotMod
#
# @param     i   protein i
# @param     protClass a matrix of protein, peptide identifiers and normalized
#                        specific amounts
# @param     outlierLevel peptide for outlier spectra within peptides, or
#                          protein for outlier peptides within proteins`
# @param     numRefCols number of columns before Mass Spectrometry data columns
# @param     numDataCols how many columns in MS data
# @param     outlierMeth boxplot (recommended), scores, or none
# @param     range the range parameter used for identifying outliers
# @param     proba probability to exclude outler for scores method
# @param     eps value to add before log2 transfromations (to avoid taking log of zero)
#' @importFrom graphics boxplot
#' @importFrom outliers scores
#' @importFrom stats runif
#' @import snow
# @param     randomError  TRUE if allow it to be random
# @return indicators of outlier peptides or outlier spectra for a protein or peptide
# @export

outlierFind_i <- function(i, protClass, outlierLevel="peptide",
                          numRefCols, numDataCols, outlierMeth,
                          range, proba, eps=eps, randomError) {

  # i=9
  # ================================================================
  # boxplotMod to identify outliers in a vector
  # Outliers that are 3 times the interquartile range from either
  # quartile; note that the default for boxplots is 1.5 times the
  # interquartile range
  #
  # @param             x: a vector of numbers
  # @param  reject.vec.i: pre-specified rejection locations
  # @param         range: pre-specified times of IQR range
  #
  #================================================================

  boxplotMod <- function(x, range = range) {
    #x[reject.vec] <- NA
    outlier.x <- boxplot(x, plot = FALSE, range = range)$out
    outlier.ind <- (x %in% outlier.x)

    return(outlier.ind*1)
  }

  #================================================================
  # scoresMod to identify outliers in a vector
  # Outliers as exceeding the normal probability of proba
  #
  # @param             x: a vector of numbers
  # @param  reject.vec.i: pre-specified rejection locations
  # @param         proba: pre-specified normal probability
  #
  #================================================================

  scoresMod <- function(x, reject.vec = NULL, proba = proba) {
    x[reject.vec] <- NA
    non.na.indices <- (seq_len(length(x)))[!is.na(x)]
    ind <- scores(x[!is.na(x)], prob = proba)
    ind[is.na(ind)] <- FALSE
    outlier.ind <- rep(FALSE, length(x))
    outlier.ind[non.na.indices] <- ind

    return(outlier.ind*1)
  }

  if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
  if (outlierLevel=="protein") uniqueLabel <- protClass$protId
  protClass.i <- protClass[uniqueLabel == i,]
  nProt <- nrow(protClass.i)  #(or number of peptides)
  nczfMatLog.i <- log2(protClass.i[,(numRefCols+1):(numRefCols+numDataCols)] + eps)
  if (nProt > 1) {
    proteinsAllData <- nczfMatLog.i
    n.spectra <- nrow(proteinsAllData)

    outIndMat.i <- data.frame(matrix(NA, ncol=numDataCols, nrow=n.spectra))
    #out.boxplotMod.i <- outIndMat.i
    names(outIndMat.i) <- names(nczfMatLog.i)
    for (j in seq_len(numDataCols)) {
      # j=1
      xx <- as.numeric(proteinsAllData[,j])
      pp <- 2^xx - eps  # convert back to original scale
      # add uniform (0, eps) random number, if "randomError=TRUE"
      if (randomError==TRUE) {
        n.pp <- length(pp)
        eps.random <- runif(n=n.pp, min=0, max=eps)*(pp < 10^(-5))
        pp.r <- pp + eps.random
        xx <- log2(pp.r + eps)
      }

      # identify outliers that are 3 times the interquartile range from either quartile
      # note that the default for boxplots is 1.5 times the interquartile range


      if (outlierMeth=="boxplot") outIndMat.i[,j] <- boxplotMod(xx, range=range)
      if (outlierMeth=="scores") outIndMat.i[,j] <- scoresMod(xx, proba=proba)
      if (outlierMeth=="none") outIndMat.i[,j] <- 0*length(xx)
    }
    outlier.num <- apply(outIndMat.i, 1, sum)

  }
  if (nProt == 1) {
    outlier.num <- 0
  }
  if (outlierLevel=="peptide") outlier.num.spectra <- outlier.num
  if (outlierLevel=="protein") outlier.num.peptides <- outlier.num
  if (nProt >= 1) {
    if (outlierLevel=="peptide") result <- cbind(protClass.i[,seq_len(numRefCols)],outlier.num.spectra,
                       protClass.i[,(numRefCols+1):(numRefCols+numDataCols + 2)])   # include protId and pepId
    if (outlierLevel=="protein") result <- cbind(protClass.i[,seq_len(numRefCols)],outlier.num.peptides,
                       protClass.i[,(numRefCols+1):(numRefCols+numDataCols + 4)])   # include protId and pepId
  }
  if (nProt == 0) result <- NULL
  names(result)[1] <- "prot"
  return(result)
}



# ================================================================
#' outlierFind to identify outliers in a matrix
#' using both scoresMod and boxplotMod
#'
#' @param     protClass a matrix of protein, peptide identifiers and normalized
#'                        specific amounts
#' @param     outlierLevel peptide for outlier spectra within peptides, or
#'                          protein for outlier peptides within proteins`
#' @param     numRefCols number of columns before Mass Spectrometry data columns
#' @param     numDataCols how many columns in MS data
#' @param     outlierMeth boxplot (recommended), scores, or none
#' @param     range the range parameter used for identifying outliers
#' @param     proba probability to exclude outler for scores method
#' @param     eps value to add before log2 transfromations (to avoid taking log of zero)
#' @param     cpus  number of cpus to use for parallel processing
#' @param     randomError  T if allow it to be random
#' @return    additional column of indicators of outlier peptides or outlier
#'            spectra for a set of proteins or peptides
#' @examples
#' set.seed(17356)
#' eps <- 0.029885209
#' data(TLN1_test)
#' flagSpectraBox <- outlierFind(protClass=TLN1_test,
#'                               outlierLevel="peptide", numRefCols=5, numDataCols=9,
#'                               outlierMeth="boxplot", range=3, eps=eps,
#'                               cpus=1, randomError=TRUE)
#' # examine numbers of spectra that are outliers
#' table(flagSpectraBox$outlier.num.spectra)
#' @importFrom snowfall sfInit
#' @importFrom snowfall sfLibrary
#' @importFrom snowfall sfExport
#' @importFrom snowfall sfLapply
#' @importFrom snowfall sfStop
#' @export    outlierFind

outlierFind <- function(protClass, outlierLevel="peptide",
                        numRefCols=5, numDataCols=9, outlierMeth="boxplot", range=3,
                        proba=0.99, eps=eps, cpus=4, randomError=TRUE) {
  if (cpus > 1) {
  #library(snowfall)
  #sfInit(parallel=TRUE, cpus=10)
  sfInit(parallel=TRUE, cpus=cpus)
  #sfLibrary(snowfall)
  #sfLibrary("protsummarize")
  #sfExport("boxplotMod")
  #sfExport("protsCombineC")
  #sfExport("protsMatLabels")
  #sfExport("uniqueLabel")
  #sfExport("scoresMod")
  sfExport("numDataCols")
  sfExport("numRefCols")
  sfExport("eps")


  #sfLibrary(outliers)

  if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
  if (outlierLevel=="protein") uniqueLabel <- protClass$protId

  indList <- unique(uniqueLabel)
  system.time(result <- sfLapply(indList, outlierFind_i, protClass=protClass,
                            outlierLevel=outlierLevel, numRefCols=numRefCols, numDataCols=numDataCols,
                            outlierMeth=outlierMeth, range=range, proba=proba,
                            eps=eps, randomError=randomError))                    # but this works fine!

  sfStop()
  }
  if (cpus==1) {
    if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
    if (outlierLevel=="protein") uniqueLabel <- protClass$protId

    indList <- unique(uniqueLabel)
    system.time(result <- lapply(indList, outlierFind_i, protClass=protClass,
                                   outlierLevel=outlierLevel, numRefCols=numRefCols, numDataCols=numDataCols,
                                   outlierMeth=outlierMeth, range=range, proba=proba,
                                   eps=eps, randomError=randomError))
  }
  #browser()
  protsCombineCnew <- do.call(what="rbind", result)  # convert list of matrices to one matrix
  protsCombineCnew
}
mooredf22/protlocassign0p1p1 documentation built on Feb. 7, 2022, 1:55 a.m.