R/OutlierFind.R

Defines functions outlierFind outlierFind.i

Documented in outlierFind outlierFind.i

# service function for outlierFind
outlierFind.i <- function(i, protClass, outlierLevel="peptide",
                          numRefCols=numRefCols, numDataCols=numDataCols, outlierMeth,
                          range=range, proba=proba, eps=eps, randomError=randomError) {
  # find outliers for group i (might be a gene or a gene-sequence)
  # for (i in 1:10) {

  # i=9
  #'================================================================
  #' boxplotMod to identify outliers in a vector
  #' Outliers that are 3 times the interquartile range from either
  #' quartile; note that the default for boxplots is 1.5 times the
  #' interquartile range
  #'
  #' @param             x: a vector of numbers
  #' @param  reject.vec.i: pre-specified rejection locations
  #' @param         range: pre-specified times of IQR range
  #' @output             : TRUE (outliers) or FALSE (keeping)
  #'================================================================

  boxplotMod <- function(x, range = range) {
    #x[reject.vec] <- NA
    outlier.x <- boxplot(x, plot = F, range = range)$out
    outlier.ind <- (x %in% outlier.x)

    return(outlier.ind*1)
  }

  #'================================================================
  #' scoresMod to identify outliers in a vector
  #' Outliers as exceeding the normal probability of proba
  #'
  #' @param             x: a vector of numbers
  #' @param  reject.vec.i: pre-specified rejection locations
  #' @param         proba: pre-specified normal probability
  #' @output             : TRUE (outliers) or FALSE (keeping)
  #' @note               : when sd(x)=0, scores() generates NA, mark
  #'                       NA as FALSE
  #'================================================================

  scoresMod <- function(x, reject.vec = NULL, proba = proba) {
    x[reject.vec] <- NA
    non.na.indices <- (1:length(x))[!is.na(x)]
    ind <- scores(x[!is.na(x)], prob = proba)
    ind[is.na(ind)] <- F
    outlier.ind <- rep(F, length(x))
    outlier.ind[non.na.indices] <- ind

    return(outlier.ind*1)
  }

  if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
  if (outlierLevel=="protein") uniqueLabel <- protClass$protId
  protClass.i <- protClass[uniqueLabel == i,]
  nProt <- nrow(protClass.i)  #(or number of peptides)
  nczfMatLog.i <- log2(protClass.i[,(numRefCols+1):(numRefCols+numDataCols)] + eps)
  if (nProt > 1) {
    proteinsAllData <- nczfMatLog.i
    n.spectra <- nrow(proteinsAllData)

    outIndMat.i <- data.frame(matrix(NA, ncol=numDataCols, nrow=n.spectra))
    #out.boxplotMod.i <- outIndMat.i
    names(outIndMat.i) <- names(nczfMatLog.i)
    for (j in 1:numDataCols) {
      # j=1
      xx <- as.numeric(proteinsAllData[,j])
      pp <- 2^xx - eps  # convert back to original scale
      # add uniform (0, eps) random number, if "randomError=T"
      if (randomError==T) {
        n.pp <- length(pp)
        eps.random <- runif(n=n.pp, min=0, max=eps)*(pp < 10^(-5))
        pp.r <- pp + eps.random
        xx <- log2(pp.r + eps)
      }

      # identify outliers that are 3 times the interquartile range from either quartile
      # note that the default for boxplots is 1.5 times the interquartile range

      #box.xx <- boxplot(xx, plot=F, range=3)
      #outlier.xx <- box.xx$out
      #ind.out.xx <- {xx %in% outlier.xx}
      #outIndMat.i[,j] <- as.numeric(ind.out.xx)
      if (outlierMeth=="boxplot") outIndMat.i[,j] <- boxplotMod(xx, range=range)
      if (outlierMeth=="scores") outIndMat.i[,j] <- scoresMod(xx, proba=proba)
      if (outlierMeth=="none") outIndMat.i[,j] <- 0*length(xx)
    }
    outlier.num <- apply(outIndMat.i, 1, sum)

  }
  if (nProt == 1) {
    outlier.num <- 0
  }
  if (outlierLevel=="peptide") outlier.num.spectra <- outlier.num
  if (outlierLevel=="protein") outlier.num.peptides <- outlier.num
  if (nProt >= 1) {
    if (outlierLevel=="peptide") result <- cbind(protClass.i[,1:numRefCols],outlier.num.spectra,
                       protClass.i[,(numRefCols+1):(numRefCols+numDataCols + 2)])   # include protId and pepId
    if (outlierLevel=="protein") result <- cbind(protClass.i[,1:numRefCols],outlier.num.peptides,
                       protClass.i[,(numRefCols+1):(numRefCols+numDataCols + 4)])   # include protId and pepId
  }
  if (nProt == 0) result <- NULL
  names(result)[1] <- "prot"
  return(result)
}



#'================================================================
#' OutlierFind to identify outliers in a matrix
#' using both scoresMod and boxplotMod
#'
#' @param     protClass: a matrix of protein, peptide identifiers and normalized
#'                        specific amounts
#' @param     outlierLevel: peptide for outlier spectra within peptides, or
#'                          protein for outlier peptides within proteins`
#' @param     numRefCols: number of columns before Mass Spectrometry data columns
#' @param     numDataCols: how many columns in MS data
#' @param     outlierMeth: boxplot (recommended), scores, or none
#' @param     range: the range parameter used for identifying outliers
#' @param     eps: value to add before log2 transfromations (to avoid taking log of zero)
#' @param     cpus:  number of cpus to use for parallel processing
#' @output             : Input.Data plus outlier indicators
#'================================================================
outlierFind <- function(protClass, outlierLevel="peptide",
                        numRefCols=5, numDataCols=9, outlierMeth="boxplot", range=3,
                        proba=0.99, eps=eps, cpus=4, randomError=T) {
  if (cpus > 1) {
  library(snowfall)
  #sfInit(parallel=T, cpus=10)
  sfInit(parallel=T, cpus=cpus)
  sfLibrary(snowfall)
  #sfLibrary("protsummarize")
  #sfExport("boxplotMod")
  #sfExport("protsCombineC")
  #sfExport("protsMatLabels")
  #sfExport("uniqueLabel")
  #sfExport("scoresMod")
  sfExport("numDataCols")
  sfExport("numRefCols")
  sfExport("eps")


  sfLibrary(outliers)
  #system.time(result <- sfSapply(indList, outlierFind, simplify=T))    # gives error message about "negative index"
  if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
  if (outlierLevel=="protein") uniqueLabel <- protClass$protId
  #if (outlierLevel=="peptide") uniqueLabel <- protClass$peptide
  #if (outlierLevel=="protein") uniqueLabel <- protClass$prot
  indList <- unique(uniqueLabel)
  system.time(result <- sfLapply(indList, outlierFind.i, protClass=protClass,
                            outlierLevel=outlierLevel, numRefCols=numRefCols, numDataCols=numDataCols,
                            outlierMeth=outlierMeth, range=range, proba=proba,
                            eps=eps, randomError=randomError))                    # but this works fine!

  sfStop()
  }
  if (cpus==1) {
    if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
    if (outlierLevel=="protein") uniqueLabel <- protClass$protId
    #if (outlierLevel=="peptide") uniqueLabel <- protClass$peptide
    #if (outlierLevel=="protein") uniqueLabel <- protClass$prot
    indList <- unique(uniqueLabel)
    system.time(result <- lapply(indList, outlierFind.i, protClass=protClass,
                                   outlierLevel=outlierLevel, numRefCols=numRefCols, numDataCols=numDataCols,
                                   outlierMeth=outlierMeth, range=range, proba=proba,
                                   eps=eps, randomError=randomError))
  }
  #browser()
  protsCombineCnew <- do.call(what="rbind", result)  # convert list of matrices to one matrix
  protsCombineCnew
}
mooredf22/protsummarize2 documentation built on May 16, 2021, 10:12 p.m.