# service function for outlierFind
outlierFind.i <- function(i, protClass, outlierLevel="peptide",
numRefCols=numRefCols, numDataCols=numDataCols, outlierMeth,
range=range, proba=proba, eps=eps, randomError=randomError) {
# find outliers for group i (might be a gene or a gene-sequence)
# for (i in 1:10) {
# i=9
#'================================================================
#' boxplotMod to identify outliers in a vector
#' Outliers that are 3 times the interquartile range from either
#' quartile; note that the default for boxplots is 1.5 times the
#' interquartile range
#'
#' @param x: a vector of numbers
#' @param reject.vec.i: pre-specified rejection locations
#' @param range: pre-specified times of IQR range
#' @output : TRUE (outliers) or FALSE (keeping)
#'================================================================
boxplotMod <- function(x, range = range) {
#x[reject.vec] <- NA
outlier.x <- boxplot(x, plot = F, range = range)$out
outlier.ind <- (x %in% outlier.x)
return(outlier.ind*1)
}
#'================================================================
#' scoresMod to identify outliers in a vector
#' Outliers as exceeding the normal probability of proba
#'
#' @param x: a vector of numbers
#' @param reject.vec.i: pre-specified rejection locations
#' @param proba: pre-specified normal probability
#' @output : TRUE (outliers) or FALSE (keeping)
#' @note : when sd(x)=0, scores() generates NA, mark
#' NA as FALSE
#'================================================================
scoresMod <- function(x, reject.vec = NULL, proba = proba) {
x[reject.vec] <- NA
non.na.indices <- (1:length(x))[!is.na(x)]
ind <- scores(x[!is.na(x)], prob = proba)
ind[is.na(ind)] <- F
outlier.ind <- rep(F, length(x))
outlier.ind[non.na.indices] <- ind
return(outlier.ind*1)
}
if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
if (outlierLevel=="protein") uniqueLabel <- protClass$protId
protClass.i <- protClass[uniqueLabel == i,]
nProt <- nrow(protClass.i) #(or number of peptides)
nczfMatLog.i <- log2(protClass.i[,(numRefCols+1):(numRefCols+numDataCols)] + eps)
if (nProt > 1) {
proteinsAllData <- nczfMatLog.i
n.spectra <- nrow(proteinsAllData)
outIndMat.i <- data.frame(matrix(NA, ncol=numDataCols, nrow=n.spectra))
#out.boxplotMod.i <- outIndMat.i
names(outIndMat.i) <- names(nczfMatLog.i)
for (j in 1:numDataCols) {
# j=1
xx <- as.numeric(proteinsAllData[,j])
pp <- 2^xx - eps # convert back to original scale
# add uniform (0, eps) random number, if "randomError=T"
if (randomError==T) {
n.pp <- length(pp)
eps.random <- runif(n=n.pp, min=0, max=eps)*(pp < 10^(-5))
pp.r <- pp + eps.random
xx <- log2(pp.r + eps)
}
# identify outliers that are 3 times the interquartile range from either quartile
# note that the default for boxplots is 1.5 times the interquartile range
#box.xx <- boxplot(xx, plot=F, range=3)
#outlier.xx <- box.xx$out
#ind.out.xx <- {xx %in% outlier.xx}
#outIndMat.i[,j] <- as.numeric(ind.out.xx)
if (outlierMeth=="boxplot") outIndMat.i[,j] <- boxplotMod(xx, range=range)
if (outlierMeth=="scores") outIndMat.i[,j] <- scoresMod(xx, proba=proba)
if (outlierMeth=="none") outIndMat.i[,j] <- 0*length(xx)
}
outlier.num <- apply(outIndMat.i, 1, sum)
}
if (nProt == 1) {
outlier.num <- 0
}
if (outlierLevel=="peptide") outlier.num.spectra <- outlier.num
if (outlierLevel=="protein") outlier.num.peptides <- outlier.num
if (nProt >= 1) {
if (outlierLevel=="peptide") result <- cbind(protClass.i[,1:numRefCols],outlier.num.spectra,
protClass.i[,(numRefCols+1):(numRefCols+numDataCols + 2)]) # include protId and pepId
if (outlierLevel=="protein") result <- cbind(protClass.i[,1:numRefCols],outlier.num.peptides,
protClass.i[,(numRefCols+1):(numRefCols+numDataCols + 4)]) # include protId and pepId
}
if (nProt == 0) result <- NULL
names(result)[1] <- "prot"
return(result)
}
#'================================================================
#' OutlierFind to identify outliers in a matrix
#' using both scoresMod and boxplotMod
#'
#' @param protClass: a matrix of protein, peptide identifiers and normalized
#' specific amounts
#' @param outlierLevel: peptide for outlier spectra within peptides, or
#' protein for outlier peptides within proteins`
#' @param numRefCols: number of columns before Mass Spectrometry data columns
#' @param numDataCols: how many columns in MS data
#' @param outlierMeth: boxplot (recommended), scores, or none
#' @param range: the range parameter used for identifying outliers
#' @param eps: value to add before log2 transfromations (to avoid taking log of zero)
#' @param cpus: number of cpus to use for parallel processing
#' @output : Input.Data plus outlier indicators
#'================================================================
outlierFind <- function(protClass, outlierLevel="peptide",
numRefCols=5, numDataCols=9, outlierMeth="boxplot", range=3,
proba=0.99, eps=eps, cpus=4, randomError=T) {
if (cpus > 1) {
library(snowfall)
#sfInit(parallel=T, cpus=10)
sfInit(parallel=T, cpus=cpus)
sfLibrary(snowfall)
#sfLibrary("protsummarize")
#sfExport("boxplotMod")
#sfExport("protsCombineC")
#sfExport("protsMatLabels")
#sfExport("uniqueLabel")
#sfExport("scoresMod")
sfExport("numDataCols")
sfExport("numRefCols")
sfExport("eps")
sfLibrary(outliers)
#system.time(result <- sfSapply(indList, outlierFind, simplify=T)) # gives error message about "negative index"
if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
if (outlierLevel=="protein") uniqueLabel <- protClass$protId
#if (outlierLevel=="peptide") uniqueLabel <- protClass$peptide
#if (outlierLevel=="protein") uniqueLabel <- protClass$prot
indList <- unique(uniqueLabel)
system.time(result <- sfLapply(indList, outlierFind.i, protClass=protClass,
outlierLevel=outlierLevel, numRefCols=numRefCols, numDataCols=numDataCols,
outlierMeth=outlierMeth, range=range, proba=proba,
eps=eps, randomError=randomError)) # but this works fine!
sfStop()
}
if (cpus==1) {
if (outlierLevel=="peptide") uniqueLabel <- protClass$pepId
if (outlierLevel=="protein") uniqueLabel <- protClass$protId
#if (outlierLevel=="peptide") uniqueLabel <- protClass$peptide
#if (outlierLevel=="protein") uniqueLabel <- protClass$prot
indList <- unique(uniqueLabel)
system.time(result <- lapply(indList, outlierFind.i, protClass=protClass,
outlierLevel=outlierLevel, numRefCols=numRefCols, numDataCols=numDataCols,
outlierMeth=outlierMeth, range=range, proba=proba,
eps=eps, randomError=randomError))
}
#browser()
protsCombineCnew <- do.call(what="rbind", result) # convert list of matrices to one matrix
protsCombineCnew
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.