# find mean profiles for proteins, or for protein-peptide combinations
# requires "snowfall" package for multi-core processing
meansByProteins <- function(i, protsCombineCnew, numRefCols, numDataCols,
GroupBy, eps, outlierExclude) {
# outlierExclude:
# none: don't exclude any outlers
# spectra: exclude only spectra within peptides
# spectraAndPeptide: exclude spectra-within-peptide outliers and peptide outliers
# i=217
# i=2
protId <- protsCombineCnew$protId
pepId <- protsCombineCnew$pepId
#uniqueLabel <- protsCombineCnew$uniqueLabel
#if (GroupBy == "protId") protData.i <- protsCombineCnew[protId == i,]
#if (GroupBy == "peptideId") protData.i <- protsCombineCnew[pepId == i,]
if (GroupBy == "protId") uniqueLabel <- protId
if (GroupBy == "peptideId") uniqueLabel <- pepId
protData.i <- protsCombineCnew[uniqueLabel == i, ]
prot.i <- protData.i$prot[1]
protId.i <- protData.i$protId[1]
pep.i <- protData.i$peptide[1]
pepId.i <- protData.i$pepId[1]
#pep.i <- protData.i$peptide[1]
#protId.i <-
#if (nrow(geneData.i) > 2) {
#profileAll.i <- geneData.i[,numRefCols + c(1:7,9)] # log2 transformed profiles + geneSeqID # for proteins
profileAll.i.x <- cbind(protData.i[,numRefCols + c(1:numDataCols)], protData.i$protId, protData.i$pepId)
names(profileAll.i.x)[ numDataCols + 1] <- "protId"
names(profileAll.i.x)[ numDataCols + 2] <- "pepId"
profileAll.i <- profileAll.i.x
profileAll.i[,1:numDataCols] <- log2(profileAll.i.x[,1:numDataCols] + eps) # transform to log2 scale
# identify and eliminate outliers
#use.i <- {protData.i$outCountBoxPlot == 0}
if (outlierExclude == "none") {
use.i <- rep(T, nrow(protData.i))
}
if (outlierExclude == "spectra") {
use.i <- {{protData.i$outlier.num.spectra == 0} & {!is.na(protData.i$outlier.num.spectra)}}
}
if (outlierExclude == "spectraAndPeptide") {
use.i <- {{protData.i$outlier.num.spectra == 0} & {!is.na(protData.i$outlier.num.spectra)} &
{protData.i$outlier.num.peptides == 0} & {!is.na(protData.i$outlier.num.peptides)} }
}
profileXX.i <- profileAll.i[use.i,]
Nspectra <- nrow(profileXX.i)
if (Nspectra == 0) {
coef.est <- rep(NA, numDataCols)
secoef.est <- rep(NA, numDataCols)
Npep <- 0
result.i <- c(coef.est, secoef.est, Nspectra, Npep, protId.i, pepId.i)
return(result.i)
}
Npep <- length(unique(profileXX.i$pepId))
# need at least 4 spectra and 3 unique peptides
lmerGood <- {{Nspectra >= 4} & {Npep >= 3}}
if (Nspectra == Npep) lmerGood <- F # don't do if one seq per spectrum
if (GroupBy == "peptideId") lmerGood <- F # makes no sense if by peptide
if (lmerGood) {
coef.est <- rep(NA, numDataCols)
secoef.est <- rep(NA, numDataCols)
for (k in 1:numDataCols) { # do for each fraction
#k=1
y.k <- profileXX.i[,k] # k'th column (fraction)
varOK <- {var(y.k) > 0.001}
#if (varOK) {
result.k <- try(lmer(y.k ~ 1 + (1 | pepId), data=profileXX.i))
try(coef.est[k] <- fixef(result.k))
try(secoef.est[k] <- sqrt(as.numeric(vcov(result.k))))
# }
if (!varOK | is.na(coef.est[k])) { # variance too small for lmer;
coef.est[k] <- mean(y.k)
secoef.est[k] <- 0.001
}
if (class(result.k) == "try-error") {
coef.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,mean))
secoef.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,sd))/sqrt(Nspectra)
if (secoef.est < 0.01) se.coef.est <- 0.01
}
}
}
if ({!lmerGood} & {Nspectra == 1}) {
coef.est <- as.numeric(profileXX.i[1,1:numDataCols])
secoef.est <- rep(NA, numDataCols)
}
if ({!lmerGood} & {Nspectra == 2}) {
coef.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,mean))
secoef.est <- rep(NA, numDataCols)
}
if ({!lmerGood} & {Nspectra >= 3} ) {
coef.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,mean))
secoef.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,sd))/sqrt(Nspectra)
coef.median.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,median))
}
# apply(profile99.i,2,mean) # test; should be similar to coef.est
raw.coef.orig <- 2^coef.est - eps
facAdj <- sum(raw.coef.orig) # adjustment factor to make things add to 1
raw.coef <- raw.coef.orig/facAdj
coef.est <- log2(raw.coef + eps)
# if GroupBy = "protId", prot.i is the protein number, and i is the same
# if GroupBy = "pepId", prot.i is the protein number, and i is the peptide number
result.i <- c(coef.est, secoef.est, Nspectra, Npep, protId.i, pepId.i)
return(result.i)
}
if (F) {
# only do this if you don't use multiprocessing
# outXX="box", numRefCols=4, n.chan=9, GroupBy = "protId",recoveryOnly=F,
eps=eps
result <- NULL
for (i in 1:n.prots) {
print(i)
result.i <- meansByProteins(i, protsCombineCnew=protsCombineCnew, numRefCols=numRefCols, numDataCols=numDataCols,
GroupBy=GroupBy, eps=eps, outlier.num=outlier.num)
#result.i <- meansByProteins(i, protsCombineCnew, outXX="box", numRefCols=4,
# n.chan=9, GroupBy="protId", eps=eps, outlier.num=outlier.num)
result <- rbind(result, result.i)
}
temp <- result
temp.df <- data.frame(temp)
#n.chan <- (numDataCols(temp.df) - 3)/2
names.channels <- names(protsCombineCnew)[numRefCols + c(1:n.chan)]
#if (dataUse == "ProgenesisRep") names.channels <- names(protsCombineCnew)[numRefCols + 1 + c(1:n.chan)]
names(temp.df)[ 1:n.chan] <- names.channels
names(temp.df)[(n.chan + 1):( n.chan + n.chan)] <- paste(names.channels, ".se", sep="")
names(temp.df)[ 2*n.chan + 1:3] <- c("Nspectra", "Npeptides", "ProtID")
protInd <- !duplicated(protsCombineCnew$prot)
protName <- protsCombineCnew$prot[protInd]
protProfileSummary <- data.frame(protName, temp.df)
protProfileSummary
}
#'================================================================
#' protProfileSummarize calculates mean and SE for each channel in
#' each prot using random effect model or arithmetic mean
#' random effect model can avoid dominance of a sequence with
#' too many spectra
#'
#' @param protsCombineCnew: a matrix of protein information and normalized specific amounts and
#' outlier information
#' @param numRefCols: number of columns before Mass
#' Spectrometry data columns
#' @param numDataCols: how many columns in MS data
#' @param eps: epsilon to avoid log(0)
#' @param GroupBy: "protId" if average by protein; "peptideId" if average by peptide
#' @param outlierExclude: "none", "spectra", or "spectraAndpeptide" (default) according to exclusion level
#' @param cpus: number of cpus to use for parallel processing
#' @output : mean or weighted mean normalized specific amount profiles
#'
#'================================================================
profileSummarize <- function(protsCombineCnew, numRefCols, numDataCols, refColsKeep=c(1,2),
eps, GroupBy="protId", outlierExclude="spectraAndPeptide", logDataOut=F, cpus=4) {
# outlierExclude:
# none: don't exclude any outlers
# spectra: (default) exclude only spectra within peptides
# spectraAndPeptide: exclude spectra-within-peptide outliers and peptide outliers
# # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # #
if (GroupBy == "protId") indList <- unique(protsCombineCnew$protId)
if (GroupBy == "peptideId") indList <- unique(protsCombineCnew$pepId)
if (!(outlierExclude %in% c("none", "spectra", "spectraAndPeptide"))) {
cat("outlierExclude must be one of none, spectra, or peptide\n")
stop()
}
if ((GroupBy == "peptideId") & (outlierExclude == "spectraAndPeptide")) {
cat("if GroupBy == \'peptideId\' then outlierExclude cannot equal \'spectraAndPeptide\' \n")
stop()
}
#indList <- unique(uniqueLabel)
n.prots <- length(indList) # either proteins or proteins/peptides
library(lme4)
# indList <- 1:16
library(snowfall)
sfInit(parallel=T, cpus=cpus)
sfExport("protsCombineCnew") # main data set
#sfExport("meansByProteins") # function
sfExport("GroupBy")
#sfExport("outlier")
sfExport("numDataCols")
sfExport("numRefCols")
sfLibrary(lme4)
system.time(result <- sfLapply(indList, meansByProteins, protsCombineCnew=protsCombineCnew, numRefCols=numRefCols, numDataCols=numDataCols,
GroupBy=GroupBy, eps=eps, outlierExclude=outlierExclude))
sfStop()
temp <- do.call(what="rbind", result) # convert list of matrices to one matrix
temp.df <- data.frame(temp)
names.channels <- names(protsCombineCnew)[numRefCols + c(1:numDataCols)]
names(temp.df)[ 1:numDataCols] <- names.channels
names(temp.df)[(numDataCols + 1):( numDataCols + numDataCols)] <- paste(names.channels, ".se", sep="")
names(temp.df)[ 2*numDataCols + 1:4] <- c("Nspectra", "Npep", "protId", "pepId")
if (GroupBy == "protId") refColsKeep <- 2 # only keep the "prot" column
index <- match(temp.df$pepId, protsCombineCnew$pepId) # indices of second data set
protsMini <- protsCombineCnew[index, refColsKeep]
if (F) {
protInd <- !duplicated(protsCombineCnew$prot)
pepInd <- !duplicated(protsCombineCnew$peptide)
protName <- protsCombineCnew$prot[protInd]
pepName <- protsCombineCnew$peptide[pepInd]
protIdShort <- protsCombineCnew$protId[protInd]
pepIdShort <- protsCombineCnew$pepId[pepInd]
# now set up the look-up table using row names
names(protIdShort) <- protName # this is a table, one for each protein
protNameLong <- names(protIdShort[temp.df$protId]) # protein names, one for each peptide
names(pepIdShort) <- pepName
pepNameLong <- names(pepIdShort[temp.df$pepId])
}
if (GroupBy == "protId") {
temp.df <- subset(temp.df, select=-c(protId, pepId)) # drop protId and pepId (only for GroupBy="protd")
protProfileSummaryRes <- data.frame(protsMini, temp.df) # "protsMini" is protein names
names(protProfileSummaryRes)[1] <- "prot"
}
if (GroupBy == "peptideId") {
#protProfileSummaryRes <- data.frame(protNameLong, pepNameLong,temp.df)
#names(protProfileSummaryRes)[1:2] <- c("prot", "peptide")
protProfileSummaryRes <- data.frame(protsMini,temp.df)
}
# protProfileSummaryRes
# }
#protProfileSummaryRes <- wrapup(result, GroupBy)
# for "Identity" (untransformed) results, drop the standard error terms, which don't apply
if (GroupBy == "protId") {
# note that first two columns are protein and peptide names
protProfileSummaryIdentity <- protProfileSummaryRes[,-((2 + numDataCols):(1 + 2*numDataCols))]
# now reverse the log2 transformation
protProfileSummaryIdentity[,2:(1 + numDataCols)] <- 2^protProfileSummaryRes[,c(2:(1 + numDataCols))] - eps
# now eliminate negative values due to roundoff error
negVals <- {{protProfileSummaryIdentity[,2:(1 + numDataCols)] < 0} & {protProfileSummaryIdentity[,2:(1 + numDataCols)] > -1E-10}}
protProfileSummaryIdentity[,2:(1 + numDataCols)][negVals] <- 0
}
if (GroupBy == "peptideId") {
nKeep <- length(refColsKeep)
# note that first column is protein name. (There is no peptide name here)
#protProfileSummaryIdentity <- protProfileSummaryRes[,-((2 + numDataCols):(1 + 2*numDataCols))]
# drop the standard errors of the mean profiles (9 columns)
protProfileSummaryIdentity <- protProfileSummaryRes[,-((nKeep+1 + numDataCols):(nKeep + 2*numDataCols))]
# now reverse the log2 transformation
protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)] <- 2^protProfileSummaryRes[,(nKeep+1):(nKeep + numDataCols)] - eps
# now eliminate negative values due to roundoff error
negVals <- {{protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)] < 0} & {protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)] > -1E-10}}
protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)][negVals] <- 0
}
protProfileSummaryIdentity
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.