R/profileSummarize.r

Defines functions profileSummarize meansByProteins

# find mean profiles for proteins, or for protein-peptide combinations
# requires "snowfall" package for multi-core processing

meansByProteins <- function(i, protsCombineCnew, numRefCols, numDataCols,
                            GroupBy, eps, outlierExclude) {
  # outlierExclude:
  #   none: don't exclude any outlers
  #   spectra:    exclude only spectra within peptides
  #   spectraAndPeptide:  exclude spectra-within-peptide outliers and peptide outliers
  # i=217
  # i=2

  protId <- protsCombineCnew$protId
  pepId <- protsCombineCnew$pepId
  #uniqueLabel <- protsCombineCnew$uniqueLabel
  #if (GroupBy == "protId") protData.i <- protsCombineCnew[protId == i,]
  #if (GroupBy == "peptideId") protData.i <- protsCombineCnew[pepId == i,]
  if (GroupBy == "protId") uniqueLabel <- protId
  if (GroupBy == "peptideId") uniqueLabel <- pepId
  protData.i <- protsCombineCnew[uniqueLabel == i, ]
  prot.i <- protData.i$prot[1]
  protId.i <- protData.i$protId[1]
  pep.i <- protData.i$peptide[1]
  pepId.i <- protData.i$pepId[1]
  #pep.i <- protData.i$peptide[1]
  #protId.i <-
  #if (nrow(geneData.i) > 2) {
  #profileAll.i <- geneData.i[,numRefCols + c(1:7,9)]  # log2 transformed profiles  + geneSeqID    # for proteins
  profileAll.i.x <- cbind(protData.i[,numRefCols + c(1:numDataCols)], protData.i$protId, protData.i$pepId)
  names(profileAll.i.x)[ numDataCols + 1] <- "protId"
  names(profileAll.i.x)[ numDataCols + 2] <- "pepId"
  profileAll.i <- profileAll.i.x
  profileAll.i[,1:numDataCols] <- log2(profileAll.i.x[,1:numDataCols] + eps)  # transform to log2 scale



   # identify and eliminate outliers
    #use.i <- {protData.i$outCountBoxPlot == 0}
    if (outlierExclude == "none") {
      use.i <- rep(T, nrow(protData.i))
    }
  if (outlierExclude == "spectra") {
    use.i <- {{protData.i$outlier.num.spectra == 0} & {!is.na(protData.i$outlier.num.spectra)}}
  }
  if (outlierExclude == "spectraAndPeptide") {
    use.i <- {{protData.i$outlier.num.spectra == 0} & {!is.na(protData.i$outlier.num.spectra)} &
              {protData.i$outlier.num.peptides == 0} & {!is.na(protData.i$outlier.num.peptides)} }
  }
    profileXX.i <- profileAll.i[use.i,]


  Nspectra <- nrow(profileXX.i)

  if (Nspectra == 0) {
    coef.est <- rep(NA, numDataCols)
    secoef.est <- rep(NA, numDataCols)
    Npep <- 0
    result.i <- c(coef.est, secoef.est, Nspectra, Npep, protId.i, pepId.i)
    return(result.i)
  }
  Npep <- length(unique(profileXX.i$pepId))

  # need at least 4 spectra and 3 unique peptides
  lmerGood <- {{Nspectra >= 4} & {Npep >= 3}}
  if (Nspectra == Npep) lmerGood <- F   # don't do if one seq per spectrum
  if (GroupBy == "peptideId")    lmerGood <- F   # makes no sense if by peptide

  if (lmerGood)   {
    coef.est <- rep(NA, numDataCols)
    secoef.est <- rep(NA, numDataCols)
    for (k in 1:numDataCols) {  # do for each fraction
      #k=1
      y.k <- profileXX.i[,k]  # k'th column (fraction)

      varOK <- {var(y.k) > 0.001}
      #if (varOK) {
      result.k <- try(lmer(y.k ~ 1 + (1 | pepId), data=profileXX.i))

      try(coef.est[k] <- fixef(result.k))
      try(secoef.est[k] <- sqrt(as.numeric(vcov(result.k))))
      #  }
      if (!varOK | is.na(coef.est[k])) { # variance too small for lmer;
        coef.est[k] <- mean(y.k)

        secoef.est[k] <- 0.001
      }

      if (class(result.k) == "try-error") {
        coef.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,mean))
        secoef.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,sd))/sqrt(Nspectra)
        if (secoef.est < 0.01) se.coef.est <- 0.01
      }
    }
  }

  if ({!lmerGood} & {Nspectra == 1}) {
    coef.est <- as.numeric(profileXX.i[1,1:numDataCols])
    secoef.est <- rep(NA, numDataCols)
  }
  if ({!lmerGood} & {Nspectra == 2}) {
    coef.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,mean))
    secoef.est <- rep(NA, numDataCols)
  }
  if ({!lmerGood} & {Nspectra >= 3} ) {
    coef.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,mean))
    secoef.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,sd))/sqrt(Nspectra)
    coef.median.est <- as.numeric(apply(profileXX.i[,1:numDataCols],2,median))
  }

  # apply(profile99.i,2,mean)  # test; should be similar to coef.est
  raw.coef.orig <- 2^coef.est - eps
  facAdj <- sum(raw.coef.orig)  # adjustment factor to make things add to 1
  raw.coef <- raw.coef.orig/facAdj
  coef.est <- log2(raw.coef + eps)

  # if GroupBy = "protId", prot.i is the protein number, and i is the same
  # if GroupBy = "pepId", prot.i is the protein number, and i is the peptide number
  result.i <- c(coef.est, secoef.est, Nspectra, Npep, protId.i, pepId.i)
  return(result.i)
}



if (F) {
  # only do this if you don't use multiprocessing
  # outXX="box", numRefCols=4, n.chan=9, GroupBy = "protId",recoveryOnly=F,
  eps=eps
  result <- NULL
  for (i in 1:n.prots) {
    print(i)
    result.i <- meansByProteins(i, protsCombineCnew=protsCombineCnew, numRefCols=numRefCols, numDataCols=numDataCols,
                       GroupBy=GroupBy, eps=eps, outlier.num=outlier.num)
    #result.i <- meansByProteins(i, protsCombineCnew, outXX="box", numRefCols=4,
    #                            n.chan=9, GroupBy="protId", eps=eps, outlier.num=outlier.num)
    result <- rbind(result, result.i)
  }

  temp <- result
  temp.df <- data.frame(temp)
  #n.chan <- (numDataCols(temp.df) - 3)/2
  names.channels <- names(protsCombineCnew)[numRefCols + c(1:n.chan)]
  #if (dataUse == "ProgenesisRep") names.channels <- names(protsCombineCnew)[numRefCols + 1 + c(1:n.chan)]
  names(temp.df)[ 1:n.chan] <- names.channels
  names(temp.df)[(n.chan + 1):( n.chan + n.chan)] <- paste(names.channels, ".se", sep="")
  names(temp.df)[ 2*n.chan + 1:3] <- c("Nspectra", "Npeptides", "ProtID")

  protInd <- !duplicated(protsCombineCnew$prot)
  protName <- protsCombineCnew$prot[protInd]


  protProfileSummary <- data.frame(protName, temp.df)
  protProfileSummary
}


#'================================================================
#' protProfileSummarize calculates mean and SE for each channel in
#' each prot using random effect model or arithmetic mean
#' random effect model can avoid dominance of a sequence with
#' too many spectra
#'
#' @param  protsCombineCnew: a matrix of protein information and normalized specific amounts and
#'                           outlier information
#' @param            numRefCols: number of columns before Mass
#'                           Spectrometry data columns
#' @param              numDataCols: how many columns in MS data
#' @param               eps: epsilon to avoid log(0)
#' @param         GroupBy: "protId" if average by protein; "peptideId" if average by peptide
#' @param       outlierExclude: "none", "spectra", or "spectraAndpeptide" (default) according to exclusion level
#' @param              cpus: number of cpus to use for parallel processing
#' @output                 : mean or weighted mean normalized specific amount profiles
#'
#'================================================================
profileSummarize <- function(protsCombineCnew, numRefCols, numDataCols, refColsKeep=c(1,2),
                           eps, GroupBy="protId", outlierExclude="spectraAndPeptide", logDataOut=F, cpus=4) {

  # outlierExclude:
  #   none: don't exclude any outlers
  #   spectra:  (default) exclude only spectra within peptides
  #   spectraAndPeptide:  exclude spectra-within-peptide outliers and peptide outliers

  # # # # # # # # # # # # # # # # # # # # # # # # # # #
  # # # # # # # # # # # # # # # # # # # # # # # # # # #

  if (GroupBy == "protId") indList <- unique(protsCombineCnew$protId)
  if (GroupBy == "peptideId") indList <- unique(protsCombineCnew$pepId)

  if (!(outlierExclude %in% c("none", "spectra", "spectraAndPeptide"))) {
    cat("outlierExclude must be one of none, spectra, or peptide\n")
    stop()
  }
  if ((GroupBy == "peptideId") & (outlierExclude == "spectraAndPeptide")) {
    cat("if GroupBy == \'peptideId\' then outlierExclude cannot equal \'spectraAndPeptide\' \n")
    stop()
  }

  #indList <- unique(uniqueLabel)
  n.prots <- length(indList)  # either proteins or proteins/peptides

  library(lme4)

  # indList <- 1:16
  library(snowfall)
  sfInit(parallel=T, cpus=cpus)
  sfExport("protsCombineCnew")    # main data set
  #sfExport("meansByProteins")     # function
  sfExport("GroupBy")
  #sfExport("outlier")
  sfExport("numDataCols")
  sfExport("numRefCols")

  sfLibrary(lme4)

  system.time(result <- sfLapply(indList, meansByProteins, protsCombineCnew=protsCombineCnew, numRefCols=numRefCols, numDataCols=numDataCols,
                                 GroupBy=GroupBy, eps=eps, outlierExclude=outlierExclude))
  sfStop()


    temp <- do.call(what="rbind", result)  # convert list of matrices to one matrix
    temp.df <- data.frame(temp)

    names.channels <- names(protsCombineCnew)[numRefCols + c(1:numDataCols)]
    names(temp.df)[ 1:numDataCols] <- names.channels
    names(temp.df)[(numDataCols + 1):( numDataCols + numDataCols)] <- paste(names.channels, ".se", sep="")
    names(temp.df)[ 2*numDataCols + 1:4] <- c("Nspectra", "Npep", "protId", "pepId")

    if (GroupBy == "protId") refColsKeep <- 2  # only keep the "prot" column
    index <- match(temp.df$pepId, protsCombineCnew$pepId)  # indices of second data set
    protsMini <- protsCombineCnew[index, refColsKeep]

    if (F) {
    protInd <- !duplicated(protsCombineCnew$prot)
    pepInd <- !duplicated(protsCombineCnew$peptide)
    protName <- protsCombineCnew$prot[protInd]
    pepName <- protsCombineCnew$peptide[pepInd]
    protIdShort <- protsCombineCnew$protId[protInd]
    pepIdShort <- protsCombineCnew$pepId[pepInd]
    # now set up the look-up table using row names
    names(protIdShort) <- protName   # this is a table, one for each protein
    protNameLong <- names(protIdShort[temp.df$protId])  # protein names, one for each peptide

    names(pepIdShort) <- pepName
    pepNameLong <- names(pepIdShort[temp.df$pepId])
    }


    if (GroupBy == "protId") {
       temp.df <- subset(temp.df, select=-c(protId, pepId))  # drop protId and pepId (only for GroupBy="protd")
       protProfileSummaryRes <- data.frame(protsMini, temp.df) # "protsMini" is protein names
       names(protProfileSummaryRes)[1] <- "prot"
    }
    if (GroupBy == "peptideId") {
        #protProfileSummaryRes <- data.frame(protNameLong, pepNameLong,temp.df)
        #names(protProfileSummaryRes)[1:2] <- c("prot", "peptide")
      protProfileSummaryRes <- data.frame(protsMini,temp.df)
    }
 #   protProfileSummaryRes
 # }

  #protProfileSummaryRes <- wrapup(result, GroupBy)

  # for "Identity" (untransformed) results, drop the standard error terms, which don't apply
  if (GroupBy == "protId") {
    # note that first two columns are protein and peptide names
    protProfileSummaryIdentity <- protProfileSummaryRes[,-((2 + numDataCols):(1 + 2*numDataCols))]
    # now reverse the log2 transformation
    protProfileSummaryIdentity[,2:(1 + numDataCols)] <- 2^protProfileSummaryRes[,c(2:(1 + numDataCols))] - eps
    # now eliminate negative values due to roundoff error
    negVals <- {{protProfileSummaryIdentity[,2:(1 + numDataCols)] < 0} & {protProfileSummaryIdentity[,2:(1 + numDataCols)] > -1E-10}}
    protProfileSummaryIdentity[,2:(1 + numDataCols)][negVals] <- 0
  }
  if (GroupBy == "peptideId") {
      nKeep <- length(refColsKeep)
      # note that first column is protein name. (There is no peptide name here)
      #protProfileSummaryIdentity <- protProfileSummaryRes[,-((2 + numDataCols):(1 + 2*numDataCols))]

      # drop the standard errors of the mean profiles (9 columns)
      protProfileSummaryIdentity <- protProfileSummaryRes[,-((nKeep+1 + numDataCols):(nKeep + 2*numDataCols))]
      # now reverse the log2 transformation
      protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)] <- 2^protProfileSummaryRes[,(nKeep+1):(nKeep + numDataCols)] - eps
      # now eliminate negative values due to roundoff error
      negVals <- {{protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)] < 0} & {protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)] > -1E-10}}
      protProfileSummaryIdentity[,(nKeep+1):(nKeep + numDataCols)][negVals] <- 0
    }

  protProfileSummaryIdentity
}
mooredf22/protsummarize2 documentation built on May 16, 2021, 10:12 p.m.