protlocassign0p1p1: Compute constrained proportional assignments for fractionated protein abundance

Documented in assignCPAloc fCPAone fitCPA locationProfileSetup protCap protIndex vectorizeAll vecUnitize

#' Set up reference profiles for constrained proportional assignment
#'
#' @param profile data frame of protein identifiers (protName) and their relative abundance in centrifugation fractions.
#' @param markerList List of reference proteins and their subcellular locations
#' @param numDataCols Number of channels of abundance levels
#' @return A matrix refLocationProfiles giving the abundance level profiles of the subcellular locations
#' @import knitr
#' @export
#' @examples
#' refLocationProfilesNSA <- locationProfileSetup(profile=protNSA_AT5tmtMS2,
#'   markerList=markerListJadot, numDataCols=9)


locationProfileSetup <- function(profile, markerList, numDataCols) {
  # Find the mean profiles of the subcellular locations

  # "profile" must be a data frame with proteins as row names and these columns:
  #  columns 1 - numDataCols:  normalized specific amounts or specific amounts
  #  Last two columns:   Nspectra (number of spectra) and Nseq (number of distince peptide sequences)

  # "markerList" must be a list of (1) the reference proteins and (2) the corresponding subcelluar locations

  if (!is.data.frame(profile)) print("profile must be a data frame")
  if (!is.data.frame(markerList)) print("markerList must be a data frame")
  if (is.null(names(markerList)[1])) print("first markerList variable must be 'protName'")
  if (is.null(names(markerList)[2])) print("second markerList variable must be 'referenceCompartment'")
  if ({names(markerList)[1] != "protName"} ) {
    print("first markerList variable must be 'protName'")
  }
  if ({names(markerList)[2] != "referenceCompartment"} ) {
    print("second markerList variable must be 'referenceCompartment'")
  }
  # make names all upper case
  protNamesUpper <- toupper(rownames(profile))

  rownames(profile) <- protNamesUpper
  profileWithProts <- data.frame(protNamesUpper, profile)
  names(profileWithProts)[1] <- "protName"  # this has a column of protnames for merge

  names(markerList)[1] <- "protName"
  markerList$protName <- toupper(markerList$protName)  # all must by upper case
  if (nrow(markerList) >= 1) {
    meanReferenceProts <- merge(x=markerList, y=profileWithProts,
         by.x="protName", by.y="protName", all.x=FALSE, sort=FALSE)
  }
  if (nrow(meanReferenceProts) == 0) {
    print("Error from locationProfileSetup; no protein found")
  }

  # Find mean profiles for each sub-cellular location, using reference proteins

  location.list <- as.character(unique(meanReferenceProts$referenceCompartment))
  n.loc <- length(location.list)
  meanProfile <- NULL
  #for (i in 1:n.loc) {
  for (i in seq_len(n.loc)) {
    # i=1
    loc.i <- location.list[i]

    #profile.i <- meanReferenceProts[meanReferenceProts$referenceCompartment == loc.i,2+1:numDataCols]
    profile.i <- meanReferenceProts[meanReferenceProts$referenceCompartment == loc.i,2+seq_len(numDataCols)]
    meanProfile.i <- apply(profile.i, 2, mean, na.rm=TRUE)
    meanProfile <- rbind(meanProfile, meanProfile.i)
    }
  row.names(meanProfile) <- location.list

  # Change name to "markerLoc" and its transpose "refLocationProfiles" for compatibility with past
  # Then plot the profiles

  markerLoc <- t(meanProfile)
  refLocationProfiles <- as.data.frame(t(markerLoc))
  refLocationProfiles
  }



# internal function; goodness-of-fit measure
# @param pvec vector of assignment proportions to compartments; to be estimated
# @param y observed data for amounts of protein in fractions
# @param gmat reference protein amounts
# @param methodQ either "sumsquares" (default) or "sumabsvalue"
#' @export
# @return value of goodness-of-fit function
#
Qfun4 <- function(pvec, y, gmat, methodQ="sumsquares") {
#   Assign sub-cellular location probabilities to
#   each protein. We use the "spg" function in package "BB".

   resultA <- y - pvec%*%t(gmat)
   if (methodQ=="sumsquares") result <- sum(resultA^2)
   if (methodQ=="sumabsvalue") result <- sum(abs(resultA))
   result
}
# internal function; goodness-of-fit measure for subset of varying parameters
# @param pvec.vary vector of assignment proportions to compartments; to be estimated
# @param yy observed data for amounts of protein in fractions
# @param gmat reference protein amounts
# @param methodQ either "sumsquares" (default) or "sumabsvalue"
# @param ind.vary if not NULL, indexes of varying parameters
# @param ind.fixed indexes of fixed parameters; all parameters must be one or the other
# @param par.fixed values of fixed parameters; typically 0
#' @export
# @return value of goodness-of-fit function
Qfun4subset <- function(pvec.vary, yy, gmat, methodQ="sumsquares",
                        ind.vary, ind.fixed, par.fixed) {
  #   Assign sub-cellular location probabilities to
  #   each protein. We use the "spg" function in package "BB".

  # reassemble
  vals.order <- c(ind.vary, ind.fixed)


  pvec.unorder <- c(pvec.vary, par.fixed)
  n.vec <- length(pvec.unorder)
  pvec.orig <- rep(NA, n.vec)
  #for (i in 1:n.vec) {
  for (i in seq_len(n.vec)) {
    pvec.orig[vals.order[i]] <- pvec.unorder[i]
  }

  y <- yy
  pvec <- pvec.orig
  resultA <- y - pvec%*%t(gmat)

  if (methodQ=="sumsquares") result <- sum(resultA^2)
  if (methodQ=="sumabsvalue") result <- sum(abs(resultA))
  result
}

# internal function; project an n-dim vector y to the simplex S_n with sum constrained to 1
# @param y n-dim vector
# @return simplex of dimension one lower
# @export
projSimplex <- function(y) {
  #####
  # project an n-dim vector y to the simplex S_n
  # S_n = { x | x \in R^n, 0 <= x <= 1, sum(x) = 1}
  # Derived from:
  # Copyright:  Ravi Varadhan, Johns Hopkins University
  # August 8, 2012

  #####

  n <- length(y)
  sy <- sort(y, decreasing=TRUE)
  csy <- cumsum(sy)
  #rho <- max(which(sy > (csy - 1)/(1:n)))
  rho <- max(which(sy > (csy - 1)/seq_len(n)))
  theta <- (csy[rho] - 1) / rho
  return(pmax(0, y - theta))
  }


#' return index of a protein name, or (if exactMatch=TRUE) indices of proteins starting with the string given in "protName"
#' @param protName  name of protein to search for
#' @param profile data frame of protein identifiers (protName) and their relative abundance in centrifugation fractions
#' @param exactMatch  default is F
#' @export
#' @return the protein name and its index (row in profile)
#' @examples
#' data(protNSA_AT5tmtMS2)
#' protIndex("TLN1", profile=protNSA_AT5tmtMS2)

protIndex <- function(protName, profile, exactMatch=FALSE) {
  # return index of a protein name, or (if exactMatch=TRUE) indices of proteins starting with
  #   the string given in "protName"
  n.prot <- nrow(profile)

  prot.list <- toupper(as.character(rownames(profile)))  # must be in column 1
  #if (exactMatch) inx <- grep(paste("^", protName, "$", sep=""), prot.list, ignore.case=TRUE)
  #if (exactMatch) inx <- (1:n.prot)[protName == prot.list]
  if (exactMatch) inx <- (seq_len(n.prot))[protName == prot.list]
  if (!exactMatch) inx <- grep(paste("^",protName, sep=""), prot.list, ignore.case=TRUE)
  if (length(inx) == 0) inx <- NA

  if (!is.na(inx[1])) {
    result <- data.frame(inx, prot.list[inx])
    names(result) <- c("Prot index number", "Prot name")
    }
  if (is.na(inx[1])) {
    result <- NA
    cat("protein not found\n")
  }
  result
  }

#' Carry out constrained proportional assignment for protein i; service function for fitCPA
#' @param profile data frame of one protein name (row name) and relative abundance levels for that protein
#' @param refLocationProfiles A matrix giving the abundance level profiles of the subcellular locations
#' @param numDataCols Number of channels of abundance levels
#' @param startProps starting valuese for proportional assignements; set equal if this is null (default)
#' @param maxit maximum number of iterations (default is 10000)
#' @param ind.vary if not NULL, indexes of proportions allowed to vary
#' @param minVal default is false. If true, return minimum value of goodness of fit
#' @examples
#' data(protNSA_test)
#' data(markerListJadot)
#' nTestProts <- nrow(protNSA_test)
#' refLocationProfilesNSA <- locationProfileSetup(profile=protNSA_test,
#'     markerList=markerListJadot, numDataCols=9)
#' protCPAfromNSA_test <- fCPAone(profile=protNSA_markTLN1[1,],
#'                               refLocationProfiles=refLocationProfilesNSA,
#'                               numDataCols=9, startProps=NULL,
#'                               maxit=10000,
#'                               ind.vary=NULL, minVal=FALSE)
#' @export
#' @return assignProbsOut  Data frame of proportionate assignments of
#'    each protein to compartments. also nspectra and npeptides if in profile input
#'    If ind.vary specified, only the referenced proportion cpa estimates are returned

#protLocAssign <- function(i, profile, refLocationProfiles, numDataCols,
fCPAone <- function(profile, refLocationProfiles, numDataCols,
                        startProps=NULL,
                        maxit=10000, showProgress=TRUE,
                        ind.vary=NULL, minVal=FALSE){

  # maxit and assignPRobsStart must be specified
  # assignProbsStart must be NULL or have a column "protName" and assignment probabilities to use as starting values
  # use the spg function (in package BB) to assign proportionate assignments to compartments


  SpectraSeqInd <- TRUE    # extra columns for numbers of spectra and sequences
  if (numDataCols == ncol(profile)) SpectraSeqInd <- FALSE  # no extra columns


  n.compartments <- nrow(refLocationProfiles)
  #allPeptideProfilesMat <- profile[1,1:numDataCols]
  allPeptideProfilesMat <- profile[1,seq_len(numDataCols)]     # just the data, and only row 1
       # (there should only be one row)

  yyT <- allPeptideProfilesMat



  yy <- as.numeric(yyT)       # leave them alone


  if (SpectraSeqInd) {  #if these variables are present
    Nspectra.i <- profile$Nspectra[1]   # this is the number of spectra for a protein
    Npep.i <- profile$Npep[1] # number of unique sequences
  }
  if (!SpectraSeqInd) {
    Nspectra.i <- NULL
    Nseq.i <- NULL
  }
  channelsMeanProb.i <- matrix(NA, nrow=1, ncol=length(yy))
  parEstTemp <- channelsMeanProb.i
 #if (!anyNA(yy)) {
  # start with uniform probabilities:
  startProps <- rep(1/n.compartments, n.compartments)
  if (!is.null(startProps)) {
    if (length(startProps) != n.compartments) {
      cat("invalid startProps\n")
    }
    if (length(startProps) == n.compartments) startVals <- startProps/sum(startProps)
  }
    # start with uniform probabilities
 # startVals <- as.numeric(assignProbsStart[i, 2:(n.compartments + 1)]) # start with

   # first attempt at minimization: use uniform starting values
   if (is.null(ind.vary)) {  # ordinary procedure; find optimum over all parameters
     temp <- try(BB::spg(startVals, fn=Qfun4, project=projSimplex, y=yy,
                      gmat=t(refLocationProfiles),
                      methodQ="sumsquares", quiet=TRUE, method=3,
                      control=list(maxit=maxit, trace=FALSE, ftol = 1.e-12,
                                   gtol = 1.e-07, eps=1.e-09)))
   }
   if (!is.null(ind.vary)) { # only find optimum over varying parameters
     ind.vary <- sort(ind.vary)  # parameters that may vary
     # here are the parameters that are fixed (complement of ind.vary)
     #ind.fixed <- (1:length(startVals))[!((1:length(startVals)) %in% ind.vary)]
     ind.fixed <- (seq_len(length(startVals)))[!((seq_len(length(startVals))) %in% ind.vary)]
     #yy.vary <- yy[ind.vary]
     #yy.fixed <- yy[ind.fixed]
     #startVals.vary <- startVals[ind.vary]  # params that vary
     startVals.vary <- rep(1/length(ind.vary), length(ind.vary))

     if (length(ind.fixed) > 0) startVals.fixed <- rep(0,length(ind.fixed))
     if (length(ind.fixed) == 0) startVals.fixed <- NULL
     temp <- try(BB::spg(par=startVals.vary, fn=Qfun4subset, project=projSimplex,
                         yy=yy,
                       gmat=t(refLocationProfiles),
                       methodQ="sumsquares", ind.vary=ind.vary,
                       ind.fixed=ind.fixed, par.fixed=startVals.fixed,
                       quiet=TRUE, alertConvergence=FALSE, method=3,
                                  control=list(maxit=maxit, trace=FALSE, ftol = 1.e-12,
                                               gtol = 1.e-07, eps=1.e-09)))
   }
   if (!is.atomic(temp)) convergeInd <- as.numeric((temp$message == "Successful convergence"))

   # If starting values are given (assignProbsStart is not null) and if the first attempt failed, try the starting values
   #if ({convergeInd != 1} & {!is.null(assignProbsStart)}) {
   #temp <- try(BB::spg(startVals, fn=Qfun4, project=projSimplex, y=yy, gmat=t(refLocationProfiles),
   #                    methodQ="sumsquares", quiet=T,
   #                     control=list(maxit=maxit, trace=FALSE)))

  # }

  convergeInd <- as.numeric(!inherits(temp, "try-error"))

  if (is.null(ind.vary))  channelsMeanProb.i <- rep(NA, n.compartments)
  if (!is.null(ind.vary)) channelsMeanProb.i <- rep(NA, length(ind.vary))

  convCrit.i <- NA
  feval.i <- NA
  value <- NA
  if (!is.atomic(temp)) {
     channelsMeanProb.i <- temp$par
     convCrit.i <- temp$gradient
     feval.i <- temp$feval
     value <- temp$value
     convergeInd <- as.numeric((temp$message == "Successful convergence"))
     }
  #nNoConverge.i <- 0
  if (convergeInd != 1) cat(paste("cpa does not converge for a protein", "\n",
         "returning missing values for cpa estimates for that protein", "\n"))


  if (!is.null(ind.vary)) {
    vals.order <- c(ind.vary, ind.fixed)
    pvec.vary <- channelsMeanProb.i
    par.fixed <- rep(0, n.compartments - length(ind.vary))
    pvec.unorder <- c(pvec.vary, par.fixed)
    n.vec <- length(pvec.unorder)
    pvec.orig <- rep(NA, n.vec)
    #for (i in 1:n.vec) {
    for (i in seq_len(n.vec)) {
      pvec.orig[vals.order[i]] <- pvec.unorder[i]
    }
    parEstTemp <- pvec.orig

  }
    if (is.null(ind.vary)) parEstTemp <- channelsMeanProb.i
# }


    if (SpectraSeqInd) parEst <- c(parEstTemp, Nspectra.i, Npep.i)
    if (!SpectraSeqInd) parEst <- parEstTemp
    if (minVal) parEst <- c(parEst, value)


   parEst
   }




#' Carry out constrained proportional assignment on all proteins
#'
#' @param profile a data frame of protein names (row names) and relative abundance levels.
#' @param refLocationProfiles A matrix giving the abundance level profiles of the subcellular locations
#' @param numDataCols Number of channels of abundance levels
#' @param startProps starting valuese for proportional assignements; set equal if this is null (default)
#' @param maxit maximum number of iterations (default is 10000)
#' @param showProgress print out progress if TRUE, the default
#' @param ind.vary if not NULL, indexes of parameters to allow to vary
#' @param minVal default is false. If true, return minimum value of goodness of fit
#' @export
#' @return assignProbsOut  Data frame of proportionate assignments of each protein to compartments
#' @examples
#' data(protNSA_test)
#' data(markerListJadot)
#' refLocationProfilesNSA <- locationProfileSetup(profile=protNSA_test, markerList=markerListJadot, numDataCols=9)
#' protCPAfromNSA_test <- fitCPA(profile=protNSA_markTLN1,
#'                               refLocationProfiles=refLocationProfilesNSA,
#'                               numDataCols=9)

fitCPA <- function(profile, refLocationProfiles, numDataCols,
                    startProps=NULL, maxit=10000, showProgress=TRUE,
                    ind.vary=NULL, minVal=FALSE) {


  # # # # # # # # # # # #
  n.prot <- nrow(profile)
  #indList <- 1:n.prot
  indList <- seq_len(n.prot)
  SpectraSeqInd <- TRUE    # extra columns for numbers of spectra and sequences
  if (numDataCols == ncol(profile)) SpectraSeqInd <- FALSE  # no extra columns

  #result <- sapply(indList, fCPAone,
  #                 profile=profile, refLocationProfiles=refLocationProfiles,
  #                 numDataCols=numDataCols, startProps=startProps, showProgress=FALSE,
  #                 simplify=T, maxit=maxit, ind.vary, minVal)

  assignProbs <- NULL
  #for (i in 1:n.prot) {
  for (i in seq_len(n.prot)) {
    assignProbsI <- fCPAone(profile=profile[i,], refLocationProfiles=refLocationProfiles,
                            numDataCols=numDataCols, startProps=startProps,
                            maxit=maxit, ind.vary=ind.vary, minVal=minVal)
    assignProbs <- rbind(assignProbs, assignProbsI)
    if (showProgress) {
      if (i == 500) print(paste(i, "profiles fit"))
      if ((i %% 1000) == 0) print(paste(i, "profiles fit"))
    }
  }
  #browser()
  assignProbs <- data.frame(assignProbs)
  #if (is.null(ind.vary)) ind.vary <- 1:nrow(refLocationProfiles)
  if (is.null(ind.vary)) ind.vary <- seq_len(nrow(refLocationProfiles))
  # (so the next check works either with or without ind.vary specified)
  #checkCols <- {ncol(assignProbs) >= nrow(refLocationProfiles[ind.vary,]) + 2}

  #if (checkCols) {
  if (SpectraSeqInd) {
    #if (is.null(ind.vary)) ind.vary <- 1:nrow(refLocationProfiles) # for when ind.vary not specified
    #names(assignProbs) <- c(row.names(refLocationProfiles[ind.vary,]), "Nspectra", "Npeptides")
    names(assignProbs)[seq_len(nrow(refLocationProfiles) + 2)] <- c(row.names(refLocationProfiles), "Nspectra", "Npeptides")
    protNames <- rownames(profile)  # make sure it is character
    assignProbsOut <- data.frame(assignProbs)
    rownames(assignProbsOut) <- protNames[indList]
    # name minVal column if present
    if (minVal) names(assignProbsOut)[ncol(assignProbs)] <- "value" # name of last column
  }
  if (!SpectraSeqInd) {
    #names(assignProbs)[1:(nrow(refLocationProfiles))] <- row.names(refLocationProfiles)
    names(assignProbs)[seq_len(nrow(refLocationProfiles))] <- row.names(refLocationProfiles)
    protNames <- rownames(profile)
    assignProbsOut <- data.frame(assignProbs)
    rownames(assignProbsOut) <- protNames[indList]
    # name minVal column if present
    if (minVal) names(assignProbsOut)[ncol(assignProbs)] <- "value"
  }
  assignProbsOut
}


#' convert single protein name to capitalize first character only
#' @param x a protein name
#' @return single protein name to capitalize first character only
#' @export
#' @examples
#' protCap("tpp1")
protCap <- function(x) {
    # convert single protein name to capitalize first character only
    # Typically, human proteins are all in capitals and
    #   rat are capitalized (first character only) with the rest in lower case.
    firstLetter <- toupper(substring(x, 1, 1))
    remainingLetters <- tolower(substring(x,2))
    paste(firstLetter, remainingLetters, sep='')
}

#' assign proteins to a subcellular location using CPA estimates
#'
#' @param assignLocProps matrix of proportion estimates for each protein
#' @param cutoff cutoff for assigning a protein to a location
#' @param Locations list of subcellular locations
#' @return protLoc assigned location of protein
#' @examples
#' data(protNSA_test)
#' data(markerListJadot)
#' refLocationProfilesNSA <- locationProfileSetup(profile=protNSA_test, markerList=markerListJadot, numDataCols=9)
#' protCPAfromNSA_test <- fitCPA(profile=protNSA_markTLN1,
#'                               refLocationProfiles=refLocationProfilesNSA,
#'                               numDataCols=9)
#' Locations <- unique(markerListJadot$referenceCompartment)
#' table(apply(protCPAfromNSA_test[,1:8],1,assignCPAloc, cutoff=0.8, Locations=Locations))
#' @importFrom stats complete.cases
#' @export

assignCPAloc <- function(assignLocProps, cutoff=0.8, Locations) {
  # assign location to the one with a proportion > cutoff
  # assignLocProps <- assignPropsAll[1,]
  if (cutoff <= 0.5) cutoff <- 0.5001
  exceedCutoff <- {assignLocProps > cutoff}
  if (sum(exceedCutoff) > 0) {
    indLoc <- which(exceedCutoff)
    protLoc <- Locations[indLoc]
  }
  if (sum(exceedCutoff) == 0) protLoc <- "unclassified"
  protLoc
}

# unitize functions
# # # #
#' normalize vector xx to have unit length
#' @param xx vector
#' @return nomalized vector of unit length
#' @examples
#' xx <- c(0.5, 0.1, 0.6, 0.9)
#' vecUnitize(xx)
#' @export
vecUnitize <- function(xx) {
  if (anyNA(xx)) xx.norm <- rep(NA, length(xx))
  else {
    xxsq <- xx^2
    xx.norm <- sqrt(sum(xxsq))
    xx.norm <- xx/xx.norm
  }
  xx.norm
}

#' normalize all rows of a matrix to have unit length
#' @param protMatOrig matrix of profiles
#' @return matrix with all rows having unit length
#' @examples
#' data(protNSA_test)
#' vectorizeAll(protNSA_test[,1:9])
#' @export
vectorizeAll <- function(protMatOrig) {
  protUnitMat <- NULL
  #for (i in 1:nrow(protMatOrig)) {
  for (i in seq_len(nrow(protMatOrig))) {
    # i=1
    temp <- vecUnitize(protMatOrig[i,])
    protUnitMat <- rbind(protUnitMat, temp)
  }
  row.names(protUnitMat) <- row.names(protMatOrig)
  protUnitMat
}
mooredf22/protlocassign0p1p1 documentation built on Feb. 7, 2022, 1:55 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
mooredf22/protlocassign0p1p1
Compute constrained proportional assignments for fractionated protein abundance

R/cpaProgs.R
In mooredf22/protlocassign0p1p1: Compute constrained proportional assignments for fractionated protein abundance

Defines functions vectorizeAll vecUnitize assignCPAloc protCap fitCPA fCPAone protIndex projSimplex Qfun4subset Qfun4 locationProfileSetup

Documented in assignCPAloc fCPAone fitCPA locationProfileSetup protCap protIndex vectorizeAll vecUnitize

R Package Documentation

Browse R Packages

We want your feedback!

mooredf22/protlocassign0p1p1 Compute constrained proportional assignments for fractionated protein abundance

R/cpaProgs.R In mooredf22/protlocassign0p1p1: Compute constrained proportional assignments for fractionated protein abundance

Defines functions vectorizeAll vecUnitize assignCPAloc protCap fitCPA fCPAone protIndex projSimplex Qfun4subset Qfun4 locationProfileSetup

Documented in assignCPAloc fCPAone fitCPA locationProfileSetup protCap protIndex vectorizeAll vecUnitize

R Package Documentation

Browse R Packages

We want your feedback!

mooredf22/protlocassign0p1p1
Compute constrained proportional assignments for fractionated protein abundance

R/cpaProgs.R
In mooredf22/protlocassign0p1p1: Compute constrained proportional assignments for fractionated protein abundance