R/proteinDataPrep.R

Defines functions proteinDataPrep

# The program ensures that spectra are nested within peptides, and these
#  are nested within proteins, and all peptide and protein data are
#  contiguous, that is, not split up into separate locations.
# This program should be run first on a comma-separated data file.
#
# The first column must contain prot name, and the second must be the peptide
# The columns after "numRefCols" contain relative abundance levels
# There should be "numDataCols" abundance level columns


# # # # # #
proteinDataPrep <- function(protClass, numRefCols, numDataCols)  {
  names(protClass)[1:2] <- c("prot", "peptide")
  # remove "(prot NAME)"
  protNamesUpper <- toupper(protClass$prot)
  protClass$prot <- protNamesUpper

  # strip any whitespace before or after protein name
  protClass$prot <- trimws(protClass$prot)

  # ensure that there are no extra columns after the data columns
  protClassOrig <- protClass
  protClass <- protClassOrig[,1:(numRefCols+numDataCols)]

  # # # # # # # # # #
  # get unique peptides by pasting the protein and peptide names
  protSeqProteinModifTemp <- paste(protClass$prot, protClass$peptide, sep="::")
  uniquePeptideList <- unique(protSeqProteinModifTemp)         # list of unique peptides
  uniqueProtList <- unique(protClass$prot)

  # order proteins, assuring that the associated peptides travel with them
  uniquePeptideOrderInd <- order(protSeqProteinModifTemp)
  protClassSort <- protClass[uniquePeptideOrderInd,]

  protPep <- paste(protClassSort$prot, protClassSort$peptide, sep="::")

  protId <- cumsum(!duplicated(protClassSort$prot))  # gives a unique number to each protein
  pepId <- cumsum(!duplicated(protPep))       # unique number to each peptide

  # replace plain peptide with protPep, in column 2, to protect against non-unique peptide names
  protClassSort[,2] <- protPep
  protClassExtend <- data.frame(protClassSort, protId, pepId)
  protClassExtend
}
mooredf22/protsummarize2 documentation built on May 16, 2021, 10:12 p.m.