R/add.protein.R

Defines functions aasum add.protein seq2aa

Documented in aasum add.protein seq2aa

# CHNOSZ/add.protein.R
# Calculate properties of proteins 20061109 jmd
# Reorganize protein functions 20120513

# Count numbers of amino acids in a sequence
seq2aa <- function(sequence, protein = NA) {
  # Remove newlines and whitespace
  sequence <- gsub("\\s", "", gsub("[\r\n]", "", sequence))
  # Make a data frame from counting the amino acids in the sequence
  caa <- count.aa(sequence)
  colnames(caa) <- aminoacids(3)
  # Now make the data frame
  po <- strsplit(as.character(protein), "_")[[1]]
  aa <- data.frame(protein = po[1], organism = po[2], ref = NA, abbrv = NA, stringsAsFactors = FALSE)
  # chains = 1 for any sequence, chains = 0 for no sequence
  chains <- sum(nchar(sequence) > 0)
  aa <- cbind(aa, chains = chains, caa)
  return(aa)
}

# Add amino acid counts to thermo()$protein (returns iprotein)
add.protein <- function(aa, as.residue = FALSE) {
  # Add a properly constructed data frame of 
  # amino acid counts to thermo()$protein
  thermo <- get("thermo", CHNOSZ)
  if(!identical(colnames(aa), colnames(thermo$protein)))
    stop("'aa' does not have the same columns as thermo()$protein")
  # Check that new protein IDs are unique 20220418
  po <- paste(aa$protein, aa$organism, sep = "_")
  idup <- duplicated(po)
  if(any(idup)) stop(paste("some protein IDs are duplicated:", paste(unique(po[idup]), collapse = " ")))
  # Normalize by protein length if as.residue = TRUE 20220416
  if(as.residue) {
    pl <- protein.length(aa)
    aa[, 5:25] <- aa[, 5:25] / pl
  }
  # Find any protein IDs that are already present
  ip <- pinfo(po)
  ip.present <- !is.na(ip)
  # Now we're ready to go
  tp.new <- thermo$protein
  if(!all(ip.present)) tp.new <- rbind(tp.new, aa[!ip.present, ])
  if(any(ip.present)) tp.new[ip[ip.present], ] <- aa[ip.present, ]
  rownames(tp.new) <- NULL
  thermo$protein <- tp.new
  assign("thermo", thermo, CHNOSZ)
  # Return the new rownumbers
  ip <- pinfo(po)
  # Make some noise
  if(!all(ip.present)) message("add.protein: added ", nrow(aa)-sum(ip.present), " new protein(s) to thermo()$protein")
  if(any(ip.present)) message("add.protein: replaced ", sum(ip.present), " existing protein(s) in thermo()$protein")
  return(ip)
}

# Combine amino acid counts (sum, average, or weighted sum by abundance)
aasum <- function(aa, abundance = 1, average = FALSE, protein = NULL, organism = NULL) {
  # Returns the sum of the amino acid counts in aa,
  #   multiplied by the abundances of the proteins
  abundance <- rep(abundance, length.out=nrow(aa))
  # Drop any NA rows or abundances
  ina.aa <- is.na(aa$chains)
  ina.ab <- is.na(abundance)
  ina <- ina.aa | ina.ab
  if(any(ina)) {
    aa <- aa[!ina, ]
    abundance <- abundance[!ina]
    message("aasum: dropped ", sum(ina), " proteins with NA composition and/or abundance")
  }
  # Multiply
  aa[, 6:25] <- aa[, 6:25] * abundance
  # Sum
  out <- aa[1, ]
  out[, 5:25] <- colSums(aa[, 5:25])
  # Average if told to do so
  if(average) {
    # Polypeptide chains by number of proteins, residues by frequency
    out[, 5] <- out[, 5]/nrow(aa)
    out[, 6:25] <- out[, 6:25]/sum(abundance)
  }
  # Add protein and organism names if given
  if(!is.null(protein)) out$protein <- protein
  if(!is.null(organism)) out$organism <- organism
  return(out)
}

Try the CHNOSZ package in your browser

Any scripts or data that you put into this service are public.

CHNOSZ documentation built on March 31, 2023, 7:54 p.m.