R/AminoAcids.R
In alakazam: Immunoglobulin Clonal Lineage and Diversity Analysis

Documented in aliphatic aminoAcidProperties bulk charge countPatterns gravy isValidAASeq polar

# Amino acid sequence properties

#### Chemical property functions ####

#' Calculates the average bulkiness of amino acid sequences
#'
#' \code{bulk} calculates the average bulkiness score of amino acid sequences. 
#' Non-informative positions are excluded, where non-informative is defined as any 
#' character in \code{c("X", "-", ".", "*")}.
#' 
#' @param    seq         vector of strings containing amino acid sequences.
#' @param    bulkiness   named numerical vector defining bulkiness scores for 
#'                       each amino acid, where names are single-letter amino acid 
#'                       character codes. If \code{NULL}, then the Zimmerman et al, 1968
#'                       scale is used.
#' 
#' @return   A vector of bulkiness scores for the sequence(s).
#' 
#' @references
#' \enumerate{
#'   \item  Zimmerman JM, Eliezer N, Simha R. The characterization of amino acid sequences 
#'            in proteins by statistical methods. J Theor Biol 21, 170-201 (1968).
#' }
#' @seealso 
#' For additional size related indices see \link[seqinr]{aaindex}.
#'
#' @examples
#' # Default bulkiness scale
#' seq <- c("CARDRSTPWRRGIASTTVRTSW", "XXTQMYVRT")
#' bulk(seq)
#'
#' # Use the Grantham, 1974 side chain volumn scores from the seqinr package
#' library(seqinr)
#' data(aaindex)
#' x <- aaindex[["GRAR740103"]]$I
#' # Rename the score vector to use single-letter codes
#' names(x) <- translateStrings(names(x), ABBREV_AA)
#' # Calculate average volume
#' bulk(seq, bulkiness=x)
#' 
#' @export
bulk <- function(seq, bulkiness=NULL) {
    # Get bulkiness scores
    if (is.null(bulkiness)) {
        bulkiness <- BULKINESS_ZIMJ68
    }
    # Remove non-informative positions
    seq <- gsub("[X\\.\\*-]", "", as.character(seq))
    # Create character vector from string
    aa <- strsplit(seq, "")
    # Calculate average bulkiness
    aa_bulk <- sapply(aa, function(x) sum(bulkiness[x]) / length(x))
    
    return(aa_bulk)
}


#' Calculates the average polarity of amino acid sequences
#'
#' \code{polar} calculates the average polarity score of amino acid sequences. 
#' Non-informative positions are excluded, where non-informative is defined as any 
#' character in \code{c("X", "-", ".", "*")}.
#' 
#' @param    seq         vector of strings containing amino acid sequences.
#' @param    polarity    named numerical vector defining polarity scores for 
#'                       each amino acid, where names are single-letter amino acid 
#'                       character codes. If \code{NULL}, then the Grantham, 1974
#'                       scale is used.
#' 
#' @return   A vector of bulkiness scores for the sequence(s).
#' 
#' @references
#' \enumerate{
#'   \item  Grantham R. Amino acid difference formula to help explain protein evolution. 
#'            Science 185, 862-864 (1974).
#' }
#' @seealso 
#' For additional size related indices see \code{\link[seqinr]{aaindex}}.
#'
#' @examples
#' # Default scale
#' seq <- c("CARDRSTPWRRGIASTTVRTSW", "XXTQMYVRT")
#' polar(seq)
#'
#' # Use the Zimmerman et al, 1968 polarity scale from the seqinr package
#' library(seqinr)
#' data(aaindex)
#' x <- aaindex[["ZIMJ680103"]]$I
#' # Rename the score vector to use single-letter codes
#' names(x) <- translateStrings(names(x), ABBREV_AA)
#' # Calculate polarity
#' polar(seq, polarity=x)
#' 
#' @export
polar <- function(seq, polarity=NULL) {
    # Get bulkiness scores
    if (is.null(polarity)) {
        polarity <- POLARITY_GRAR74
    }
    # Remove non-informative positions
    seq <- gsub("[X\\.\\*-]", "", as.character(seq))
    # Create character vector from string
    aa <- strsplit(seq, "")
    # Calculate average polarity
    aa_polar <- sapply(aa, function(x) sum(polarity[x]) / length(x))
    
    return(aa_polar)
}


#' Calculates the hydrophobicity of amino acid sequences
#'
#' \code{gravy} calculates the Grand Average of Hydrophobicity (gravy) index 
#' of amino acid sequences using the method of Kyte & Doolittle. Non-informative
#' positions are excluded, where non-informative is defined as any character in 
#' \code{c("X", "-", ".", "*")}.
#' 
#' @param    seq         vector of strings containing amino acid sequences.
#' @param    hydropathy  named numerical vector defining hydropathy index values for 
#'                       each amino acid, where names are single-letter amino acid 
#'                       character codes. If \code{NULL}, then the Kyte & Doolittle
#'                       scale is used.
#' 
#' @return   A vector of gravy scores for the sequence(s).
#' 
#' @references
#' \enumerate{
#'   \item  Kyte J, Doolittle RF. A simple method for displaying the hydropathic character 
#'            of a protein. J Mol Biol. 157, 105-32 (1982).
#' }
#' @seealso 
#' For additional hydrophobicity indices see \code{\link[seqinr]{aaindex}}.
#'
#' @examples
#' # Default scale
#' seq <- c("CARDRSTPWRRGIASTTVRTSW", "XXTQMYVRT")
#' gravy(seq)
#'
#' # Use the Kidera et al, 1985 scores from the seqinr package
#' library(seqinr)
#' data(aaindex)
#' x <- aaindex[["KIDA850101"]]$I
#' # Rename the score vector to use single-letter codes
#' names(x) <- translateStrings(names(x), ABBREV_AA)
#' # Calculate hydrophobicity
#' gravy(seq, hydropathy=x)
#' 
#' @export
gravy <- function(seq, hydropathy=NULL) {
    # Get hydrophobicity scores
    if (is.null(hydropathy)) {
        hydropathy <- HYDROPATHY_KYTJ82
    }

    # Remove non-informative positions
    seq <- gsub("[X\\.\\*-]", "", as.character(seq))
    # Create character vector from string
    aa <- strsplit(seq, "")
    # Calculate gravy
    aa_gravy <- sapply(aa, function(x) sum(hydropathy[x]) / length(x))
    
    return(aa_gravy)
}


#' Calculates the aliphatic index of amino acid sequences
#' 
#' \code{aliphatic} calculates the aliphatic index of amino acid sequences using 
#' the method of Ikai. Non-informative positions are excluded, where non-informative 
#' is defined as any character in \code{c("X", "-", ".", "*")}.
#'
#' @param    seq        vector of strings containing amino acid sequences.
#' @param    normalize  if \code{TRUE} then divide the aliphatic index of each amino acid 
#'                      sequence by the number of informative positions. Non-informative 
#'                      position are defined by the presence any character in 
#'                      \code{c("X", "-", ".", "*")}. If \code{FALSE} then return the raw
#'                      aliphatic index.
#'                      
#' @return   A vector of the aliphatic indices for the sequence(s).
#'
#' @references 
#' \enumerate{
#'   \item  Ikai AJ. Thermostability and aliphatic index of globular proteins. 
#'            J Biochem. 88, 1895-1898 (1980).
#' }
#' 
#' @examples 
#' seq <- c("CARDRSTPWRRGIASTTVRTSW", NA, "XXTQMYVRT")
#' aliphatic(seq)
#'
#' @export
aliphatic <- function(seq, normalize=TRUE) {
    # Calculate aliphatic index for valid amino acids
    ala <- countOccurrences(seq, "[A]")
    val <- countOccurrences(seq, "[V]")
    leu_ile <- countOccurrences(seq, "[LI]")
    
    aa_aliphatic = ala + 2.9 * val + 3.9 * leu_ile
    
    if (normalize) {
        aa_aliphatic <- aa_aliphatic / stri_length(gsub("[X\\.\\*-]", "", seq))
    }
    
    return(aa_aliphatic)
}


#' Calculates the net charge of amino acid sequences.
#'
#' \code{charge} calculates the net charge of amino acid sequences using 
#' the method of Moore, 1985, with exclusion of the C-terminus and N-terminus charges.
#' 
#' @param    seq        vector strings defining of amino acid sequences.
#' @param    pH         environmental pH.
#' @param    pK         named vector defining pK values for each charged amino acid,
#'                      where names are the single-letter amino acid character codes
#'                      \code{c("R", "H", "K", "D", "E", "C", "Y")}). If \code{NULL}, 
#'                      then the EMBOSS scale is used.
#' @param    normalize  if \code{TRUE} then divide the net charge of each amino acid 
#'                      sequence by the number of informative positions. Non-informative 
#'                      position are defined by the presence any character in 
#'                      \code{c("X", "-", ".", "*")}. If \code{FALSE} then return the raw
#'                      net charge.
#' 
#' @return   A vector of net charges for the sequence(s).
#'
#' @references
#' \enumerate{
#'   \item  Moore DS. Amino acid and peptide net charges: A simple calculational procedure. 
#'            Biochem Educ. 13, 10-11 (1985).
#'   \item  \url{https://emboss.sourceforge.net/apps/cvs/emboss/apps/iep.html}
#'   }
#'   
#' @seealso 
#' For additional pK scales see \code{\link[seqinr]{pK}}.
#' 
#' @examples 
#' seq <- c("CARDRSTPWRRGIASTTVRTSW", "XXTQMYVRT") 
#' # Unnormalized charge
#' charge(seq)
#' # Normalized charge
#' charge(seq, normalize=TRUE)
#' 
#' # Use the Murray et al, 2006 scores from the seqinr package
#' library(seqinr)
#' data(pK)
#' x <- setNames(pK[["Murray"]], rownames(pK))
#' # Calculate charge
#' charge(seq, pK=x)
#'
#' @export
charge <- function(seq, pH=7.4, pK=NULL, normalize=FALSE) {
    
    # Get charge data
    if(is.null(pK)) {
        pK <- PK_EMBOSS
    }
    
    # Calculate charge
    arg <- countOccurrences(seq, "R") * (1/(1 + 10^(1 * (pH - pK["R"]))))
    his <- countOccurrences(seq, "H") * (1/(1 + 10^(1 * (pH - pK["H"]))))
    lys <- countOccurrences(seq, "K") * (1/(1 + 10^(1 * (pH - pK["K"]))))
    asp <- countOccurrences(seq, "D") * (-1/(1 + 10^(-1 * (pH - pK["D"]))))
    glu <- countOccurrences(seq, "E") * (-1/(1 + 10^(-1 * (pH - pK["E"]))))
    cys <- countOccurrences(seq, "C") * (-1/(1 + 10^(-1 * (pH - pK["C"]))))
    tyr <- countOccurrences(seq, "Y") * (-1/(1 + 10^(-1 * (pH - pK["Y"]))))
    aa_charge <- arg + lys + his + asp + glu + tyr + cys
    
    if (normalize) {
        aa_charge <- aa_charge / stri_length(gsub("[X\\.\\*-]", "", seq))
    }
    
    return(aa_charge)
}

#' Validate amino acid sequences
#'
#' \code{isValidAASeq} checks that a set of sequences are valid non-ambiguous 
#' amino acid sequences. A sequence is considered valid if it contains only 
#' characters in the the non-ambiguous IUPAC character set or any characters in 
#' \code{c("X", ".", "-", "*")}.
#'  
#' @param    seq  character vector of sequences to check.
#' 
#' @return   A logical vector with \code{TRUE} for each valid amino acid sequences 
#'           and \code{FALSE} for each invalid sequence.
#' @seealso 
#' See \link{ABBREV_AA} for the set of non-ambiguous amino acid characters.
#' See \link{IUPAC_AA} for the full set of ambiguous amino acid characters.
#' 
#' @examples 
#' seq <- c("CARDRSTPWRRGIASTTVRTSW", "XXTQMYVR--XX", "CARJ", "10") 
#' isValidAASeq(seq)
#' 
#' @export
isValidAASeq <- function(seq) {
    
    # Get valid amino acids from seqinr
    # for consistency with `gravy` and other
    # amino acid properties that don't consider
    # amino acid ambiguities and special encoded amino acids
    # http://pir.georgetown.edu/resid/faq.shtml#q01
    # Also include here characters for non informative positions
    valid_AA <- c(names(ABBREV_AA), "X", ".", "*", "-")
    
    .isValid <- function(aa) {
        all(aa %in% valid_AA)
    }
    
    return(sapply(strsplit(seq, ""), .isValid))
    
#    valid_AA <- paste(c(names(ABBREV_AA),"X.*-"),collapse="")
#    valid <- !grepl(paste0("[^",valid_AA,"]"), seq) & !is.na(seq)
#    valid
    
}

# Count patterns
# 
# Counts the number of times a "pattern" occurs in "x", a string
#
# @param   x        a string (usually amino acids)
# @param   pattern  regular expression to be matched in string
#  
# @return  number of times the regular expression was found
countOccurrences <- function(x, pattern) {
    return(sapply(gregexpr(pattern, x), function(y) { sum(y > 0) }))
}


#' Count sequence patterns
#' 
#' \code{countPatterns} counts the fraction of times a set of character patterns occur 
#' in a set of sequences.
#'
#' @param   seq         character vector of either DNA or amino acid sequences.
#' @param   patterns    list of sequence patterns to count in each sequence. If the 
#'                      list is named, then names will be assigned as the column names of 
#'                      output data.frame.
#' @param   nt		    if \code{TRUE} then \code{seq} are DNA sequences and and will be 
#'                      translated before performing the pattern search.
#' @param   trim        if \code{TRUE} remove the first and last codon or amino acid from 
#'                      each sequence before the pattern search. If \code{FALSE} do
#'                      not modify the input sequences.
#' @param   label       string defining a label to add as a prefix to the output 
#'                      column names.
#'  
#' @return  A data.frame containing the fraction of times each sequence pattern was 
#'          found.
#' 
#' @examples 
#' seq <- c("TGTCAACAGGCTAACAGTTTCCGGACGTTC",
#'          "TGTCAGCAATATTATATTGCTCCCTTCACTTTC",
#'          "TGTCAAAAGTATAACAGTGCCCCCTGGACGTTC")
#' patterns <- c("A", "V", "[LI]")
#' names(patterns) <- c("arg", "val", "iso_leu")
#' countPatterns(seq, patterns, nt=TRUE, trim=TRUE, label="cdr3")
#'             
#' @export
countPatterns <- function(seq, patterns, nt=TRUE, trim=FALSE, label="region") {
    # Translate sequences if nucleotide
    region_aa <- if (nt) { translateDNA(seq, trim=trim) } else { seq }
    
    # TODO: What is the proper length normalization? With or without non-informative position?
    
    # Calculate region lengths
    aa_length <- stri_length(region_aa)
    # Count occurrence of each amino acid pattern for each sequence
    out_df <- data.frame(matrix(0, nrow=length(region_aa), ncol=length(patterns)))
    # If patterns are unnamed, make the names X1...Xn
    if(is.null(names(patterns))) { names(patterns) <- names(out_df) }
    # If region name, append to names of patterns
    if(label != '') { 
        names(out_df) <- paste(label, names(patterns), sep="_") 
    } else {
        names(out_df) <- names(patterns)
    }
    # Iterate over patterns
    for(i in 1:length(patterns)) {
        out_df[, i] <- countOccurrences(region_aa, patterns[i]) / aa_length
    }
    return(out_df)
}


#' Calculates amino acid chemical properties for sequence data
#'
#' \code{aminoAcidProperties} calculates amino acid sequence physicochemical properties, including
#' length, hydrophobicity, bulkiness, polarity, aliphatic index, net charge, acidic residue
#' content, basic residue content, and aromatic residue content.
#'
#' @param   data          \code{data.frame} containing sequence data.
#' @param   property      vector strings specifying the properties to be calculated. Defaults
#'                        to calculating all defined properties.
#' @param   seq           \code{character} name of the column containing input 
#'                        sequences.
#' @param   nt      	  boolean, TRUE if the sequences (or sequence) are DNA and will be translated.
#' @param   trim          if \code{TRUE} remove the first and last codon/amino acids from each
#'                        sequence before calculating properties. If \code{FALSE} do
#'                        not modify input sequences.
#' @param   label         name of sequence region to add as prefix to output column names.
#' @param   ...           additional named arguments to pass to the functions 
#'                        \link{gravy}, \link{bulk}, \link{aliphatic}, \link{polar} or \link{charge}.
#' 
#' @return  A modified \code{data} data.frame with the following columns:
#'          \itemize{
#'            \item  \code{*_aa_length}:     number of amino acids.
#'            \item  \code{*_aa_gravy}:      grand average of hydrophobicity (gravy) index.
#'            \item  \code{*_aa_bulk}:       average bulkiness of amino acids.
#'            \item  \code{*_aa_aliphatic}:  aliphatic index.
#'            \item  \code{*_aa_polarity}:   average polarity of amino acids.
#'            \item  \code{*_aa_charge}:     net charge.
#'            \item  \code{*_aa_basic}:      fraction of informative positions that are 
#'                                           Arg, His or Lys.
#'            \item  \code{*_aa_acidic}:     fraction of informative positions that are 
#'                                           Asp or Glu.
#'            \item  \code{*_aa_aromatic}:   fraction of informative positions that are 
#'                                           His, Phe, Trp or Tyr.
#'            
#'          }
#'          
#'          Where \code{*} is the value from \code{label} or the name specified for 
#'          \code{seq} if \code{label=NULL}.
#'          
#' @details 
#' For all properties except for length, non-informative positions are excluded, 
#' where non-informative is defined as any character in \code{c("X", "-", ".", "*")}.
#' 
#' The scores for gravy, bulkiness and polarity are calculated as simple averages of the 
#' scores for each informative positions. The basic, acid and aromatic indices are 
#' calculated as the fraction of informative positions falling into the given category.
#' 
#' The aliphatic index is calculated using the Ikai, 1980 method.
#' 
#' The net charge is calculated using the method of Moore, 1985, excluding the N-terminus and
#' C-terminus charges, and normalizing by the number of informative positions.  The default 
#' pH for the calculation is 7.4.
#' 
#' The following data sources were used for the default property scores:
#' \itemize{
#'   \item  hydropathy:  Kyte & Doolittle, 1982.  
#'   \item  bulkiness:   Zimmerman et al, 1968. 
#'   \item  polarity:    Grantham, 1974.
#'   \item  pK:          EMBOSS.
#' }
#' 
#' @references
#' \enumerate{
#'   \item  Zimmerman JM, Eliezer N, Simha R. The characterization of amino acid sequences 
#'            in proteins by statistical methods. J Theor Biol 21, 170-201 (1968).
#'   \item  Grantham R. Amino acid difference formula to help explain protein evolution. 
#'            Science 185, 862-864 (1974).
#'   \item  Ikai AJ. Thermostability and aliphatic index of globular proteins. 
#'            J Biochem 88, 1895-1898 (1980).
#'   \item  Kyte J, Doolittle RF. A simple method for displaying the hydropathic character 
#'            of a protein. J Mol Biol 157, 105-32 (1982).
#'   \item  Moore DS. Amino acid and peptide net charges: A simple calculational procedure. 
#'            Biochem Educ 13, 10-11 (1985).
#'   \item  Wu YC, et al. High-throughput immunoglobulin repertoire analysis distinguishes 
#'            between human IgM memory and switched memory B-cell populations. 
#'            Blood 116, 1070-8 (2010).
#'   \item  Wu YC, et al. The relationship between CD27 negative and positive B cell 
#'            populations in human peripheral blood. 
#'            Front Immunol 2, 1-12 (2011).
#'   \item  \url{https://emboss.sourceforge.net/apps/cvs/emboss/apps/iep.html}
#' }
#' 
#' @seealso 
#' See \link{countPatterns} for counting the occurance of specific amino acid subsequences.
#' See \link{gravy}, \link{bulk}, \link{aliphatic}, \link{polar} and \link{charge} for functions 
#' that calculate the included properties individually.
#' 
#' @examples
#' # Subset example data
#' db <- ExampleDb[c(1,10,100), c("sequence_id", "junction")]
#' 
#' # Calculate default amino acid properties from DNA sequences
#' aminoAcidProperties(db, seq="junction")
#
#' # Calculate default amino acid properties from amino acid sequences
#' # Use a custom output column prefix
#' db$junction_aa <- translateDNA(db$junction)
#' aminoAcidProperties(db, seq="junction_aa", label="junction", nt=FALSE)
#' 
#' # Use the Grantham, 1974 side chain volume scores from the seqinr package
#' # Set pH=7.0 for the charge calculation
#' # Calculate only average volume and charge
#' # Remove the head and tail amino acids from the junction, thus making it the CDR3
#' library(seqinr)
#' data(aaindex)
#' x <- aaindex[["GRAR740103"]]$I
#' # Rename the score vector to use single-letter codes
#' names(x) <- translateStrings(names(x), ABBREV_AA)
#' # Calculate properties
#' aminoAcidProperties(db, property=c("bulk", "charge"), seq="junction", 
#'                     trim=TRUE, label="cdr3", bulkiness=x, pH=7.0)
#'
#' @export
aminoAcidProperties <- function(data, property=c("length", "gravy", "bulk",
                                                 "aliphatic","polarity","charge",
                                                 "basic","acidic", "aromatic"),
                                seq="junction", nt=TRUE, trim=FALSE, label=NULL, ...) {
    # Check arguments
    property <- match.arg(property, several.ok=TRUE)
    
    # Define the data.frame that will be returned with amino acid properties
    prop_colnames <- list(
        "length"    = "aa_length",
        "gravy"     = "aa_gravy",
        "bulk"      = "aa_bulk",
        "aliphatic" = "aa_aliphatic",
        "polarity"  = "aa_polarity",
        "charge"    = "aa_charge",
        "basic"     = "aa_basic",
        "acidic"    = "aa_acidic",
        "aromatic"  = "aa_aromatic"
    )
    # If no label, use sequence column name
    if (is.null(label)) { label <- seq }
    prop_colnames <- lapply(prop_colnames, function(x) { paste(label,x,sep="_") })
    
    out_df <- data.frame(matrix(NA, nrow=nrow(data), ncol=length(property)))
    colnames(out_df) <- prop_colnames[property]
    
    # Check if out_df column names already existed in data
    # if yes, throw warning
    check <- checkColumns(data, colnames(out_df))
    if (any(check == TRUE)) { warning("Duplicated columns found. Overwriting previous values.")}
    # Check input
    if (length(seq) > 1) {
        stop("You may specify only one sequence column; seq must be a vector of length 1.")
    }
    check <- checkColumns(data, seq)
    if (check != TRUE) { stop(check) }
    
    # Assign ellipsis arguments to correct function
    dots <- list(...)
    args_gravy <- dots[names(dots) %in% names(formals(gravy))]
    args_bulk <- dots[names(dots) %in% names(formals(bulk))]
    args_aliphatic <- dots[names(dots) %in% names(formals(aliphatic))]
    args_polar <- dots[names(dots) %in% names(formals(polar))]
    args_charge <- dots[names(dots) %in% names(formals(charge))]
    
    # Get sequence vector and translate if required
    region <- as.character(data[[seq]])
    region_aa <- if (nt) { translateDNA(region, trim=trim) } else { region }
    
    ## Will retrieve properties for valid sequences only
    ## keep index to fill results data.frame
    valid_seq <- isValidAASeq(region_aa)
    if (any(valid_seq == F) ){
        not_valid_num <- sum(!valid_seq)
        warning(paste0("Found ", not_valid_num , " sequences with non valid amino acid symbols"))
    }
    valid_seq_idx <- which(valid_seq)
    region_aa <- region_aa[valid_seq_idx]
    
    # Calculate region lengths
    if ("length" %in% property) {
        aa_length <- stri_length(region_aa)
        out_df[valid_seq_idx , prop_colnames$length] <- aa_length
    }
    # Average hydrophobicity
    if ("gravy" %in% property) {
        #aa_gravy <- gravy(region_aa, hydropathy)
        aa_gravy <- do.call('gravy', c(list(seq=region_aa), args_gravy))
        out_df[valid_seq_idx , prop_colnames$gravy] <- aa_gravy
    }
    # Average bulkiness
    if ("bulk" %in% property) {
        #aa_bulk <- bulk(region_aa)
        aa_bulk <- do.call('bulk', c(list(seq=region_aa), args_bulk))
        out_df[valid_seq_idx , prop_colnames$bulk] <- aa_bulk
    }
    if ("aliphatic" %in% property) {
        # Normalizes aliphatic index
        aa_aliphatic <- do.call('aliphatic', c(list(seq=region_aa), args_aliphatic))
        out_df[valid_seq_idx , prop_colnames$aliphatic] <- aa_aliphatic
    }
    # Average polarity
    if ("polarity" %in% property) {
        #aa_polarity <- polar(region_aa)
        aa_polarity <- do.call('polar', c(list(seq=region_aa), args_polar))
        out_df[valid_seq_idx , prop_colnames$polarity] <- aa_polarity
    }
    # Normalized net charge
    if ("charge" %in% property) {
        #aa_charge <- charge(region_aa)
        aa_charge <- do.call('charge', c(list(seq=region_aa), args_charge))
        out_df[valid_seq_idx , prop_colnames$charge] <- aa_charge
    }
    
    # Count of informative positions
    aa_info <- stri_length(gsub("[X\\.\\*-]", "", region_aa))
    # Fraction of amino acid that are basic
    if ("basic" %in% property) {
        aa_basic <- countOccurrences(region_aa, "[RHK]") / aa_info
        out_df[valid_seq_idx , prop_colnames$basic] <- aa_basic
    }
    # Fraction of amino acid that are acidic
    if ("acidic" %in% property) {
        aa_acidic <- countOccurrences(region_aa, "[DE]") / aa_info
        out_df[valid_seq_idx , prop_colnames$acidic] <- aa_acidic
    }
    # Count fraction of aa that are aromatic
    if ("aromatic" %in% property) {
        aa_aromatic <- countOccurrences(region_aa, "[FWHY]") / aa_info
        out_df[valid_seq_idx , prop_colnames$aromatic] <- aa_aromatic
    }
    
    data_cols <- colnames(data) %in% colnames(out_df) == FALSE
    return(cbind(data[, data_cols], out_df))
}
Any scripts or data that you put into this service are public.
alakazam documentation built on Sept. 30, 2023, 9:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
alakazam
Immunoglobulin Clonal Lineage and Diversity Analysis

R/AminoAcids.R
In alakazam: Immunoglobulin Clonal Lineage and Diversity Analysis

Defines functions aminoAcidProperties countPatterns countOccurrences isValidAASeq charge aliphatic gravy polar bulk

Documented in aliphatic aminoAcidProperties bulk charge countPatterns gravy isValidAASeq polar

Try the alakazam package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

alakazam Immunoglobulin Clonal Lineage and Diversity Analysis

R/AminoAcids.R In alakazam: Immunoglobulin Clonal Lineage and Diversity Analysis

Defines functions aminoAcidProperties countPatterns countOccurrences isValidAASeq charge aliphatic gravy polar bulk

Documented in aliphatic aminoAcidProperties bulk charge countPatterns gravy isValidAASeq polar

Try the alakazam package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

alakazam
Immunoglobulin Clonal Lineage and Diversity Analysis

R/AminoAcids.R
In alakazam: Immunoglobulin Clonal Lineage and Diversity Analysis