# R/structuralTendency.R
#
# Defines functions: structuralTendencyPlot, structuralTendency
#
# Documented in: structuralTendency, structuralTendencyPlot

#' Structural Tendency of Amino Acid Residues
#'
#' Each amino acid residue has a tendency to impact the order / disorder
#'   of the amino acid sequence. Some residues are disorder promoting, meaning
#'   they tend to favor disorder over ordered structures. These are typically
#'   hydrophilic, charged, or small residues. Order promoting residues tend
#'   to be aliphatic, hydrophobic, aromatic, or form tertiary structures.
#'   Disorder neutral residues neither favor order nor disordered structures.
#' @inheritParams sequenceCheck
#' @param printCitation logical, FALSE by default.
#'    When \code{printCitation = TRUE}, a citation to Uversky, V. N. (2013)
#'    is emitted as a single-line \code{\link[base]{message}}.
#'    This is the paper categorizing the structural impact of each
#'    residue that is used as the default settings.
#' @param disorderPromoting,disorderNeutral,orderPromoting character vectors
#'    of individual residues to be matched with the input sequence. Defaults:
#'    \itemize{
#'      \item disorderPromoting = c("P", "E", "S", "Q", "K", "A", "G")
#'      \item orderPromoting =
#'         c("M", "N", "V", "H", "L", "F", "Y", "I", "W", "C")
#'      \item disorderNeutral = c("D", "T", "R")
#'    }
#'    It is not recommended to change these. These definitions are from
#'    Uversky (2013).
#'    If a residue is listed in more than one set, \code{disorderNeutral}
#'    takes precedence over \code{orderPromoting}, which takes precedence
#'    over \code{disorderPromoting}.
#' @return a data frame with one row per residue and the columns
#'   \code{Position} (numeric), \code{AA} (character) and
#'   \code{Tendency} (character). \code{Tendency} is one of
#'   "Disorder Promoting", "Disorder Neutral" or "Order Promoting", as
#'   defined by disorderPromoting, disorderNeutral, and orderPromoting.
#'   Residues found in none of the three sets have \code{Tendency = NA}.
#'   For convenient plotting see \code{\link{structuralTendencyPlot}}.
#' @family structural tendency
#' @references
#'   Uversky, V. N. (2013). A decade and a half of protein intrinsic disorder:
#'   Biology still waits for physics. Protein Science, 22(6), 693-724.
#'   \url{https://doi.org/10.1002/pro.2261}. \cr
#'   Kulkarni, Prakash, and Vladimir N. Uversky. "Intrinsically
#'   disordered proteins: the dark horse of the dark proteome."
#'   Proteomics 18.21-22 (2018): 1800061.
#'   \url{https://doi.org/10.1002/pmic.201800061}.
#' @export
#' @examples
#' #Amino acid sequences can be character strings
#' aaString <- "ACDEFGHIKLMNPQRSTVWY"
#' #Amino acid sequences can also be character vectors
#' aaVector <- c("A", "C", "D", "E", "F",
#'               "G", "H", "I", "K", "L",
#'               "M", "N", "P", "Q", "R",
#'               "S", "T", "V", "W", "Y")
#' #Alternatively, .fasta files can also be used by providing
#' ##The path to the file as a character string
#'
#' exampleDF <- structuralTendency(aaString)
#' head(exampleDF)
#' exampleDF <- structuralTendency(aaVector)
#' head(exampleDF)
#'
#' #This example shows if a user changes the default definition of residues.
#' ##These residues are labeled as such from Dunker et al (2001),
#' ##"Intrinsically disordered protein."
#' exampleDF <- structuralTendency(aaString,
#'                disorderPromoting = c("A", "R", "G", "Q", "S", "P", "E", "K"),
#'                disorderNeutral = c("H", "M", "T", "D"),
#'                orderPromoting = c("W", "C", "F", "I", "Y", "V", "L", "N"))
#' head(exampleDF)
structuralTendency <- function(
    sequence,
    disorderPromoting = c("P", "E", "S", "Q", "K", "A", "G"),
    disorderNeutral = c("D", "T", "R"),
    orderPromoting = c("M", "N", "V", "H", "L", "F", "Y", "I", "W", "C"),
    printCitation = FALSE) {
    #----- Validates the input and converts it to a vector of residues
    seqCharacterVector <- sequenceCheck(
        sequence = sequence,
        method = "stop",
        outputType = "vector",
        suppressOutputMessage = TRUE)
    sequenceLength <- length(seqCharacterVector)

    #----- Matches residue with tendency
    # Residues that belong to none of the three sets keep the NA placeholder.
    # The assignment order is deliberate: a later assignment overwrites an
    # earlier one, so for overlapping sets neutral > order > disorder.
    structuralTendencyVector <- rep(NA_character_, sequenceLength)

    disorderedResidues <- seqCharacterVector %in% disorderPromoting
    structuralTendencyVector[disorderedResidues] <- "Disorder Promoting"

    orderedResidues <- seqCharacterVector %in% orderPromoting
    structuralTendencyVector[orderedResidues] <- "Order Promoting"

    neutralResidues <- seqCharacterVector %in% disorderNeutral
    structuralTendencyVector[neutralResidues] <- "Disorder Neutral"

    #----- makes the data frame for output
    structureTendencyDF <- data.frame(Position = seq_len(sequenceLength),
                                        AA = seqCharacterVector,
                                        Tendency = structuralTendencyVector,
                                        stringsAsFactors = FALSE)

    # Explicit conversions keep the column types stable regardless of the
    # type returned by sequenceCheck() or the R version's factor defaults.
    structureTendencyDF$AA <- as.character(structureTendencyDF$AA)
    structureTendencyDF$Tendency <- as.character(structureTendencyDF$Tendency)
    structureTendencyDF$Position <- as.numeric(structureTendencyDF$Position)

    if (printCitation) {
        # Built from pieces so that the emitted text contains no embedded
        # newlines or source-code indentation.
        residueCitation <- paste(
            "Uversky, V. N. (2013).",
            "A decade and a half of protein intrinsic disorder:",
            "Biology still waits for physics.",
            "Protein Science, 22(6), 693-724.",
            "doi:10.1002/pro.2261")
        message(residueCitation)
    }
    return(structureTendencyDF)
}



#' Plotting Structural Tendency of Amino Acid Sequence
#'
#' Convenient graphing for the \code{\link{structuralTendency}} function.
#'
#' @param sequence amino acid sequence (or pathway to a fasta file)
#'    as a character string. Supports multiple sequences / files, as a
#'    character vector of strings.
#' @param graphType character string, "pie" by default.
#'   graphType must be one of "pie", "bar" or "none".
#'   When \code{graphType = "pie"}, the output is a pie chart.
#'   When \code{graphType = "bar"}, the output is a bar chart.
#'   When \code{graphType = "none"}, the output is the data frame that would
#'   otherwise be used to plot the data.
#' @param summarize logical value, FALSE by default.
#'   When \code{summarize = TRUE}, each residue is aggregated into Disorder
#'   Tendency Groups. (See \code{\link{structuralTendency}} for more details).
#'   When \code{summarize = FALSE}, residue identity is preserved, and
#'   the output is colored by Disorder Tendency Groups.
#' @param alphabetical logical value, FALSE by default.
#'   Order of residues on plot axis. Only relevant when
#'    \code{summarize = FALSE}, otherwise is ignored.
#'    If FALSE, ordering is grouped by Disorder Tendency, following the
#'    order of \code{disorderPromoting}, \code{disorderNeutral} and
#'    \code{orderPromoting} (P, E, S, ..., W, C with the defaults). Residues
#'    present in the sequence but in none of these sets are placed last.
#'    If TRUE, the residues are ordered alphabetically (A, C, D, E, ..., W, Y).
#' @param proteinName optional character string. NA by default.
#'   When supplied, the name of the protein is added to the plot title.
#' @param ... currently not used by this function; accepted so that
#'   existing calls passing additional arguments keep working.
#' @param disorderPromoting,disorderNeutral,orderPromoting character vectors
#'    of individual residues to be matched with the input sequence. Defaults:
#'    \itemize{
#'      \item disorderPromoting = c("P", "E", "S", "Q", "K", "A", "G")
#'      \item orderPromoting =
#'         c("M", "N", "V", "H", "L", "F", "Y", "I", "W", "C")
#'      \item disorderNeutral = c("D", "T", "R")
#'    }
#'    It is not recommended to change these.
#' @return When \code{graphType} is "pie" or "bar", a ggplot object.
#'   When \code{graphType = "none"}, a data frame with the columns
#'   \code{AA}, \code{Tendency}, \code{Total} (count) and \code{Frequency}
#'   (percent of sequence length); with \code{summarize = TRUE} there is one
#'   row per tendency group, otherwise one row per distinct residue.
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @family structural tendency
#' @references
#'   Uversky, V. N. (2013). A decade and a half of protein intrinsic disorder:
#'   Biology still waits for physics. Protein Science, 22(6), 693-724.
#'   \url{https://doi.org/10.1002/pro.2261}. \cr
#'   Kulkarni, Prakash, and Vladimir N. Uversky. "Intrinsically
#'   disordered proteins: the dark horse of the dark proteome."
#'   Proteomics 18.21-22 (2018): 1800061.
#'   \url{https://doi.org/10.1002/pmic.201800061}.
#'
#' @section Plot Colors:
#'   For users who wish to keep a common aesthetic, the following colors are
#'   used when graphType = "bar" or "pie". Colors are matched to the groups
#'   by name, so they stay the same when a group is absent from the
#'   sequence. \cr
#'   \itemize{
#'   \item Disorder Neutral = "#F0B5B3"
#'   \item Disorder Promoting = "darkolivegreen3" or "#A2CD5A"
#'   \item Order Promoting = "darkorchid1" or "#BF3EFF"
#'   }
#'
#' @export
#' @examples
#' #Amino acid sequences can be character strings
#' aaString <- "ACDEFGHIKLMNPQRSTVWY"
#' #Amino acid sequences can also be character vectors
#' aaVector <- c("A", "C", "D", "E", "F",
#'               "G", "H", "I", "K", "L",
#'               "M", "N", "P", "Q", "R",
#'               "S", "T", "V", "W", "Y")
#' #Alternatively, .fasta files can also be used by providing
#' ##The path to the file as a character string
#'
#' structuralTendencyPlot(aaString)
#' structuralTendencyPlot(aaVector)
#'
#' #The plot can be a pie chart (default)
#' structuralTendencyPlot(aaString,
#'                     graphType = "pie")
#'
#' #Or the plot can be a bar graph
#' structuralTendencyPlot(aaString,
#'                     graphType = "bar")
#'
#' #To display general tendency rather than residues, set summarize = TRUE
#' structuralTendencyPlot(aaString,
#'                     graphType = "pie",
#'                     summarize = TRUE)
#'
#' structuralTendencyPlot(aaString,
#'                     graphType = "bar",
#'                     summarize = TRUE)
#'
#' #If you wish to export this as a dataframe, set graphType = "none"
#' exampleDF <- structuralTendencyPlot(aaString,
#'                                   graphType = "none")
#' head(exampleDF)
#'
#' #If using a different definition of disordered residues
#' ##These residues are labeled as such from Dunker et al (2001),
#' ##"Intrinsically disordered protein."
#' structuralTendencyPlot(aaString,
#'               disorderPromoting = c("A", "R", "G", "Q", "S", "P", "E", "K"),
#'               disorderNeutral = c("H", "M", "T", "D"),
#'               orderPromoting = c("W", "C", "F", "I", "Y", "V", "L", "N"),
#'               graphType = "bar",
#'               alphabetical = TRUE)
structuralTendencyPlot <- function(sequence, graphType = "pie",
    summarize = FALSE, proteinName = NA, alphabetical = FALSE,
    disorderPromoting = c("P", "E", "S", "Q", "K", "A", "G"),
    disorderNeutral = c("D", "T", "R"),
    orderPromoting = c("M", "N", "V", "H", "L", "F", "Y", "I", "W", "C"),
    ...) {
    #----- Argument validation
    # Both flags are used directly in if(), so they must be single,
    # non-missing logical values.
    isFlag <- function(x) is.logical(x) && length(x) == 1L && !is.na(x)
    if (!isFlag(summarize) || !isFlag(alphabetical)) {
        stop("summarize and alphabetical must be logical values")
    }
    if (length(graphType) != 1L ||
        !graphType %in% c("pie", "bar", "none")) {
        stop("invalid argument for graphType. Please see documentation")
    }

    #----- Per-residue tendencies
    structuralTendencyDF <- structuralTendency(sequence = sequence,
                                        disorderPromoting = disorderPromoting,
                                        disorderNeutral = disorderNeutral,
                                        orderPromoting = orderPromoting)
    sequenceLength <- nrow(structuralTendencyDF)

    #----- Aggregates counts by tendency group or by residue
    if (summarize) {
        structuralTendencyDF <- data.frame(table(structuralTendencyDF$Tendency))
        names(structuralTendencyDF) <- c("Tendency", "Total")
        structuralTendencyDF$Frequency <- structuralTendencyDF$Total /
                                                sequenceLength * 100
        # AA holds the group name so both modes share the same plotting code
        structuralTendencyDF$AA <- as.character(structuralTendencyDF$Tendency)
    } else {
        structuralTendencyDF <- structuralTendencyDF[, c("AA", "Tendency")]
        residueFrequencyDF <- data.frame(table(structuralTendencyDF$AA))
        names(residueFrequencyDF) <- c("AA", "Total")
        structuralTendencyDF <- unique(structuralTendencyDF)
        # Result columns are AA, Tendency, Total (merge key first)
        structuralTendencyDF <- merge(structuralTendencyDF, residueFrequencyDF,
                                        by = "AA")
        structuralTendencyDF$Frequency <- round(structuralTendencyDF$Total /
                                                sequenceLength * 100, 3)
        if (!alphabetical) {
            # Order follows the supplied residue sets (identical to the
            # historical P, E, S, ..., W, C order with the defaults).
            # Residues outside every set are appended so they are not
            # turned into NA by factor().
            aaOrder <- unique(c(disorderPromoting, disorderNeutral,
                                orderPromoting))
            aaOrder <- c(aaOrder,
                        setdiff(as.character(structuralTendencyDF$AA),
                                aaOrder))
            structuralTendencyDF$AA <- factor(structuralTendencyDF$AA,
                                                levels = aaOrder)
        }
    }

    if (graphType == "none") {
        return(structuralTendencyDF)
    }

    #----- Plotting
    # Named so each group keeps its color even when another group is
    # missing from the data.
    tendencyColors <- c("Disorder Neutral" = "#F0B5B3",
                        "Disorder Promoting" = "#A2CD5A",
                        "Order Promoting" = "#BF3EFF")

    if (graphType == "bar") {
        gg <- ggplot2::ggplot(data = structuralTendencyDF,
                        ggplot2::aes(x = .data$AA, y = .data$Frequency,
                                    fill = .data$Tendency,
                                    group = .data$Tendency)) +
            ggplot2::geom_bar(stat = "identity") +
            ggplot2::theme_bw() +
            ggplot2::xlab("AA")
    } else {
        #------Data needs mutated to label residues
        # ypos is the midpoint of each slice along the stacked axis
        structuralTendencyDF <- structuralTendencyDF %>%
            dplyr::arrange(dplyr::desc(.data$Tendency)) %>%
            dplyr::mutate(prop = .data$Total / sum(.data$Total) * 100) %>%
            dplyr::mutate(ypos = cumsum(.data$prop) - 0.5 * .data$prop)
        gg <- ggplot2::ggplot(structuralTendencyDF,
                        ggplot2::aes(x = "", y = .data$prop,
                                    fill = .data$Tendency)) +
            ggplot2::geom_bar(stat = "identity", width = 1,
                            color = "white") +
            ggplot2::coord_polar("y", start = 0) +
            ggplot2::theme_void() +
            ggplot2::geom_text(ggplot2::aes(y = .data$ypos,
                                            label = .data$AA),
                            color = "white", size = 4)
    }

    plotTitle <- "Compositional Profile"
    if (length(proteinName) == 1L && !is.na(proteinName)) {
        plotTitle <- paste0("Compositional Profile of ", proteinName)
    }
    yTitle <- "Amino Acid Composition (as % of sequence length)"
    # The legend title is set explicitly so it does not depend on how the
    # installed ggplot2 version derives labels from .data expressions.
    gg <- gg +
        ggplot2::labs(title = plotTitle, y = yTitle, fill = "Tendency") +
        ggplot2::theme(legend.position = "top",
                        plot.title = ggplot2::element_text(hjust = 0.5)) +
        ggplot2::scale_fill_manual(values = tendencyColors)
    return(gg)
}
# wmm27/idpr documentation built on Jan. 12, 2023, 8:45 a.m.