R/make_plot_data.R

Defines functions make_diagonal make_matrix

Documented in make_diagonal make_matrix

#' Construct data of percent-spliced-in (PSI) matrices and "diagonal" for
#' heatmaps and scatter plots
#'
#' `make_matrix()` constructs a matrix of PSI values of the given alternative
#' splicing events (ASEs).\cr\cr
#' `make_diagonal()` constructs a table of "average" PSI values, with samples
#' grouped by two given conditions (e.g. "group A" and "group B") of a given
#' condition category (e.g. condition "treatment"). See details below.\cr\cr
#' @details
#' Note that this function takes the geometric mean of PSI, by first converting
#'   all values to logit(PSI), taking the average logit(PSI) values of each
#'   condition, and then converting back to PSI using inverse logit.
#'
#' Samples with low splicing coverage (either due to insufficient sequencing
#' depth or low gene expression) are excluded from calculation of mean PSIs.
#' The threshold can be set using `depth_threshold`. Excluding these samples is
#' appropriate because the uncertainty of PSI is high when the total included /
#' excluded count is low. Note that events where all samples in a condition is
#' excluded will return a value of `NaN`.
#'
#' Using logit-transformed PSI values is appropriate because PSI values are
#'   bound to the (0,1) interval, and are often thought to be beta-distributed.
#'   The link function often used with beta-distributed models is the logit
#'   function, which is defined as `logit(x) = function(x) log(x / (1 - x))`,
#' and is equivalent to [stats::qlogis]. Its inverse is equivalent to
#' [stats::plogis].
#'
#' Users wishing to calculate arithmetic means of PSI are advised to use
#'   [make_matrix], followed by [rowMeans] on subsetted sample columns.
#' @param se (Required) A \linkS4class{NxtSE} object generated by [MakeSE]
#' @param event_list A character vector containing the row names of ASE events
#'   (as given by the `EventName` column of differential ASE results table
#'   using `limma_ASE()` or `DESeq_ASE()`)
#' @param sample_list (default = `colnames(se)`) In `make_matrix()`, a list of
#'   sample names in the given experiment to be included in the returned matrix
#' @param method In `make_matrix()`, rhe values to be returned
#'   (default = "PSI"). It can
#'   alternately be "logit" which returns logit-transformed PSI values, or
#'   "Z-score" which returns Z-score-transformed PSI values
#' @param depth_threshold (default = 10) Samples with the number of reads
#'   supporting either included or excluded isoforms below this values are
#'   excluded
#' @param logit_max (default = 5) PSI values close to 0 or 1 are rounded up/down
#'  to `plogis(-logit_max)` and `plogis(logit_max)`, respectively. See details.
#' @param na.percent.max (default = 0.1) The maximum proportion of values in
#'   the given dataset that were transformed to `NA` because of low splicing
#'   depth. ASE events where there are a higher proportion (default 10%) `NA`
#'   values will be excluded from the final matrix. Most heatmap
#'   functions will spring an error if there are too many NA values in any
#'   given row. This option caps the number of NA values to avoid returning
#'   this error.
#' @param condition The name of the column containing the condition values in
#'   `colData(se)`
#' @param nom_DE The condition to be contrasted, e.g. `nom_DE = "treatment"`
#' @param denom_DE The condition to be contrasted against, e.g.
#'   `denom_DE = "control"`
#' @return
#' For `make_matrix`: A matrix of PSI (or alternate) values, with
#' columns as samples and rows as ASE events.
#'
#' For `make_diagonal`: A 3 column data frame, with the first column containing
#'   `event_list` list of ASE events, and the last 2 columns containing the
#'   average PSI values of the nominator and denominator conditions.
#' @examples
#' se <- NxtIRF_example_NxtSE()
#'
#' colData(se)$treatment <- rep(c("A", "B"), each = 3)
#'
#' event_list <- rowData(se)$EventName
#'
#' mat <- make_matrix(se, event_list[1:10])
#'
#' diag_values <- make_diagonal(se, event_list,
#'   condition = "treatment", nom_DE = "A", denom_DE = "B"
#' )
#' @name make_plot_data
#' @aliases make_matrix make_diagonal
#' @md
NULL

#' @describeIn make_plot_data constructs a matrix of PSI values of the given
#' alternative splicing events (ASEs)
#' @export
make_matrix <- function(
        se,
        event_list,
        sample_list = colnames(se),
        method = c("PSI", "logit", "Z-score"),
        depth_threshold = 10, logit_max = 5, na.percent.max = 0.1
) {
    if (!any(event_list %in% rownames(se))) .log(
        "None of events in event_list matches those in the NxtSE object")

    method <- match.arg(method)
    inc <- as.matrix(assay(se, "Included")[
        event_list, sample_list, drop = FALSE])
    exc <- as.matrix(assay(se, "Excluded")[
        event_list, sample_list, drop = FALSE])
    mat <- inc / (inc + exc)
    mat[inc + exc < depth_threshold] <- NA
    mat <- mat[rowSums(is.na(mat)) < na.percent.max * ncol(mat), , drop = FALSE]
    if (method == "PSI") {
        # essentially M/Cov
        return(mat)
    } else if (method == "logit") {
        mat <- qlogis(mat)
        mat[mat > logit_max] <- logit_max
        mat[mat < -logit_max] <- -logit_max
        return(mat)
    } else if (method == "Z-score") {
        mat <- mat - rowMeans(mat)
        mat <- mat / rowSds(mat)
        return(mat)
    }

}


#' @describeIn make_plot_data constructs a table of "average" PSI values
#' @export
make_diagonal <- function(
        se, event_list = rownames(se),
        condition, nom_DE, denom_DE,
        depth_threshold = 10, logit_max = 5
) {
    if (!any(event_list %in% rownames(se))) .log(
        "None of events in event_list matches those in the NxtSE object")

    inc <- assay(se, "Included")[event_list, ]
    exc <- assay(se, "Excluded")[event_list, ]
    mat <- inc / (inc + exc)
    mat[inc + exc < depth_threshold] <- NA

    # use logit method to calculate geometric mean

    mat.nom <- qlogis(mat[, colData(se)[, condition] == nom_DE])
    mat.denom <- qlogis(mat[, colData(se)[, condition] == denom_DE])

    mat.nom[mat.nom > logit_max] <- logit_max
    mat.denom[mat.denom > logit_max] <- logit_max
    mat.nom[mat.nom < -logit_max] <- -logit_max
    mat.denom[mat.denom < -logit_max] <- -logit_max

    df <- data.frame(EventName = event_list,
        nom = plogis(rowMeans(mat.nom, na.rm = TRUE)),
        denom = plogis(rowMeans(mat.denom, na.rm = TRUE)))

    return(df)
}
alexchwong/NxtIRFcore documentation built on Oct. 31, 2022, 9:14 a.m.