R/make_plot_data.R

Defines functions makeMeanPSI makeMatrix

Documented in makeMatrix makeMeanPSI

#' Construct data of percent-spliced-in (PSI) matrices and group-average PSIs
#'
#' `makeMatrix()` constructs a matrix of PSI values of the given alternative
#' splicing events (ASEs).\cr\cr
#' `makeMeanPSI()` constructs a table of "average" PSI values, with samples
#' grouped by a number of given conditions (e.g. "group A" and "group B") of a
#' given condition category (e.g. condition "treatment"). 
#' See details below.
#' @details
#' Note that this function takes the geometric mean of PSI, by first converting
#'   all values to logit(PSI), taking the average logit(PSI) values of each
#'   condition, and then converting back to PSI using inverse logit.
#'
#' Samples with low splicing coverage (either due to insufficient sequencing
#' depth or low gene expression) are excluded from calculation of mean PSIs.
#' The threshold can be set using `depth_threshold`. Excluding these samples is
#' appropriate because the uncertainty of PSI is high when the total included /
#' excluded count is low. Note that events where all samples in a condition is
#' excluded will return a value of `NaN`.
#'
#' Using logit-transformed PSI values is appropriate because PSI values are
#'   bound to the (0,1) interval, and are often thought to be beta-distributed.
#'   The link function often used with beta-distributed models is the logit
#'   function, which is defined as `logit(x) = function(x) log(x / (1 - x))`,
#' and is equivalent to [stats::qlogis]. Its inverse is equivalent to
#' [stats::plogis].
#'
#' Users wishing to calculate arithmetic means of PSI are advised to use
#'   [makeMatrix], followed by [rowMeans] on subsetted sample columns.
#' @param se (Required) A \linkS4class{NxtSE} object generated by [makeSE]
#' @param event_list A character vector containing the names of ASE events
#'   (as given by the `EventName` column of differential ASE results table
#'   generated by one of the [ASE-methods], or
#'   the rownames of the \linkS4class{NxtSE} object)
#' @param sample_list (default = `colnames(se)`) In `makeMatrix()`, a list of
#'   sample names in the given experiment to be included in the returned matrix
#' @param method In `makeMatrix()`, rhe values to be returned
#'   (default = "PSI"). It can
#'   alternately be "logit" which returns logit-transformed PSI values, or
#'   "Z-score" which returns Z-score-transformed PSI values
#' @param depth_threshold (default = 10) Samples with the number of reads
#'   supporting either included or excluded isoforms below this values are
#'   excluded
#' @param logit_max PSI values close to 0 or 1 are rounded up/down
#'  to `plogis(-logit_max)` and `plogis(logit_max)`, respectively. See details.
#' @param na.percent.max (default = 0.1) The maximum proportion of values in
#'   the given dataset that were transformed to `NA` because of low splicing
#'   depth. ASE events where there are a higher proportion (default 10%) `NA`
#'   values will be excluded from the final matrix. Most heatmap
#'   functions will spring an error if there are too many NA values in any
#'   given row. This option caps the number of NA values to avoid returning
#'   this error.
#' @param condition The name of the column containing the condition values in
#'   `colData(se)`
#' @param conditionList A list (or vector) of condition values of which to
#' calculate mean PSIs
#' @return
#' For `makeMatrix`: A matrix of PSI (or alternate) values, with
#' columns as samples and rows as ASE events.
#'
#' For `makeMeanPSI`: A 3 column data frame, with the first column containing
#'   `event_list` list of ASE events, and the last 2 columns containing the
#'   average PSI values of the nominator and denominator conditions.
#' @examples
#' se <- SpliceWiz_example_NxtSE()
#'
#' colData(se)$treatment <- rep(c("A", "B"), each = 3)
#'
#' event_list <- rowData(se)$EventName
#'
#' mat <- makeMatrix(se, event_list[1:10])
#'
#' diag_values <- makeMeanPSI(se, event_list,
#'   condition = "treatment", 
#'   conditionList = list("A", "B")
#' )
#' @name make_plot_data
#' @aliases makeMatrix makeMeanPSI
#' @md
NULL

#' @describeIn make_plot_data constructs a matrix of PSI values of the given
#' alternative splicing events (ASEs)
#' @export
makeMatrix <- function(
        se,
        event_list,
        sample_list = colnames(se),
        method = c("PSI", "logit", "Z-score"),
        depth_threshold = 10, logit_max = 5, na.percent.max = 0.1
) {
    if (!any(event_list %in% rownames(se))) .log(
        "None of events in event_list matches those in the NxtSE object")

    method <- match.arg(method)
    inc <- as.matrix(assay(se, "Included")[
        event_list, sample_list, drop = FALSE])
    exc <- as.matrix(assay(se, "Excluded")[
        event_list, sample_list, drop = FALSE])
    mat <- inc / (inc + exc)
    mat[inc + exc < depth_threshold] <- NA
    
    if(!is.na(na.percent.max)) {
        which_exclude <- rowSums(is.na(mat)) < na.percent.max * ncol(mat)
        mat <- mat[which_exclude, , drop = FALSE]
    }
    
    if (method == "PSI") {
        # essentially M/Cov
    } else if (method == "logit") {
        mat <- qlogis(mat)
        mat[mat > logit_max] <- logit_max
        mat[mat < -logit_max] <- -logit_max
    } else if (method == "Z-score") {
        mat <- mat - rowMeans(mat)
        mat <- mat / rowSds(mat)
    }
    return(as.matrix(mat))
}


#' @describeIn make_plot_data constructs a table of "average" PSI values
#' @export
makeMeanPSI <- function(
        se, event_list = rownames(se),
        condition, 
        conditionList,
        depth_threshold = 10, logit_max = 10
) {
    if (!any(event_list %in% rownames(se))) .log(
        "None of events in event_list matches those in the NxtSE object")

    mat <- makeMatrix(
        se, event_list,
        colnames(se)[colData(se)[, condition] %in% conditionList],
        method = "logit",
        depth_threshold = depth_threshold,
        logit_max = logit_max,
        na.percent.max = NA
    )

    if(nrow(mat) < length(event_list) || !all(rownames(mat) == event_list)) {
        .log("Error retrieving matrix via makeMatrix")
    }
    
    # use logit method to calculate geometric mean
    df <- data.frame(EventName = event_list)
    for(cond in conditionList) {
        if(cond %in% colData(se)[, condition]) {
            cond_samples <- colnames(se)[
                colData(se)[, condition] == cond]
            df$newcond <- plogis(rowMeans(mat[, cond_samples], na.rm = TRUE))
            colnames(df)[ncol(df)] <- paste(condition, cond, sep = "_")
        } else {
            .log(paste(
                cond, "not found in", condition, "-", cond, "ignored"
            ), "message")
            df$newcond <- rep(NA, nrow(df))
            colnames(df)[ncol(df)] <- paste(condition, cond, sep = "_")
        }
    }
    return(df)
}
alexchwong/SpliceWiz documentation built on March 17, 2024, 3:16 a.m.