#' Calculate MA-plot data
#'
#' Calculate MA-plot data
#'
#' This function takes a numeric matrix as input, and calculates
#' data sufficient to produce MA-plots. The default output is a
#' list of two-column numeric matrices with `"x"` and `"y"` coordinates,
#' representing the group median and difference from median,
#' respectively.
#'
#' The mean value can be used by setting `useMedian=FALSE`.
#'
#' Samples can be grouped using the argument `centerGroups`.
#' In this case the y-axis value will be "difference from
#' group median."
#'
#' Control samples can be specified for centering using the
#' argument `controlSamples`. In this case, the y-axis value will
#' be "difference from control median".
#'
#' The sample grouping, and control samples can be combined,
#' in which case the y-axis values will be "difference from
#' the control median within the centering group."
#'
#' @family jam matrix functions
#'
#' @param x `numeric` matrix typically containing log-normal measurements,
#' with measurement rows, and sample columns.
#' @param na.rm `logical` indicating whether to ignore NA values
#' during `numeric` summary functions.
#' @param controlSamples `character` vector containing values in
#' `colnames(x)` to define control samples used during centering.
#' These values are passed to `centerGeneData()`.
#' @param centerGroups `character` vector with length equal to `ncol(x)`
#' which defines the group for each column in `x`. Data will
#' be centered within each group.
#' @param groupedX `logical` indicating how to calculate the x-axis
#' value when `centerGroups` contains multiple groups. When
#' groupedX=TRUE, the mean of each group median is used, which
#' has the effect of representing each group equally. When
#' groupedX=FALSE, the median across all columns is used, which
#' can have the effect of preferring sample groups with a larger
#' number of columns.
#' @param useMedian `logical` indicating whether to use the median
#' values when calculating the x-axis and during data centering.
#' The median naturally reduces the effect of outlier points on
#' the resulting MA-plots., when compared to using the mean.
#' When useMedian=FALSE, the mean value is used.
#' @param useMean (deprecated) `logical` indicating whether to use the
#' mean instead of the median value. This argument is being removed
#' in order to improve consistency with other Jam package functions.
#' @param whichSamples `character` vector containing `colnames(x)`, or
#' integer vector referencing column numbers in `x`. This argument
#' specifies which columns to return, but does not change the columns
#' used to define the group centering values. For example, the
#' group medians are calculated using all the data, but only the
#' samples in `whichSamples` are centered to produce MA-plot data.
#' @param noise_floor `numeric` value indicating the minimum numeric value
#' allowed in the input matrix `x`. When `NULL` or `-Inf` no noise
#' floor is applied. It is common to set `noise_floor=0` to limit
#' MA-plot data to use values zero and above.
#' @param noise_floor_value single `numeric` value used to replace `numeric`
#' values at or below `noise_floor` when `noise_floor` is not NULL.
#' By default,
#' `noise_floor_value=noise_floor` which means values at or below
#' the noise floor are set to the floor. Another useful option is
#' `noise_floor_value=NA` which has the effect of removing the point
#' from the MA-plot altogether. This option is recommended for sparse
#' data matrices where the presence of values at or below zero are
#' indicative of missing data (zero-inflated data) and does not
#' automatically reflect an actual value of zero.
#' @param naValue single `numeric` value used to replace any `NA` values in
#' the input matrix `x`. This argument can be useful to replace
#' `NA` values with something like zero.
#' @param mad_row_min `numeric` value defining the minimum group
#' value, corresponding to the x-axis position on the MA-plot,
#' required for a row to be included in the MAD calculation.
#' This threshold is useful to filter outlier data below a noise
#' threshold, so that the MAD calculation will include only the
#' data above that value. For example, with count data, it is
#' useful to filter out counts below roughly 8, where Poisson
#' noise is a more dominant component than real count data.
#' Remember that count data should already be log2-transformed,
#' so the threshold should also be identically transformed,
#' for example using `log2(1 + 8)` to set a minimum count
#' threshold of at least 8.
#' @param grouped_mad `logical` indicating whether the MAD value
#' should be calculated per group when `centerGroups` is
#' supplied, from which the MAD factor values are derived.
#' When `TRUE` it has the effect of highlighting outliers
#' within each group using the variability in that group.
#' When `FALSE` the overall MAD is calculated, and a
#' particularly high variability group may have all its
#' group members labeled with a high MAD factor.
#' @param centerFunc `function` used for centering data, by default
#' one of the functions `centerGeneData()` or `centerGeneData_v1()`.
#' This argument will be removed in the near future and is mainly
#' intended to allow testing the two centering functions.
#' The following arguments are passed to this function:
#' * x: the input `numeric` data matrix
#' * na.rm: `logical` whether to ignore NA value. Always use `na.rm=TRUE`.
#' * controlSamples: `character` optional subset of `colnames(x)` to
#' use as reference controls during centering
#' * centerGroups: `character` vector of groups for `colnames(x)`
#' * controlFloor: `numeric` optional minimum allowed value for control
#' summary prior to centering
#' * naControlAction: `character` string for how to handle entirely NA
#' control groups during centering
#' * naControlFloor: `numeric` used when `naControlAction="floor"` and
#' all control values are `NA`. One `numeric` value is inserted into
#' the control group.
#' * useMedian: `logical` whether to use median (TRUE) or mean (FALSE)
#' * returnGroups: `logical` whether to return summary of group assignment
#' in attribute `"center_df"`
#' * returnGroupedValues: `logical` whether to return group summary values
#' in attribute `"x_group"`
#' * ...: other arguments are passed along via `...`.
#' @param returnType `character` string indicating the format of data
#' to return: `"ma_list"` is a list of MA-plot two-column
#' numeric matrices with colnames `c("x","y")`; "tidy"
#' returns a tall `data.frame` suitable for use in ggplot2.
#' @param verbose `logical` indicating whether to print verbose output.
#' @param ... additional arguments are ignored.
#'
#'
#' @export
jammacalc <- function
(x,
na.rm=TRUE,
controlSamples=NULL,
centerGroups=NULL,
controlFloor=NA,
naControlAction=c("row", "floor", "min", "na"),
naControlFloor=0,
groupedX=TRUE,
useMedian=TRUE,
useMean=NULL,
whichSamples=NULL,
noise_floor=-Inf,
noise_floor_value=noise_floor,
naValue=NA,
mad_row_min=0,
grouped_mad=TRUE,
centerFunc=centerGeneData,
useRank=FALSE,
returnType=c("ma_list", "tidy"),
verbose=FALSE,
...)
{
## Purpose is to separate jammaplot() from the underlying
## math that produces the data for MA-plots.
if (length(x) == 0 || ncol(x) == 0 || nrow(x) == 0) {
return(NULL);
}
returnType <- match.arg(returnType);
naControlAction <- match.arg(naControlAction);
if (length(useMean) > 0 && is.logical(useMean)) {
useMedian <- !useMean;
if (verbose) {
jamba::printDebug("jammacalc(): ",
"useMedian defined by !useMean, useMedian=",
useMedian);
}
}
## Validate whichSamples
if (length(whichSamples) == 0) {
whichSamples <- seq_len(ncol(x));
} else if (any(c("numeric","integer") %in% class(whichSamples))) {
whichSamples <- whichSamples[whichSamples %in% seq_len(ncol(x))];
if (length(whichSamples) == 0) {
whichSamples <- seq_len(ncol(x));
}
} else {
whichSamples <- whichSamples[whichSamples %in% colnames(x)];
whichSamples <- match(whichSamples, colnames(x));
}
if (verbose) {
jamba::printDebug("jammacalc(): ",
"whichSamples:",
whichSamples);
}
if (!jamba::check_pkg_installed("matrixStats")) {
rowMedians <- function(x, na.rm=TRUE) {
apply(x, 1, median, na.rm=na.rm);
}
colMedians <- function(x, na.rm=TRUE) {
apply(x, 2, median, na.rm=na.rm);
}
} else {
rowMedians <- matrixStats::rowMedians;
colMedians <- matrixStats::colMedians;
}
## Apply noise_floor if needed
noise_floor <- head(noise_floor, 1);
noise_floor_value <- head(noise_floor_value, 1);
if (length(noise_floor) == 1 && !is.infinite(noise_floor)) {
if (all(noise_floor %in% noise_floor_value)) {
if (jamba::rmNA(naValue=0, any(x < noise_floor))) {
if (verbose) {
jamba::printDebug("jammacalc(): ",
"flooring values below ",
noise_floor,
" to ",
noise_floor_value);
}
x[x < noise_floor] <- noise_floor_value;
}
} else if (jamba::rmNA(naValue=0, any(x <= noise_floor))) {
if (verbose) {
jamba::printDebug("jammacalc(): ",
"flooring values at or below ",
noise_floor,
" to ",
head(noise_floor_value, 10));
}
x[x <= noise_floor] <- noise_floor_value;
}
}
## Replace NA if needed
if (!all(naValue %in% c(NA))) {
if (any(is.na(x))) {
if (verbose) {
jamba::printDebug("jammacalc(): ",
"replacing NA with ",
naValue);
}
x[is.na(x)] <- naValue;
}
}
## apply useRank
if (TRUE %in% useRank) {
x <- matrix_to_column_rank(x,
keepNA=TRUE,
...)
if (FALSE) {
x1 <- apply(x, 2, function(xi){
rank(
jamba::rmNA(xi,
naValue=min(xi, na.rm=TRUE) - 1),
na.last=FALSE);
})
if (any(is.na(x))) {
x1[is.na(x)] <- NA;
}
x <- x1;
rm(x1);
}
}
## Center the data in one step
x_ctr <- centerFunc(x,
na.rm=na.rm,
controlSamples=controlSamples,
centerGroups=centerGroups,
controlFloor=controlFloor,
naControlAction=naControlAction,
naControlFloor=naControlFloor,
useMedian=useMedian,
returnGroups=TRUE,
returnGroupedValues=TRUE,
...);
rownames(x_ctr) <- rownames(x);
## Pull out grouped data
if ("x_group" %in% names(attributes(x_ctr))) {
x_grp <- attr(x_ctr, "x_group");
} else {
if (length(centerGroups) == 0) {
grp_col <- vigrep("^group_me", colnames(x_ctr));
x_grp <- x_ctr[,grp_col,drop=FALSE];
} else if (all(centerGroups %in% colnames(x_ctr))) {
grp_col <- provigrep(paste0("^", unique(centerGroups), "_me"), colnames(x_ctr));
x_grp <- x_ctr[,grp_col,drop=FALSE];
} else {
stop("The group values could not be detected from centerFunc().");
}
}
rownames(x_grp) <- rownames(x);
## Determine the appropriate x-axis value to use
if (groupedX && ncol(x_grp) > 1 && length(unique(centerGroups)) > 1) {
if (verbose) {
jamba::printDebug("jammacalc(): ",
"Using group values for MA-plot x-axis values");
}
} else {
groupedX <- FALSE;
if (useMedian) {
if (verbose) {
jamba::printDebug("jammacalc(): ",
"using rowMedian() of all values for x.");
}
x_use <- rowMedians(x,
na.rm=na.rm);
} else {
if (verbose) {
jamba::printDebug("jammacalc(): ",
"using rowMean() of all values for x.");
}
x_use <- rowMeans(x,
na.rm=na.rm);
}
names(x_use) <- rownames(x);
}
## Calculate per-column MAD values, the grouped MAD values
## First create a matrix of centered values
x_mad_m <- as.matrix(x_ctr);
## blank entries where the x-axis value is not above mad_row_min
if (groupedX) {
x_mad_m[x_grp[,centerGroups] < mad_row_min] <- NA;
} else {
x_mad_m[x_use < mad_row_min] <- NA;
}
## Compute MAD values for all columns
x_mads <- jamba::nameVector(
colMedians(abs(x_mad_m),
na.rm=TRUE),
colnames(x_ctr));
if (grouped_mad && length(unique(centerGroups)) > 1) {
if (verbose) {
jamba::printDebug("jammacalc(): ",
"Calculating grouped MAD values.");
}
x_grp_mads <- tapply(x_mads, centerGroups, median, na.rm=TRUE);
x_mad_factors <- x_mads / x_grp_mads[as.character(centerGroups)];
names(x_mad_factors) <- names(x_mads);
} else {
if (verbose) {
jamba::printDebug("jammacalc(): ",
"Calculating global MAD values.");
}
x_grp_mads <- median(x_mads, na.rm=TRUE);
names(x_grp_mads) <- "global";
x_mad_factors <- x_mads / x_grp_mads;
names(x_mad_factors) <- names(x_mads);
}
## Calculate the list of two-column matrices
jammadata <- lapply(whichSamples, function(i){
if (groupedX) {
cbind(x=x_grp[,as.character(centerGroups[i])],
y=x_ctr[,i]);
} else {
cbind(x=x_use,
y=x_ctr[,i]);
}
});
names(jammadata) <- colnames(x)[whichSamples];
if ("tidy" %in% returnType) {
jammadata <- jamba::rbindList(lapply(names(jammadata), function(i){
j <- jammadata[[i]];
data.frame(
check.names=FALSE,
stringsAsFactors=FALSE,
item=rownames(j),
name=i,
colnum=match(i, colnames(x)),
x=j[,"x"],
y=j[,"y"]
)
}));
## Suitable for
## ggplot2::ggplot(jammmadata_df, ggplot2::aes(x=x, y=y)) +
## ggplot2::geom_point() + ggplot2::facet_wrap(~sample) + colorjam::theme_jam()
}
## Add MAD data to jammadata
attr(jammadata, "MADs") <- x_mads;
attr(jammadata, "groupMADs") <- x_grp_mads;
attr(jammadata, "MADfactors") <- x_mad_factors;
return(jammadata);
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.