# Weighted variance using a matrix decomposition as model
calc_row_dispersion_inner <- function(args) with(args, {
x <- as.matrix(x)
w <- as.matrix(w)
n <- nrow(x)
p <- ncol(row)
result <- rep(NA_real_,n)
for(i in seq_len(n)) {
wi <- w[i,]
present <- wi > 0
wi_present <- wi[present]
df <- length(wi_present)-p
if (df > 0) {
errors <- as.vector(x[i,present]) -
as.vector(col[present,,drop=FALSE] %*% row[i,])
result[i] <- sum(errors*errors*wi_present) / df
}
}
result
})
calc_row_dispersion <- function(x, w, row, col) {
BPPARAM <- bpparam()
parts <- partitions(nrow(x), ncol(x)*2, BPPARAM=BPPARAM)
feed <- map(parts, function(part) {
list(
x=x[part,,drop=FALSE],
w=w[part,,drop=FALSE],
row=row[part,,drop=FALSE],
col=col)
})
result <- bplapply(feed, calc_row_dispersion_inner, BPPARAM=BPPARAM)
unlist(result)
}
as_components <- function(design, weitrix) {
if (is(design, "Components"))
result <- design
else
result <- weitrix_components(weitrix, design=design, p=0, verbose=FALSE)
assert_that(nrow(result$row) == nrow(weitrix))
assert_that(nrow(result$col) == ncol(weitrix))
result
}
#' Calculate row dispersions
#'
#' Calculate the dispersion of each row. For each observation,
#' this value divided by the weight gives the observation's variance.
#'
#' @param weitrix
#' A weitrix object, or an object that can be converted to a weitrix
#' with \code{as_weitrix}.
#' @param design
#' A formula in terms of \code{colData(weitrix} or a design matrix,
#' which will be fitted to the weitrix on each row.
#' Can also be a pre-existing Components object,
#' in which case the existing fits (\code{design$row}) are used.
#'
#' @return
#' A numeric vector.
#'
#' @examples
#' # Using a model just containing an intercept
#' weitrix_dispersions(simwei, ~1)
#'
#' # Allowing for one component of variation, the dispersions are lower
#' comp <- weitrix_components(simwei, p=1, verbose=FALSE)
#' weitrix_dispersions(simwei, comp)
#'
#' @export
weitrix_dispersions <- function(weitrix, design=~1) {
weitrix <- as_weitrix(weitrix)
comp <- as_components(design, weitrix)
calc_row_dispersion(
weitrix_x(weitrix),
weitrix_weights(weitrix),
comp$row,
comp$col)
}
#' Adjust weights row-wise based on given row dispersions
#'
#' Based on estimated row dispersions, adjust weights in each row.
#'
#' For large numbers of samples this can be based directly
#' on weitrix_dispersions.
#' For small numbers of samples, when using limma,
#' it should be based on a trend-line fitted to known co-variates of
#' the dispersions.
#' This can be done using \code{weitrix_calibrate_trend}.
#'
#' @param weitrix
#' A weitrix object, or an object that can be converted to a weitrix
#' with \code{as_weitrix}.
#' @param dispersions
#' A dispersion for each row.
#'
#' @return
#' A SummarizedExperiment object with metadata fields marking it as a weitrix.
#'
#' @examples
#' # Adjust weights so dispersion for each row is exactly 1. This is dubious
#' # for a small dataset, but would be fine for a dataset with many columns.
#' comp <- weitrix_components(simwei, p=1, verbose=FALSE)
#' disp <- weitrix_dispersions(simwei, comp)
#' cal <- weitrix_calibrate(simwei, disp)
#' weitrix_dispersions(cal, comp)
#'
#' @export
weitrix_calibrate <- function(weitrix, dispersions) {
weitrix <- as_weitrix(weitrix)
assert_that(nrow(weitrix) == length(dispersions))
# Zero out weights if dispersion not available
mul <- 1/dispersions
mul[is.na(mul)] <- 0
weitrix_weights(weitrix) <- sweep(
weitrix_weights(weitrix), 1, mul, "*")
metadata(weitrix)$weitrix$calibrated <- TRUE
weitrix
}
#' Adjust weights row-wise by fitting a trend to estimated dispersions
#'
#' Dispersions are estimated using \code{weitrix_dispersions}.
#' A trend line is then fitted to the dispersions using a gamma GLM
#' with log link function.
#' Weitrix weights are calibrated based on this trend line.
#'
#' @param weitrix
#' A weitrix object, or an object that can be converted to a weitrix
#' with \code{as_weitrix}.
#' @param design
#' A formula in terms of \code{colData(weitrix} or a design matrix,
#' which will be fitted to the weitrix on each row.
#' Can also be a pre-existing Components object,
#' in which case the existing fits (\code{design$row}) are used.
#' @param trend_formula
#' A formula specification for predicting log dispersion from
#' columns of rowData(weitrix).
#' If absent, metadata(weitrix)$weitrix$calibrate_trend_formula is used.
#'
#' @return
#' A SummarizedExperiment object with metadata fields marking it as a weitrix.
#'
#' Several columns are added to the \code{rowData}:
#' \itemize{
#' \item{deg_free}{ Degrees of freedom for dispersion calculation.}
#' \item{dispersion_before}{ Dispersion before calibration.}
#' \item{dispersion_trend}{ Fitted dispersion trend.}
#' \item{dispersion_after}{ Dispersion for these new weights.}
#' }
#'
#' @examples
#' rowData(simwei)$total_weight <- rowSums(weitrix_weights(simwei))
#'
#' # To estimate dispersions, use a simple model containing only an intercept
#' # term. Model log dispersion as a straight line relationship with log total
#' # weight and adjust weights to remove any trend.
#' cal <- weitrix_calibrate_trend(simwei,~1,trend_formula=~log(total_weight))
#'
#' # This dataset has few rows, so calibration like this is dubious.
#' # Predictors in the fitted model are not significant.
#' summary( metadata(cal)$weitrix$trend_fit )
#'
#' # Information about the calibration is added to rowData
#' rowData(cal)
#'
#'
#' # A Components object may also be used as the design argument.
#' comp <- weitrix_components(simwei, p=1, verbose=FALSE)
#' cal2 <- weitrix_calibrate_trend(simwei,comp,trend_formula=~log(total_weight))
#'
#' rowData(cal2)
#'
#' @export
weitrix_calibrate_trend <- function(weitrix, design=~1, trend_formula=NULL) {
weitrix <- as_weitrix(weitrix)
comp <- as_components(design, weitrix)
if (is.null(trend_formula))
trend_formula <- metadata(weitrix)$weitrix$calibrate_trend_formula
if (is.null(trend_formula))
trend_formula <- metadata(weitrix)$weitrix$trend_formula
assert_that(!is.null(trend_formula))
trend_formula <- as.formula(trend_formula)
rowData(weitrix)$deg_free <-
rowSums(weitrix_weights(weitrix) > 0) - ncol(comp$col)
rowData(weitrix)$dispersion_before <- weitrix_dispersions(weitrix, comp)
data <- rowData(weitrix)
# Least squares method on log dispersion_before
#
#trend_formula <- update(trend_formula, log(dispersion_before)~.)
#
#tiny <- mean(data$dispersion_before,na.rm=TRUE)*1e-9
#good <- !is.na(data$dispersion_before) & data$dispersion_before > tiny
#data <- data[good,]
#
#fit <- eval(substitute(
# lm(trend_formula, data=data),
# list(trend_formula=trend_formula)
#))
#
#pred <- exp(
# predict(fit, newdata=rowData(weitrix)) +
# sigma(fit)^2/2)
# Gamma GLM method
# - Gamma() doesn't like zeros, but it's actually fine,
# so use quasi(), fit will be identical.
# - Choice of starting predictions for iteration matters,
# very small dispersions cause numerical errors,
# so start with a conservative set of predictions.
trend_formula <- update(trend_formula, dispersion_before~.)
mustart <- mean(data$dispersion_before, na.rm=TRUE)
n <- nrow(data)
fit <- eval(substitute(
glm2(
trend_formula,
data=data,
weights=deg_free,
family=quasi(link="log", variance="mu^2"),
mustart=rep(mustart, n),
control=glm.control(maxit=100)
),
list(trend_formula=trend_formula, mustart=mustart, n=n)
))
pred <- predict(fit, newdata=rowData(weitrix), type="response")
rowData(weitrix)$dispersion_trend <- pred
rowData(weitrix)$dispersion_after <-
rowData(weitrix)$dispersion_before / pred
metadata(weitrix)$weitrix$trend_fit <- fit
weitrix_calibrate(weitrix, pred)
}
# Hack!
# Extract anything that might be referred to in a formula
all_names <- function(obj) {
if (inherits(obj, "name"))
return(as.character(obj))
if (!inherits(obj, c("formula","call")))
return(c())
do.call(c, lapply(obj, all_names))
}
#' Adjust weights element-wise by fitting a trend to squared residuals
#'
#' This is a very flexible method of calibrating weights.
#' It should be especially useful if your existing weights account for
#' technical variation,
#' but there is also biological variation.
#' In this case large weights will tend to be overly optimistic,
#' and a non-linear transformation of weights is needed.
#'
#' Residuals are found relative to a fitted model.
#' A trend model is then fitted to the squared residuals using a gamma GLM
#' with log link function.
#' Weitrix weights are set based on the inverse of the fitted trend.
#'
#' Residuals from a fitted model are generally smaller than residuals
#' from the true model.
#' A simple adjustment to the weights is made to account for this.
#' Weights are reduced by a factor of (n-ncol(design)*nrow(weitrix))/n
#' where n is the number of non-missing values in the weitrix.
#'
#' \code{trend_formula} may reference any row or column variables,
#' or \code{mu} for the predicted value,
#' or \code{weight} for the existing weights,
#' or special factors \code{row} and \code{col}.
#' Keep in mind also that a log link function is used.
#'
#' Unlike in \code{weitrix_calibrate_trend},
#' existing weights must be explicitly included in the formula
#' if they are to be retained (see examples).
#'
#' This function is currently not memory efficient,
#' it should be fine for bulk experiments but may struggle for single cell.
#' To reduce memory usage somewhat,
#' when constructing the data frame on which to fit the glm,
#' only columns referenced in \code{trend_formula} are included.
#'
#' Example formulas:
#'
#' \code{trend_formula=~1+offset(-log(weight))}
#' Apply a global scaling, otherwise keeping weights the same.
#'
#' \code{trend_formula=~log(weight)}
#' Moderate weights by raising them to some power
#' and applying some overall scaling factor.
#' This will allow for biological variation.
#'
#' \code{trend_formula=~poly(log(weight),2))}
#' Apply a more complex quadratic curve-based moderation of weights.
#'
#' \code{trend_formula=~col+offset(-log(weight))}
#' Calibrate each sample's weights by a scaling factor.
#' Note that due to the simplistic adjustment for using a fitted model rather
#' than the true model, this may give misleading results when
#' the design is unbalanced and there are few samples,
#' i.e. when there are some samples with much higher leverage than others.
#'
#' \code{trend_formula=~col*poly(log(weight),2)}
#' Quadratic curve moderation of weights, applied to each sample individually.
#'
#' @param weitrix
#' A weitrix object, or an object that can be converted to a weitrix
#' with \code{as_weitrix}.
#' @param design
#' A formula in terms of \code{colData(weitrix} or a design matrix,
#' which will be fitted to the weitrix on each row.
#' Can also be a pre-existing Components object,
#' in which case the existing fits (\code{design$row}) are used.
#' @param trend_formula
#' A formula specification for predicting squared residuals. See below.
#' If absent, metadata(weitrix)$weitrix$calibrate_all_formula is used.
#'
#' @param mu_min
#' When fitting the GLM, omit observations
#' where the estimated mu is less than this value.
#' When calculating weights from the fitted GLM,
#' clip mu to be at least this value.
#' @param mu_max
#' When fitting the GLM, omit observations
#' where the estimated mu is greater than this value.
#' When calculating weights from the fitted GLM,
#' clip mu to be at most this value.
#'
#' @param keep_fit
#' Keep glm fit and the data used to create it. This can be large!
#' If TRUE, these will be stored in \code{metadata(weitrix)$weitrix$all_fit}
#' and \code{metadata(weitrix)$weitrix$all_data}.
#'
#' @return
#' A SummarizedExperiment object with metadata fields marking it as a weitrix.
#'
#' \code{metadata(weitrix)$weitrix} will contain the fitted trend model,
#' and if requested the data frame used to fit the model.
#'
#' @examples
#'
#' simcal <- weitrix_calibrate_all(simwei, ~1, ~log(weight), keep_fit=TRUE)
#'
#' metadata(simcal)$weitrix$all_fit
#'
#' @export
weitrix_calibrate_all <- function(
weitrix, design=~1, trend_formula=NULL,
mu_min=NA, mu_max=NA,
keep_fit=FALSE) {
weitrix <- as_weitrix(weitrix)
comp <- as_components(design, weitrix)
if (is.null(trend_formula))
trend_formula <- metadata(weitrix)$weitrix$calibrate_all_formula
assert_that(!is.null(trend_formula))
trend_formula <- as.formula(trend_formula)
needed <- all_names(trend_formula)
needed_rowdata <- intersect(colnames(rowData(weitrix)), needed)
needed_coldata <- intersect(colnames(colData(weitrix)), needed)
which_index <- which(as.vector(weitrix_weights(weitrix)>0))-1
which_row <- which_index %% nrow(weitrix)
which_col <- which_index %/% nrow(weitrix)
which_index <- which_index + 1
which_row <- which_row + 1
which_col <- which_col + 1
data <- data.frame(row.names=seq_along(which_index))
for(name in needed_rowdata)
data[[name]] <- rowData(weitrix)[[name]][which_row]
for(name in needed_coldata)
data[[name]] <- colData(weitrix)[[name]][which_col]
if ("row" %in% needed)
data$row <- factor(rownames(weitrix)[which_row], rownames(weitrix))
if ("col" %in% needed)
data$col <- factor(colnames(weitrix)[which_col], colnames(weitrix))
data$weight <- as.vector(weitrix_weights(weitrix))[which_index]
data$mu <- as.vector(comp$row %*% t(comp$col))[which_index] #TODO: make more efficient
data$.y <- (as.vector(weitrix_x(weitrix))[which_index] - data$mu)^2
# mu may be clipped to a range of values
# - data outside the range is treated as missing
# - when assigning weights, mu is clipped to this range
if (!is.na(mu_min)) {
clip <- data$mu < mu_min
data$.y[clip] <- NA
data$mu[clip] <- mu_min
}
if (!is.na(mu_max)) {
clip <- data$mu > mu_max
data$.y[clip] <- NA
data$mu[clip] <- mu_max
}
mustart <- mean(data$.y, na.rm=TRUE)
n <- nrow(data)
trend_formula <- update(trend_formula, .y ~ .)
fit <- eval(substitute(
glm2(
trend_formula,
data=data,
family=quasi(link="log", variance="mu^2"),
mustart=rep(mustart, n),
model=FALSE, x=FALSE, y=FALSE,
control=glm.control(maxit=100)
),
list(trend_formula=trend_formula, mustart=mustart, n=n)
))
n_good <- sum(!is.na(data$.y))
overfitting_adjustment <- (n_good-ncol(comp$col)*nrow(weitrix)) / n_good
pred <- predict(fit, newdata=data, type="response")
pred[is.na(pred)] <- Inf
new_weights <- matrix(0, nrow=nrow(weitrix), ncol=ncol(weitrix))
new_weights[which_index] <- overfitting_adjustment/pred
rownames(new_weights) <- rownames(weitrix)
colnames(new_weights) <- colnames(weitrix)
weitrix_weights(weitrix) <- new_weights
metadata(weitrix)$weitrix$all_coef <- coef(fit)
if (keep_fit) {
metadata(weitrix)$weitrix$all_fit <- fit
data$new_weight <- as.vector(weitrix_weights(weitrix))[which_index]
metadata(weitrix)$weitrix$all_data <- data
}
weitrix
}
#' Weight calibration plots, optionally versus a covariate
#'
#' Various plots based on weighted squared residuals
#' of each element in the weitrix.
#' \code{weight*residual^2} is the Pearson residual for a gamma GLM plus one,
#' as used by \code{weitrix_calibrate_all}.
#'
#' This function is not memory efficient.
#' It is suitable for typical bulk data, but generally not not for single-cell.
#'
#' Defaults to a boxplot of sqrt(weight) weighted residuals.
#' Blue guide bars are shown for the expected quartiles,
#' these will ideally line up with the boxplot.
#'
#' If \code{cat} is given, it will be used to break the elements down
#' into categories.
#'
#' If \code{covar} is given, sqrt(weight) weighted residuals are plotted versus
#' the covariate, with red trend lines for
#' the mean and for the mean +/- one standard deviation.
#' If the weitrix is calibrated, the trend lines should be horizontal lines
#' with y intercept close to -1, 0 and 1.
#' Blue guide lines are shown for this ideal outcome.
#'
#' Any of the variables available with \code{weitrix_calibrate_all}
#' can be used for \code{covar} or \code{cat}.
#'
#' @param weitrix
#' A weitrix object, or an object that can be converted to a weitrix
#' with \code{as_weitrix}.
#' @param design
#' A formula in terms of \code{colData(weitrix} or a design matrix,
#' which will be fitted to the weitrix on each row.
#' Can also be a Components object.
#' @param covar
#' Optional. A covariate. Specify as you would with \code{ggplot2::aes}.
#' Can be a matrix of the same size as \code{weitrix}.
#' @param cat
#' Optional. A categorical variable to break down the data by.
#' Specify as you would with \code{ggplot2::aes}.
#' @param funnel
#' Flag. Produce a funnel plot?
#' Note: \code{covar} can not be used for funnel plots.
#' @param guides
#' Show blue guide lines.
#'
#' @return
#' A ggplot2 plot.
#'
#' @examples
#' weitrix_calplot(simwei, ~1)
#' weitrix_calplot(simwei, ~1, covar=mu)
#' weitrix_calplot(simwei, ~1, cat=col)
#'
#' # weitrix_calplot should generally be used after calibration
#' cal <- weitrix_calibrate_all(simwei, ~1, ~col+log(weight))
#' weitrix_calplot(cal, ~1, cat=col)
#'
#' # You can use a matrix of the same size as the weitrix as a covariate.
#' # It will often be useful to assess vs the original weighting.
#' weitrix_calplot(cal, ~1, covar=weitrix_weights(simwei))
#'
#' @export
weitrix_calplot <- function(
weitrix, design=~1, covar, cat, funnel=FALSE, guides=TRUE) {
covar_var <- enquo(covar)
cat_var <- enquo(cat)
have_covar <- !identical(covar_var, quo())
have_cat <- !identical(cat_var, quo())
if (funnel) {
assert_that(!have_covar, msg="Can't use covar with funnel.")
weight <- NULL #Prevent R CMD check note. Is there a better way?
covar_var <- quo(1/sqrt(weight))
have_covar <- TRUE
}
weitrix <- as_weitrix(weitrix)
comp <- as_components(design, weitrix)
needed <- c(all_names(covar_var), all_names(cat_var))
needed_rowdata <- intersect(colnames(rowData(weitrix)), needed)
needed_coldata <- intersect(colnames(colData(weitrix)), needed)
data <- cbind(
as.data.frame(rowData(weitrix)[
rep(seq_len(nrow(weitrix)), ncol(weitrix)),
needed_rowdata,
drop=FALSE]),
as.data.frame(colData(weitrix)[
rep(seq_len(ncol(weitrix)), each=nrow(weitrix)),
needed_coldata,
drop=FALSE])
)
if ("row" %in% needed)
data$row <-
rep(factor(rownames(weitrix), rownames(weitrix)), ncol(weitrix))
if ("col" %in% needed)
data$col <-
rep(factor(colnames(weitrix), colnames(weitrix)),
each=nrow(weitrix))
data$weight <- as.vector(weitrix_weights(weitrix))
data$mu <- as.vector(comp$row %*% t(comp$col))
data$residual <- ifelse(
data$weight > 0,
as.vector(weitrix_x(weitrix)) - data$mu,
rep(NA, nrow(data)))
data$weighted_residual <- sqrt(data$weight) * data$residual
n_good <- sum(!is.na(data$residual))
overfitting_adjustment <- (n_good-ncol(comp$col)*nrow(weitrix)) / n_good
guide_pos <- sqrt(overfitting_adjustment)
if (!have_cat) {
data$weitrix <- ""
cat_var <- quo(weitrix)
}
if (have_covar) {
y <- if (funnel) data$residual else data$weighted_residual
x <- eval_tidy(covar_var, data=data)
cat <- droplevels(factor(eval_tidy(cat_var, data=data)))
present <- !is.na(y)
y <- y[present]
x <- x[present]
cat <- cat[present]
plot_data <- data.frame(x=x,y=y,cat=cat)
# Calculate a trend line, needs to be square-unbiassed
# Could be made more efficient
bin_cat <- c()
bin_x <- c()
bin_mean <- c()
bin_sd <- c()
for(level in levels(cat)) {
this_x <- x[ cat == level ]
this_y <- y[ cat == level ]
n <- length(this_x)
n_bins_wanted <- ceiling(n^(1/3))
bins <- droplevels(cut(rank(this_x),
seq(0.5,n+0.5,length.out=n_bins_wanted+1)))
bin_cat <- c(bin_cat, rep(level, length(levels(bins))))
bin_x <- c(bin_x, tapply(this_x, list(bins), mean))
bin_mean <- c(bin_mean, tapply(this_y, list(bins), mean))
bin_sd <- c(bin_sd, tapply(this_y, list(bins), sd))
}
bin_cat <- factor(bin_cat, levels=levels(cat))
line_data <- data.frame(x=bin_x,mean=bin_mean,sd=bin_sd,cat=bin_cat)
ggplot(plot_data, aes(y=.data$y, x=.data$x)) +
{if (have_cat) facet_wrap(vars(.data$cat))} +
#geom_point(stroke=0, size=0.5, alpha=0.5, na.rm=TRUE) +
geom_bin2d(bins=200) + scale_fill_viridis_c() +
{if (guides && !funnel) geom_hline(
yintercept=c(-guide_pos,0,guide_pos), color="blue")} +
{if (guides && funnel) geom_abline(
slope=c(guide_pos,0,-guide_pos),
intercept=c(0,0,0),color="blue")} +
geom_line(aes(y=.data$mean), data=line_data, size=1, color="red")+
geom_line(aes(y=.data$mean+.data$sd),
data=line_data, size=1, color="red")+
geom_line(aes(y=.data$mean-.data$sd),
data=line_data, size=1, color="red")+
labs(
y=if (funnel) "residual" else "sqrt(weight) * residual",
x=as_label(covar_var),
fill="Count")+
theme(legend.position="bottom", legend.justification=c(1,1))
} else {
ggplot(data, aes(
y=.data$weighted_residual,
x=!!cat_var)) +
{if (guides) geom_hline(
yintercept=guide_pos*qnorm(c(0.25,0.5,0.75)), color="blue")} +
geom_boxplot(na.rm=TRUE) +
labs(y="sqrt(weight) * residual", x="") +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.