#' @rdname mvsusie
#'
#' @title Multivariate SUm of Single Effect (SuSiE) Regression
#'
#' @description Performs a Bayesian multiple linear regression of Y on X.
#' That is, this function fits the regression model \deqn{Y = \sum_l X
#' b_l + e,} where the elements of \eqn{e} are \emph{i.i.d.} normal
#' with zero mean and variance \code{residual_variance}, and the sum
#' \eqn{\sum_l b_l} is a vector of p effects to be estimated. The
#' SuSiE assumption is that each \eqn{b_l} has exactly one non-zero
#' element.
#'
#' @param X N by J matrix of covariates.
#'
#' @param Y Vector of length N, or N by R matrix of response
#' variables.
#'
#' @param L Maximum number of non-zero effects.
#'
#' @param prior_variance Can be either (1) a vector of length L, or a
#' scalar, for scaled prior variance when Y is univariate (which
#' should then be equivalent to \code{\link[susieR]{susie}}); or (2) a
#' matrix for a simple multivariate regression; or (3) a mixture prior
#' from \code{\link{create_mixture_prior}}.
#'
#' @param residual_variance The residual variance (defaults to the
#' sample variance of Y).
#'
#' @param prior_weights A vector of length p giving the prior
#' probability that each element is non-zero. Note that the prior
#' weights need to be non-negative but do not need to sum to 1; they
#' will automatically be normalized to sum to 1 so that they represent
#' probabilities. The default setting is that the prior weights are
#' the same for all variables.
#'
#' @param standardize Logical flag specifying whether to standardize
#' columns of X to unit variance prior to fitting. If you do not
#' standardize you may need to think more carefully about specifying
#' the scale of the prior variance. Whatever the value of standardize,
#' the coefficients (returned by \code{\link{coef}}) are for X
#' on the original input scale. Note that any column of X with zero
#' variance is not standardized, but left as is.
#'
#' @param intercept Should intercept be fitted or set to zero. Setting
#' \code{intercept = FALSE} is generally not recommended.
#'
#' @param approximate Specifies whether to use approximate computation
#' for the intercept when there are missing values in Y. The
#' approximation saves some computational effort. Note that when the
#' residual_variance is a diagonal matrix, running \code{mvsusie} with
#' \code{approximate = TRUE} will give same result as
#' \code{approximate = FALSE}, but with less running time. This
#' setting is only relevant when there are missing values in Y and
#' \code{intercept} = TRUE.
#'
#' @param estimate_residual_variance When
#' \code{estimate_residual_variance = TRUE}, the residual variance is
#' estimated; otherwise it is fixed. Currently
#' \code{estimate_residual_variance = TRUE} only works for univariate Y.
#'
#' @param estimate_prior_variance When \code{estimate_prior_variance =
#' TRUE}, the prior variance is estimated; otherwise it is
#' fixed. Currently \code{estimate_prior_variance = TRUE} only works
#' for univariate Y, or for multivariate Y when the prior variance is
#' a matrix).
#'
#' @param estimate_prior_method The method used for estimating the
#' prior variance; valid choices are \code{"optim"}, \code{"uniroot"}
#' or \code{"em"} for univariate Y; and \code{"optim"},
#' \code{"simple"} for multivariate Y.
#'
#' @param check_null_threshold When the prior variance is estimated,
#' the estimate is compared against the null, and the prior variance
#' is set to zero unless the log-likelihood using the estimate is
#' larger than that of null by this threshold. For example, setting
#' \code{check_null_threshold = 0.1} will \dQuote{nudge} the estimate
#' towards zero. When used with \code{estimate_prior_method = "EM"},
#' setting \code{check_null_threshold = NA} will skip this check.
#'
#' @param prior_tol When the prior variance is estimated, compare the
#' estimated value to this value at the end of the analysis and
#' exclude a single effect from PIP computation if the estimated prior
#' variance is smaller than it.
#'
#' @param compute_objective Add description of "compute_objective"
#' input argument here.
#'
#' @param precompute_covariances If \code{precompute_covariances =
#' TRUE}, precomputes various covariance quantities to speed up
#' computations at the cost of increased memory usage.
#'
#' @param s_init A previous model fit with which to initialize.
#'
#' @param coverage Coverage of credible sets.
#'
#' @param min_abs_corr Minimum of absolute value of correlation
#' allowed in a credible set. The setting \code{min_abs_corr = 0.5}
#' corresponds to squared correlation of 0.25, which is a commonly
#' used threshold for genotype data in genetics studies.
#'
#' @param compute_univariate_zscore When
#' \code{compute_univariate_zscore = TRUE}, the z-scores from the
#' per-variable univariate regressions are outputted. (Note that these
#' z-scores are not actually used to fit the multivariate susie
#' model.)
#'
#' @param n_thread Maximum number of threads to use for parallel
#' computation (only applicable when a mixture prior is used).
#'
#' @param max_iter Maximum number of iterations to perform.
#'
#' @param tol The model fitting will terminate when the increase in
#' ELBOs between two successive iterations is less than \code{tol}.
#'
#' @param verbosity Set \code{verbosity = 0} for no messages;
#' \code{verbosity = 1} for a progress bar; and \code{verbosity = 2}
#' for more detailed information about the algorithm's progress at the
#' end of each iteration.
#'
#' @param track_fit Add attribute \code{trace} to the return value
#' which records the algorithm's progress at each iteration.
#'
#' @return A multivariate susie fit, which is a list with some or all
#' of the following elements:
#'
#' \item{alpha}{L by p matrix of posterior inclusion probabilites.}
#'
#' \item{b1}{L by p matrix of posterior mean single-effect estimates.}
#'
#' \item{b1_rescaled}{L by p matrix} of posterior mean single-effect
#' estimates on the original input scale (same as \code{coef}).
#'
#' \item{b2}{L by p matrix of posterior second moments.}
#'
#' \item{KL}{Vector of single-effect KL divergences.}
#'
#' \item{lbf}{Vector of single-effect log-Bayes factors.}
#'
#' \item{sigma2}{Residual variance.}
#'
#' \item{V}{Prior variance.}
#'
#' \item{elbo}{Vector storing the the evidence lower bound, or
#' \dQuote{ELBO}, achieved at each iteration of the model fitting
#' algorithm, which attempts to maximize the ELBO.}
#'
#' \item{niter}{Number of iterations performed.}
#'
#' \item{convergence}{Convergence status.}
#'
#' \item{sets}{Estimated credible sets.}
#'
#' \item{pip}{Vector of posterior inclusion probabilities.}
#'
#' \item{walltime}{Records runtime of the model fitting algorithm.}
#'
#' \item{z}{Vector of univariate z-scores.}
#'
#' \item{single_effect_lfsr}{Average lfsr (local false sign rate) for
#' each CS.}
#'
#' \item{lfsr}{TO DO: Explain what this output is.}
#'
#' \item{conditional_lfsr}{The lfsr (local false sign rate) given that
#' the variable is the single effect.}
#'
#' @examples
#' # Example with one response.
#' set.seed(1)
#' n <- 2000
#' p <- 1000
#' beta <- rep(0, p)
#' beta[1:4] <- 1
#' X <- matrix(rnorm(n * p), n, p)
#' Y <- X %*% beta + rnorm(n)
#' fit <- mvsusie(X, Y, L = 10)
#'
#' # Sufficient statistics example with one response.
#' X_colmeans <- colMeans(X)
#' Y_colmeans <- colMeans(Y)
#' X <- scale(X, center = TRUE, scale = FALSE)
#' Y <- scale(Y, center = TRUE, scale = FALSE)
#' XtX <- crossprod(X)
#' XtY <- crossprod(X, Y)
#' YtY <- crossprod(Y)
#' res <- mvsusie_suff_stat(XtX, XtY, YtY, n, L = 10, X_colmeans, Y_colmeans)
#'
#' # RSS example with one response.
#' R <- crossprod(X)
#' z <- susieR:::calc_z(X, Y)
#' res <- mvsusie_rss(z, R, N = n, L = 10)
#'
#' # Example with three responses.
#' set.seed(1)
#' n <- 500
#' p <- 1000
#' true_eff <- 2
#' X <- sample(c(0, 1, 2), size = n * p, replace = TRUE)
#' X <- matrix(X, n, p)
#' beta1 <- rep(0, p)
#' beta2 <- rep(0, p)
#' beta3 <- rep(0, p)
#' beta1[1:true_eff] <- runif(true_eff)
#' beta2[1:true_eff] <- runif(true_eff)
#' beta3[1:true_eff] <- runif(true_eff)
#' y1 <- X %*% beta1 + rnorm(n)
#' y2 <- X %*% beta2 + rnorm(n)
#' y3 <- X %*% beta3 + rnorm(n)
#' Y <- cbind(y1, y2, y3)
#' prior <- create_mixture_prior(R = 3)
#' fit <- mvsusie(X, Y, prior_variance = prior)
#'
#' # Sufficient statistics example with three responses.
#' X_colmeans <- colMeans(X)
#' Y_colmeans <- colMeans(Y)
#' X <- scale(X, center = TRUE, scale = FALSE)
#' Y <- scale(Y, center = TRUE, scale = FALSE)
#' XtX <- crossprod(X)
#' XtY <- crossprod(X, Y)
#' YtY <- crossprod(Y)
#' res <- mvsusie_suff_stat(XtX, XtY, YtY, n,
#' L = 10, X_colmeans, Y_colmeans,
#' prior_variance = prior
#' )
#'
#' # RSS example with three responses.
#' R <- crossprod(X)
#' Z <- susieR:::calc_z(X, Y)
#' res <- mvsusie_rss(Z, R, N = n, L = 10, prior_variance = prior)
#'
#' @importFrom Matrix isDiagonal
#' @importFrom stats sd
#' @importFrom stats var
#' @importFrom susieR susie_get_cs
#'
#' @export
#'
mvsusie <- function(X, Y, L = 10, prior_variance = 0.2,
residual_variance = NULL, prior_weights = NULL,
standardize = TRUE, intercept = TRUE, approximate = FALSE,
estimate_residual_variance = FALSE,
estimate_prior_variance = TRUE,
estimate_prior_method = "EM",
check_null_threshold = 0, prior_tol = 1e-9,
compute_objective = TRUE, s_init = NULL,
coverage = 0.95, min_abs_corr = 0.5,
compute_univariate_zscore = FALSE,
precompute_covariances = FALSE, n_thread = 1,
max_iter = 100, tol = 1e-3, verbosity = 2,
track_fit = FALSE) {
# Adjust prior weights.
if (is.null(prior_weights)) {
prior_weights <- rep(1 / ncol(X), ncol(X))
} else {
prior_weights <- prior_weights / sum(prior_weights)
}
# Check and process prior variance.
if (inherits(prior_variance, "MashInitializer")) {
prior_variance <- prior_variance$clone(deep = TRUE)
}
is_numeric_prior <- !(is.matrix(prior_variance) ||
inherits(prior_variance, "MashInitializer"))
if (!is.null(dim(Y)) && ncol(Y) > 1 && is_numeric_prior) {
stop("Please specify prior variance for the multivariate response Y")
}
if (standardize && !is_numeric_prior) {
# Scale prior variance; see
# https://github.com/stephenslab/mvsusieR/blob/master/
# inst/prototypes/prior_matrices_scale.ipynb
sigma <- sapply(1:ncol(Y), function(i) sd(Y[, i], na.rm = TRUE))
n <- sapply(1:ncol(Y), function(i) length(which(!is.na(Y[, 1]))))
sigma <- sigma / sqrt(n)
# Make sigma numerically more robust against extreme values.
if (estimate_prior_variance) {
sigma <- sigma / max(sigma)
}
if (is.matrix(prior_variance)) {
prior_variance <- scale_covariance(prior_variance, sigma)
} else {
prior_variance$scale_prior_variance(sigma)
}
}
if (verbosity > 1) {
message("Initializing data object...")
message(paste("Dimension of X matrix:", nrow(X), ncol(X)))
message(paste("Dimension of Y matrix:", nrow(Y), ncol(Y)))
}
# Set data object.
if (anyNA(Y)) {
# When the residual variance is a diagonal matrix, the approximate
# version has the same result as the exact version, and it is
# faster, so we set approximate = TRUE.
if (isDiagonal(residual_variance)) {
approximate <- TRUE
}
data <- DenseDataYMissing$new(X, Y, approximate)
estimate_residual_variance <- FALSE
} else {
data <- DenseData$new(X, Y)
}
# Include residual variance in the data object.
data$set_residual_variance(residual_variance,
numeric = is_numeric_prior,
quantities = "residual_variance"
)
data$standardize(intercept, standardize)
data$set_residual_variance(quantities = "effect_variance")
# Fit the susie model.
s <- mvsusie_core(
data, s_init, L, prior_variance, prior_weights,
estimate_residual_variance, estimate_prior_variance,
estimate_prior_method, check_null_threshold,
precompute_covariances, compute_objective, n_thread,
max_iter, tol, prior_tol, track_fit, verbosity
)
# Compute CSs and PIPs.
if (!is.null(coverage) && !is.null(min_abs_corr)) {
s$sets <- susie_get_cs(s,
coverage = coverage, X = X,
min_abs_corr = min_abs_corr
)
}
# Report z-scores from univariate regression.
if (compute_univariate_zscore) {
s$z <- susieR:::calc_z(X, Y, center = intercept, scale = standardize)
}
# Set row and column names of the outputs, and fix dimensions of
# outputs if needed.
s$coef <- drop(s$coef)
s$fitted <- drop(s$fitted)
s$residual_variance <- data$residual_variance
if (is.null(colnames(Y))) {
s$condition_names <- paste0("cond", 1:ncol(Y))
} else {
s$condition_names <- colnames(Y)
}
if (is.null(colnames(X))) {
s$variable_names <- paste0("var", 1:ncol(X))
} else {
s$variable_names <- colnames(X)
}
if (length(s$condition_names) == 1) {
names(s$coef) <- c("(Intercept)", s$variable_names)
names(s$fitted) <- rownames(X)
rownames(s$b1) <- paste0("l", 1:L)
colnames(s$b2) <- s$variable_names
rownames(s$b2) <- paste0("l", 1:L)
colnames(s$b2) <- s$variable_names
} else {
rownames(s$coef) <- c("(Intercept)", s$variable_names)
colnames(s$coef) <- s$condition_names
colnames(s$fitted) <- s$condition_names
rownames(s$fitted) <- rownames(X)
dimnames(s$b1) <- list(
single_effect = paste0("l", 1:L),
variable = s$variable_names,
condition = s$condition_names
)
dimnames(s$b2) <- list(
single_effect = paste0("l", 1:L),
variable = s$variable_names,
condition = s$condition_names
)
rownames(s$residual_variance) <- s$condition_names
colnames(s$residual_variance) <- s$condition_names
}
names(s$pip) <- s$variable_names
colnames(s$alpha) <- s$variable_names
names(s$intercept) <- s$condition_names
rownames(s$alpha) <- paste0("l", 1:L)
class(s) <- "mvsusie"
return(s)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.