Nothing
############################
## Functions relating to surrogate variables and permuting the design
## matrix in thin_diff().
############################
#' Estimate the surrogate variables.
#'
#' This will use either \code{\link[sva]{sva}} or an SVD on the residuals
#' of a regression of \code{mat} on \code{design_obs} to estimate the
#' surrogate variables.
#'
#' @inheritParams thin_diff
#' @param n_sv The number of surrogate variables.
#'
#' @return A matrix of estimated surrogate variables. The columns index the
#' surrogate variables and the rows index the individuals. The surrogate
#' variables are centered and scaled to have mean 0 and variance 1.
#'
#' @author David Gerard
est_sv <- function(mat, n_sv, design_obs, use_sva = FALSE) {
assertthat::is.count(n_sv)
assertthat::assert_that(is.matrix(mat))
assertthat::assert_that(is.matrix(design_obs))
assertthat::assert_that(is.logical(use_sva))
assertthat::are_equal(ncol(mat), nrow(design_obs))
assertthat::are_equal(length(use_sva), 1)
matlog2 <- log2(mat + 0.5)
Xfixed <- cbind(design_obs, 1)
if (use_sva & ncol(design_obs) > 0) {
utils::capture.output(sv <- sva::sva(dat = matlog2, mod = Xfixed, n.sv = n_sv)$sv)
} else {
sv <- svd(matlog2 %*% (diag(nrow(Xfixed)) - Xfixed %*% solve(t(Xfixed) %*% Xfixed) %*% t(Xfixed)), nv = n_sv, nu = 0)$v
}
sv <- sv * sqrt(nrow(sv - 1))
return(sv)
}
#' Permute the design matrix so that it is approximately correlated with
#' the surrogate variables.
#'
#' @inheritParams thin_diff
#' @param sv A matrix of surrogate variables
#' @param method Should we use the Gale-Shapley algorithm
#' for stable marriages (\code{"marriage"}) (Gale and Shapley, 1962)
#' as implemented in the matchingR package, or the Hungarian algorithm
#' (Papadimitriou and Steiglitz, 1982) (\code{"hungarian"})
#' as implemented in the clue package (Hornik, 2005)? The
#' Hungarian method almost always works better, so is the default.
#'
#' @references
#' \itemize{
#' \item{Gale, David, and Lloyd S. Shapley. "College admissions and the stability of marriage." \emph{The American Mathematical Monthly} 69, no. 1 (1962): 9-15. \doi{10.1080/00029890.1962.11989827}.}
#' \item{C. Papadimitriou and K. Steiglitz (1982), Combinatorial Optimization: Algorithms and Complexity. Englewood Cliffs: Prentice Hall.}
#' \item{Hornik K (2005). "A CLUE for CLUster Ensembles." \emph{Journal of Statistical Software}, 14(12). \doi{10.18637/jss.v014.i12}. \doi{10.18637/jss.v014.i12}.}
#' }
#'
#' @return A list with two elements:
#' \describe{
#' \item{\code{design_perm}}{A row-permuted version of the user-provided
#' \code{design_perm}.}
#' \item{\code{latent_var}}{A matrix of the latent variables on which
#' \code{design_perm} was matched.}
#' }
#'
#' @author David Gerard
permute_design <- function(design_perm, sv, target_cor, method = c("hungarian", "marriage")) {
## Check input --------------------------------------------------------------
assertthat::are_equal(nrow(design_perm), nrow(sv))
assertthat::are_equal(ncol(design_perm), nrow(target_cor))
assertthat::are_equal(ncol(sv), ncol(target_cor))
nsamp <- nrow(design_perm)
## in case a non-standard sv is given ---------------------------------------
sv <- scale(sv)
method <- match.arg(method)
## Generate latent factors --------------------------------------------------
sigma11 <- stats::cor(design_perm)
sigma12 <- target_cor
sigma_cond <- sigma11 - sigma12 %*% t(sigma12)
mu_cond <- sv %*% t(sigma12)
latent_var <- rmvnorm(mu = mu_cond, sigma = sigma_cond)
## Get permutations ---------------------------------------------------------
distmat <- as.matrix(pdist::pdist(X = scale(design_perm), Y = scale(latent_var)))
if (method == "marriage") {
matchout <- matchingR::galeShapley.marriageMarket(proposerUtils = -1 * distmat, reviewerUtils = -1 * t(distmat))
design_perm <- design_perm[matchout$proposals, , drop = FALSE]
} else if (method == "hungarian") {
clue_out <- clue::solve_LSAP(x = t(distmat), maximum = FALSE)
design_perm <- design_perm[as.numeric(clue_out), , drop = FALSE]
}
return(list(design_perm = design_perm, latent_var = latent_var))
}
#' Estimates the effective correlation.
#'
#' Will return the estimated correlation between the design matrix and the
#' surrogate variables when you assign a target correlation. The method is
#' described in detail in Gerard (2020).
#'
#' This function permutes the rows of \code{design_perm} many times, each
#' time calculating the Pearson correlation between the columns of
#' \code{design_perm} and the columns of \code{sv}. It then returns the
#' averages of these Pearson correlations. The permutation is done
#' using \code{\link{permute_design}}.
#'
#' @inheritParams thin_diff
#' @inheritParams permute_design
#' @param iternum The total number of simulated correlations to consider.
#' @param calc_first Should we calculate the correlation of the mean
#' \code{design_perm} and \code{sv} (\code{calc_first = "mean"}), or
#' should we calculate the mean of the correlations between
#' \code{design_perm} and \code{sv} (\code{calc_first = "cor"})? This
#' should only be changed by expert users.
#'
#' @export
#'
#' @author David Gerard
#'
#' @references
#' \itemize{
#' \item{Gale, David, and Lloyd S. Shapley. "College admissions and the stability of marriage." \emph{The American Mathematical Monthly} 69, no. 1 (1962): 9-15. \doi{10.1080/00029890.1962.11989827}.}
#' \item{Gerard, D (2020). "Data-based RNA-seq simulations by binomial thinning." \emph{BMC Bioinformatics}. 21(1), 206. \doi{10.1186/s12859-020-3450-9}.}
#' \item{Hornik K (2005). "A CLUE for CLUster Ensembles." \emph{Journal of Statistical Software}, 14(12). \doi{10.18637/jss.v014.i12}. \doi{10.18637/jss.v014.i12}.}
#' \item{C. Papadimitriou and K. Steiglitz (1982), Combinatorial Optimization: Algorithms and Complexity. Englewood Cliffs: Prentice Hall.}
#' }
#'
#' @return A matrix of correlations. The rows index the observed covariates
#' and the columns index the surrogate variables. Element (i, j) is
#' the estimated correlation between the ith variable in
#' \code{design_perm} and the jth variable in \code{sv}.
#'
#' @examples
#' ## Generate the design matrices and set target correlation -----------------
#' n <- 10
#' design_perm <- cbind(rep(c(0, 1), each = n / 2),
#' rep(c(0, 1), length.out = n))
#' sv <- matrix(rnorm(n))
#' target_cor <- matrix(c(0.9, 0.1), ncol = 1)
#'
#' ## Get estimated true correlation ------------------------------------------
#' ## You should use a much larger iternum in practice
#' effective_cor(design_perm = design_perm,
#' sv = sv,
#' target_cor = target_cor,
#' iternum = 10)
#'
effective_cor <- function(design_perm,
sv,
target_cor,
calc_first = c("cor", "mean"),
method = c("hungarian", "marriage"),
iternum = 1000) {
## Check input -------------------------------------------------------------
calc_first <- match.arg(calc_first)
assertthat::assert_that(is.matrix(design_perm))
assertthat::assert_that(is.numeric(design_perm))
assertthat::assert_that(is.matrix(sv))
assertthat::assert_that(is.numeric(sv))
assertthat::assert_that(is.matrix(target_cor))
assertthat::assert_that(is.numeric(target_cor))
assertthat::are_equal(nrow(sv), nrow(design_perm))
assertthat::are_equal(ncol(sv), ncol(target_cor))
assertthat::are_equal(ncol(design_perm), nrow(target_cor))
assertthat::is.count(iternum)
method <- match.arg(method)
## Get estimated correlation
target_cor <- fix_cor(design_perm = design_perm, target_cor = target_cor)
if (calc_first == "cor") {
corarray <- array(NA, dim = c(ncol(design_perm), ncol(sv), iternum))
## latent_cor_array <- array(NA, dim = dim(corarray))
for (index in seq_len(iternum)) {
pout <- permute_design(design_perm = design_perm,
sv = sv,
target_cor = target_cor,
method = method)
corarray[, , index] <- stats::cor(pout$design_perm, sv)
## latent_cor_array[, , index] <- stats::cor(pout$latent_var, sv)
}
truecor <- apply(corarray, c(1, 2), mean)
} else if (calc_first == "mean") {
perm_array <- array(NA, dim = c(nrow(design_perm),
ncol(design_perm),
iternum))
## latent_array <- array(NA, dim = dim(perm_array))
for (index in seq_len(iternum)) {
pout <- permute_design(design_perm = design_perm,
sv = sv,
target_cor = target_cor,
method = method)
perm_array[, , index] <- pout$design_perm
## latent_array[, , index] <- pout$latent_var
}
truecor <- stats::cor(apply(perm_array, c(1, 2), mean), sv)
}
return(truecor)
}
#' Fixes an invalid target correlation.
#'
#' Shrinks the target correlation using a uniform scaling factor so that
#' the overall correlation matrix is positive semi-definite. The method
#' is described in detail in Gerard (2020).
#'
#' Let \eqn{W} = \code{cor(design_perm)}. Let \eqn{R} = \code{target_cor}.
#' Then the overall correlation matrix is:
#' \deqn{ \left(
#' \begin{array}{cc}
#' W & R\\
#' R' & I_K
#' \end{array}
#' \right).
#' }
#' This function applies a multiplicative scaling factor to \eqn{R} until
#' the above matrix is positive semi-definite. That is, it finds \eqn{a}
#' between 0 and 1 such that
#' \deqn{ \left(
#' \begin{array}{cc}
#' W & aR\\
#' aR' & I_K
#' \end{array}
#' \right)
#' }
#' is positive semi-definite.
#'
#' @inheritParams thin_diff
#' @param num_steps The number of steps between 0 and 1 to take in the
#' grid search for the shrinkage factor. The step-size would be
#' \code{1 / (num_steps - 1)}.
#'
#' @author David Gerard
#'
#' @return A matrix of correlations the same dimension as \code{target_cor}.
#' Actually, the returned matrix is \code{a * target_cor}, where \code{a}
#' was determined to make the overall correlation matrix positive
#' semi-definite.
#'
#' @references
#' \itemize{
#' \item{Gerard, D (2020). "Data-based RNA-seq simulations by binomial thinning." \emph{BMC Bioinformatics}. 21(1), 206. \doi{10.1186/s12859-020-3450-9}.}
#' }
#'
#' @export
#'
#' @examples
#' n <- 10
#' design_perm <- matrix(rep(c(0, 1), length.out = n))
#' target_cor <- matrix(seq(1, 0, length.out = 10), nrow = 1)
#' new_cor <- seqgendiff:::fix_cor(design_perm = design_perm, target_cor = target_cor)
#' new_cor / target_cor
#'
#' ## In the case of one observed covariate, the requirement is just that
#' ## the sum of squared correlations is less than or equal to one.
#' sum(target_cor ^ 2)
#' sum(new_cor ^ 2)
fix_cor <- function(design_perm, target_cor, num_steps = 51) {
## Check input --------------------------------------------------------------
stopifnot(is.matrix(design_perm))
stopifnot(is.matrix(target_cor))
stopifnot(ncol(design_perm) == nrow(target_cor))
stopifnot(target_cor >= -1, target_cor <= 1)
assertthat::is.count(num_steps)
stopifnot(num_steps > 1)
p <- ncol(design_perm)
## Calculate top left correlation -------------------------------------------
top_left_cor <- stats::cor(design_perm)
## Calculate RR^T -----------------------------------------------------------
RRt <- tcrossprod(target_cor)
## Test if already psd ------------------------------------------------------
eout <- eigen(top_left_cor - RRt, only.values = TRUE)
if (eout$values[p] >= 0) {
return(target_cor)
}
shrink_vec <- seq(1, 0, length.out = num_steps)
valid <- FALSE
step_index <- 1
while (!valid) {
step_index <- step_index + 1
current_shrink <- shrink_vec[step_index]
eout <- eigen(top_left_cor - current_shrink * RRt, only.values = TRUE)
if (eout$values[p] >= 0) {
valid <- TRUE
}
}
return(sqrt(current_shrink) * target_cor)
}
# Generate multivariate normal random samples.
#
# @param mu A matrix of means. The rows index the independent samples,
# the columns index the variables.
# @param sigma A covariance matrix of the columns.
rmvnorm <- function(mu, sigma) {
stopifnot(nrow(sigma) == ncol(mu))
cholout <- chol(sigma)
simout <- matrix(stats::rnorm(n = prod(dim(mu))), nrow = nrow(mu)) %*% cholout + mu
return(simout)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.