# simulations/simulation_utils.R In lingxuez/sLED: A Sparse Leading Eigenvalue Driven (sLED) Test for High-dimensional Matrices

#' Helper functions to simulate data

#' Simulate samples
#'
#' @param n1 number of samples in first population
#' @param n2 number of samples in second population
#' @param p feature dimension
#' @param gamma.1 square root of the first covariance matrix
#' @param gamma.2 square root of the second covariance matrix
#' @param sample.dist sample distribution. See \strong{Details} below.
#'
#' @details The function generates independent zero-mean p-dimensional
#' random variables Z_i, where each coordinate is
#' independently sampled from the given distribution \code{sample.dist}.
#' Then the samples are generated by
#' X = gamma.1 %*% Z1,
#' Y.null = gamma.1 %*% Z2,
#' Y.alt = gamma.2 %*% Z3.
#' This leads to cov(X)=cov(Y.null)=sigma.1, and cov(Y.alt)=sigma.2.
#'
#' The possible values for \code{sample.dist} include
#' \tabular{ll}{
#' "normal" \tab Standard normal distribution. \cr
#' "nb" \tab Centralized Negative binomial with mu=2, size=2 (the dispersion parameter),
#'            which implies variance = mu+mu^2/size = 4.
#'            The theoretical mean is substracted from samples.\cr
#' "t" \tab Student t-distribution with df=12.\cr
#' "gamma" \tab centralized Gamma(4, 0.5)
#'        (i.e., the theoretical expectation 2 is substracted from samples).
#' }
#'
#' @return A list containing
#' \item{X}{Independent samples from the first population, \eqn{n1} by \eqn{p} matrix.}
#' \item{Y.null}{Independent samples from the first population, \eqn{n2} by \eqn{p} matrix.}
#' \item{Y.alt}{Independent samples from the second population, \eqn{n2} by \eqn{p} matrix.}
simulate.samples <- function(n1, n2, p, gamma.1, gamma.2, sample.dist="normal") {
if (sample.dist=="normal") {
Z1 <- matrix(rnorm(n1*p), nrow=n1, ncol=p)
Z2 <- matrix(rnorm(n2*p), nrow=n2, ncol=p)
Z3 <- matrix(rnorm(n2*p), nrow=n2, ncol=p)

} else if (sample.dist == "nb") {
mu <- 2
size <- 2
Z1 <- matrix(rnbinom(n1*p, mu=mu, size=size), nrow=n1, ncol=p) - mu
Z2 <- matrix(rnbinom(n2*p, mu=mu, size=size), nrow=n2, ncol=p) - mu
Z3 <- matrix(rnbinom(n2*p, mu=mu, size=size), nrow=n2, ncol=p) - mu

} else if (sample.dist == "t") {
df <- 12
Z1 <- matrix(rt(n1*p, df=df), nrow=n1, ncol=p)
Z2 <- matrix(rt(n2*p, df=df), nrow=n2, ncol=p)
Z3 <- matrix(rt(n2*p, df=df), nrow=n2, ncol=p)

} else if (sample.dist == "gamma") {
shape <- 4
scale <- 0.5
Z1 <- matrix(rgamma(n1*p, shape=shape, scale=scale), nrow=n1, ncol=p) - shape*scale
Z2 <- matrix(rgamma(n2*p, shape=shape, scale=scale), nrow=n2, ncol=p) - shape*scale
Z3 <- matrix(rgamma(n2*p, shape=shape, scale=scale), nrow=n2, ncol=p) - shape*scale

} else {
stop("Unknown values of sample.dist.")
}

return(list(X = Z1 %*% gamma.1, Y.null = Z2 %*% gamma.1, Y.alt = Z3 %*% gamma.2))
}

#' Generate covariance matrix
#'
#' Generate two covariance matrices \code{sigma.1} and \code{sigma.2}
#' for null and alternative hypothesis, respectively.
#' For the convenience of later simulations, the square root version
#' \code{gamma.1} and \code{gamm.2} are returned, such that
#' sigma.1 = gamma.1 %*% t(gamma.1),
#' sigma.2 = gamma.2 %*% t(gamma.2).
#'
#' @param p feature dimension
#' @param cov.method method for generating the covariance matrix under null.
#'            See \strong{Details} below.
#' @param diff.method the method for generating the differential matrix.
#'            See \strong{Details} below.
#'
#' @details The \code{cov.method} argument is a character string
#' that specifies the type of covariance matrix to be generated for null hypothesis.
#' Possible values are:
#'
#' \tabular{ll}{
#' "block.diag" \tab block diagonal with 10 blocks.\cr
#' "exp.decay" \tab exponential decay.\cr
#' "noisy.diag" \tab diagonal matrix with sparse noise at off-diagonal elements.
#' }
#'
#' The \code{diff.method} argument is a character string
#' that specifies the type of differential matrix to be generated,
#' such that \code{sigma.2} = \code{sigma.1} + \code{diff.matrix}
#' for alternative hypothesis.
#' Possible values are:
#' \tabular{ll}{
#' "block" \tab \code{diff.matrix} is non-zero in an s-by-s sub-block,
#'              with non-zero entries generated from a uniform distribution.\cr
#' "spiked" \tab \code{diff.matrix} is a rank-one matrix \eqn{d v v^T},
#'              where \code{d} is a scalar controling the signal strength,
#'              and \code{v} is a unit vector.
#'              \code{v} has \code{2s} non-zero elements,
#'              where \code{s} of them have larger values.
#' }
#'
#' @return A list of two square root of covariance matrix:
#' \item{gamma.1}{The square root of covariance matrix under null hypothesis.}
#' \item{gamma.2}{The square root of covariance matrix under alternative hypothesis.}
get.cov.matrix <- function(p, cov.method="block.diag", diff.method="block",
theta=0.5, s=1) {
## base matrix
sigma.1 <- get.base.matrix(p, cov.method)
diff.matrix <- get.diff.matrix(p=p, diff.method=diff.method, theta=theta,
max.var=max(abs(diag(sigma.1))), s=s)
sigma.2 <- sigma.1 + diff.matrix
if ((!isSymmetric(sigma.1)) || (!isSymmetric(sigma.2))) {
stop("Something is wrong: sigmas are not symmetric.")
}

## make both sigma matrices positive definite

return(list(gamma.1=get.matrix.sqrt(sigma.1),
gamma.2=get.matrix.sqrt(sigma.2),
diff.matrix=diff.matrix))
}

#' The square root of a positive definite matrix
#' @param A a symmetric positive definite matrix
#' @return the square root of \eqn{A}.
get.matrix.sqrt <- function(A) {
A.eig <- eigen(A)
A.sqrt <- A.eig$vectors %*% diag(sqrt(A.eig$values)) %*% t(A.eig$vectors) return (A.sqrt) } #' Generate the base matrix #' #' Note: the base matrix may not be positive definite. #' @param p feature dimension #' @param cov.method method for generating the base matrix. #' see \strong{Details} below. #' #' @details The \code{cov.method} argument is a character string #' that specifies the type of base matrix to be generated. #' Possible values are: #' #' \tabular{ll}{ #' "block.diag" \tab block diagonal with 10 blocks.\cr #' "exp.decay" \tab exponential decay.\cr #' "noisy.diag" \tab diagonal matrix with sparse noise at off-diagonal elements. #' } #' @return A \eqn{p} by \eqn{p} base matrix, which will be used #' to generate the covariance matrix. get.base.matrix <- function(p, cov.method="block.diag") { if (cov.method == "block.diag") { ## parameter: block size and entries in block block.num <- 10 off.value <- 0.55 ## covariance A <- matrix(0, nrow=p, ncol=p) block.size <- ceiling(p/block.num) for (k in c(1:block.num)) { i.start <- (k-1)*block.size + 1 i.end <- min(k*block.size, p) A[i.start:i.end, i.start:i.end] <- off.value } diag(A) <- 1 } else if (cov.method == "exp.decay") { ## parameter: sigma_ij = delta^{|i-j|^rho} delta <- 0.5 rho <- 1 ## covariance A <- matrix(nrow=p, ncol=p) A <- delta^(abs(row(A) - col(A))^rho) } else if (cov.method == "noisy.diag") { A <- diag(p) A[upper.tri(A, diag=FALSE)] <- rbinom(n=p*(p-1)/2, size=1, prob=0.05) A[lower.tri(A, diag=FALSE)] <- t(A)[lower.tri(A, diag=FALSE)] } else { stop("Unknown value of cov.method.") } ## add variance on diagonal var.range <- c(0.5, 2.5) diag.sd <- sqrt(runif(p, min=var.range[1], max=var.range[2])) base.matrix <- diag(diag.sd) %*% A %*% diag(diag.sd) if (!isSymmetric(base.matrix)) { stop("Something is wrong: base.matrix is not symmetric!") } return (base.matrix) } #' Adjust base matrix to be positive definite #' #' @param base.matrix.1 a \eqn{p} by \eqn{p} base matrix #' @param base.matrix.2 optional, another \eqn{p} by \eqn{p} base matrix. #' #' @details #' If both \code{base.matrix.1} and \code{base.matrix.2} are provided, then #' the function finds a postive constant \code{lambda}, such that #' \code{base.matrix.1} + \code{lambda} I #' and #' \code{base.matrix.2} + \code{lambda} I #' are both positive definite, with minimum eigenvalues being at least 0.05. #' If only \code{base.matrix.1} is provided, the function finds #' \code{base.matrix.1} + \code{lambda} I #' such that its minimum eigenvalue is at least 0.05. #' #' @return \code{lambda}, a positive constant to adjust given base matrix(es). adj.base.matrix <- function(base.matrix.1, base.matrix.2=NULL) { min.eigen <- rARPACK::eigs_sym(base.matrix.1, k=1, which="SA")$values
if (! is.null(base.matrix.2)) {
min.eigen.2 <- rARPACK::eigs_sym(base.matrix.2, k=1, which="SA")\$values
min.eigen <- min(min.eigen, min.eigen.2)
}
return (abs(min.eigen) + 0.05)
}

#' Generate the differential matrix
#'
#' @param p feature dimension
#' @param diff.method the method for generating the differential matrix.
#'            See \strong{Details} below.
#'
#' @details The \code{diff.method} argument is a character string
#' that specifies the type of differential matrix to be generated,
#' such that \code{sigma.2} = \code{sigma.1} + \code{diff.matrix}.
#' Possible values are:
#' \tabular{ll}{
#' "block" \tab \code{diff.matrix} is non-zero in an s-by-s sub-block,
#'              with entries generated from uniform distribution.\cr
#' "spiked" \tab \code{diff.matrix} is a rank-one matrix \eqn{d v v^T},
#'              where \code{d} is a scalar controling the signal strength,
#'              and \code{v} is a unit vector.
#'              \code{v} has \code{2s} non-zero elements,
#'              where \code{s} of them have larger values.
#' }
#'
#' @return A \eqn{p} by \eqn{p} symmetric differential matrix.
get.diff.matrix <- function(p, diff.method, theta=1, max.var=1, s=2) {
if (diff.method == "block") {
## signal level
delta <- theta * sqrt(max.var * log(p))
signal <- matrix(runif(s*s, min=delta/2, max=2*delta), nrow=s, ncol=s)
## make it symmetric
signal[lower.tri(signal, diag=FALSE)] <- t(signal)[lower.tri(signal, diag=FALSE)]

diff.matrix <- matrix(0, nrow=p, ncol=p)
diff.matrix[1:s, 1:s] <- signal

} else if (diff.method == "spiked") {
## unit vector
v <- rep(0, p)
v[1:(2*s)] <- rnorm(2*s, mean=0.1, sd=0.1) ## weak signals
v[1:s] <- rnorm(s, mean=1, sd=0.1) ## strong signals
v <- v / sqrt(sum(v^2)) ## normalize

## signal level
delta <- theta * sqrt(max.var * log(p))
diff.matrix <- delta * v %*% t(v)

} else {
stop("Unknown value of diff.method.")
}

return (diff.matrix)
}

lingxuez/sLED documentation built on Oct. 6, 2017, 12:14 a.m.