knitr::opts_chunk$set( echo = TRUE, warning = FALSE, message = FALSE, error = FALSE, tidy = TRUE, eval = TRUE, dev = c("png", "pdf"), cache = TRUE )
Data whitening is a widely used preprocessing step to remove correlation structure since statistical models often assume independence (Kessy, et al. 2018). The typical procedures transforms the observed data by an inverse square root of the sample correlation matrix (Figure 1). For low dimension data (i.e. $n > p$), this transformation produces transformed data with an identity sample covariance matrix. This procedure assumes either that the true covariance matrix is know, or is well estimated by the sample covariance matrix. Yet the use of the sample covariance matrix for this transformation can be problematic since 1) the complexity is $\mathcal{O}(p^3)$ and 2) it is not applicable to the high dimensional (i.e. $n \ll p$) case since the sample covariance matrix is no longer full rank.
Here we use a probabilistic model of the observed data to apply a whitening transformation. Our Gaussian Inverse Wishart Empirical Bayes (GIW-EB) 1) model substantially reduces computational complexity, and 2) regularizes the eigen-values of the sample covariance matrix to improve out-of-sample performance.
```r$ after centering columns is $UDV^T$, the steps in the transformation are: A) Original data, B) Data rotated along principal components, C) Data rotated and scaled, D) Data rotated, scaled and rotated back to original axes. Green arrows indicate principal axes and lengths indicate eigen-values.", echo=FALSE} library(decorrelate) library(mvtnorm) library(ggplot2) library(cowplot) library(colorRamps) library(latex2exp) set.seed(2)
Sigma <- matrix(c(1, .9, .9, 2), ncol = 2)
Y <- rmvnorm(200, c(0, 0), Sigma)
dcmp <- eigen(cov(Y)) U <- whitening:::makePosDiag(dcmp$vectors) lambda <- dcmp$values
lim <- range(Y)
value <- (Y %*% U)[, 1]
plot_data <- function(Y, rotation, rank, lim) { dcmp <- eigen(cov(Y)) U <- whitening:::makePosDiag(dcmp$vectors) lambda <- dcmp$values
if (rotation == 1) { X <- Y
H <- with(eigen(cov(Y)), vectors %*% diag(values))
} else if (rotation == 2) { X <- Y %*% U
H <- with(eigen(cov(X)), vectors %*% diag(values))
} else if (rotation == 3) { X <- Y %% U %% diag(1 / sqrt(lambda))
H <- matrix(c(1, 0, 0, 1), ncol = 2)
} else if (rotation == 4) { X <- Y %% U %% diag(1 / sqrt(lambda)) %*% t(U)
H <- with(eigen(cov(Y)), vectors)
}
df <- data.frame(X, rank = scale(rank))
H <- whitening:::makePosDiag(H)
ggplot(df, aes(X1, X2, color = rank)) + geom_segment(x = 0, y = -lim[2], xend = 0, yend = lim[2], color = "black") + geom_segment(x = -lim[2], y = 0, xend = lim[2], yend = 0, color = "black") + geom_point(size = .8) + theme_void() + scale_color_gradient2(low = "blue2", mid = "grey60", high = "red2") + theme(aspect.ratio = 1, legend.position = "none", plot.title = element_text(hjust = 0.5)) + xlim(lim) + ylim(lim) + geom_segment(x = 0, y = 0, xend = H[1, 1], yend = H[2, 1], color = "#2fbf2e", arrow = arrow(length = unit(0.03, "npc")), size = 1) + geom_segment(x = 0, y = 0, xend = H[1, 2], yend = H[2, 2], color = "#2fbf2e", arrow = arrow(length = unit(0.03, "npc")), size = 1) }
fig1 <- plot_data(Y, rotation = 1, rank(value), lim) + ggtitle(TeX("$Y$")) fig2 <- plot_data(Y, rotation = 2, rank(value), lim) + ggtitle(TeX("$YV$")) fig3 <- plot_data(Y, rotation = 3, rank(value), lim) + ggtitle(TeX("$YVD^{-1}$")) fig4 <- plot_data(Y, rotation = 4, rank(value), lim) + ggtitle(TeX("$YVD^{-1}V^T$")) plot_grid(fig1, fig2, fig3, fig4, ncol = 4, labels = "AUTO")
# Basic usage ```r library(decorrelate) library(Rfast) n <- 500 # number of samples p <- 200 # number of features # create correlation matrix Sigma <- autocorr.mat(p, .9) # draw data from correlation matrix Sigma Y <- rmvnorm(n, rep(0, p), sigma = Sigma * 5.1, seed = 1) rownames(Y) <- paste0("sample_", 1:n) colnames(Y) <- paste0("gene_", 1:p) # eclairs decomposition implements GIW-EB method: # *E*stimate *c*ovariance/correlation with *l*ow *r*ank and *s*hrinkage ecl <- eclairs(Y) # decorrelate data using eclairs decomposition Y_whitened <- decorrelate(Y, ecl) # the same whitening can be performed with one command where # the eigen-value shrinkage is performed internally Y_whitened2 <- whiten(Y)
Here plot A) the eigen-values of the covariance matrix before and after shrinkage. B) The correlation of the observed and C) whitened data.
oldpar <- par(mfrow = c(1, 3)) # plot shrinkage of eigen-values plot(ecl) # correlation between variables in observed data image(cor(Y), axes = FALSE, main = "Correlation of observed data") # decorrelate data using eclairs decomposition image(cor(Y_whitened), axes = FALSE, main = "Correlation of whitened data") par(oldpar)
The decorrelate
package has advanced features to examine details of the whitening transformation.
While eclairs()
, decorrelate()
, and whiten()
perform the probabilistic whitening transformation efficiently without directly computing the whitening matrix, getWhiteningMatrix()
can directly compute the matrix.
# compute whitening matrix from eclairs decomposition W <- getWhiteningMatrix(ecl) # transform observed data using whitening matrix Z <- tcrossprod(Y, W) # evalute difference between whitened computed 2 ways max(abs(Z - Y_whitened))
The difference between function is due only to machine precision.
The full covariance for correlation matrix implied by the eclairs()
decomposition can be computed explicitly. Note that computing and storing these matries is $O(p^2)$, it may not feasable for large datasets.
# compute correlation matrix from eclairs getCor(ecl) # compute covariance matrix from eclairs getCov(ecl)
The form of the eclairs()
decomposition can be used to efficiently sample from a multivariage normal distribution with specified covariance.
# draw from multivariate normal n <- 1000 mu <- rep(0, ncol(Y)) # using eclairs decomposition X.draw1 <- rmvnorm_eclairs(n, mu, ecl)
A low rank eclairs()
decomposition can be computed more efficiently when $k$ is small relative to $min(n,p)$. Importantly, the emprical Bayes estimate of the shrinkage parameter $\lambda$ can still be computed accurately for sufficiently large $k$. Note that the low rank method trades computational efficientcy for accuracy in the whitening transform.
# use low rank decomposition with 50 components ecl <- eclairs(Y, k = 60) # decorrelate data using eclairs decomposition Y_whitened <- decorrelate(Y, ecl)
oldpar <- par(mfrow = c(1, 3)) # plot shrinkage of eigen-values plot(ecl) # decorrelate data using eclairs decomposition image(cor(Y_whitened), axes = FALSE, main = "Correlation of whitened data") par(oldpar)
In this case, the low rank whitening produces transformed features that are approximately independent. The approximation improves as the rank increases.
Compute the condition number (i.e. the ratio between the largest and smallest eigen-value) of the correlation/covariance matrix from the eclairs()
decomposition.
kappa(ecl)
By default eclairs()
computes the covariance between columns by using the default compute = "covariance"
. Running decorrelate()
using this removes the covariance between columns. Setting compute = "correlation"
, evaluates and then removes correlation between columns while retaining the variance.
library(clusterGeneration) # generate covariance matrix, where the diagonals (i.e. variances) vary Sigma <- genPositiveDefMat(p, rangeVar=c(1, 1e6))$Sigma Y <- rmvnorm(n, rep(0, p), sigma = Sigma) # examine variances of the first 5 variables apply(Y, 2, var)[1:5] # transform removes covariance between columns # so variance of transformed features are *approximately* equal ecl_cov <- eclairs(Y, compute = "covariance") Z1 <- decorrelate(Y, ecl_cov) # variance are *approximately* equal apply(Z1, 2, var)[1:5] # transform removes **correlation** between columns # but variables are not scaled ecl_cor <- eclairs(Y, compute = "correlation") Z2 <- decorrelate(Y, ecl_cor) # variances are not standardized apply(Z2, 2, var)[1:5]
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.