Fast Whitening Transformation
In decorrelate: Decorrelation Projection Scalable to High Dimensional Data

knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  error = FALSE,
  tidy = TRUE,
  eval = TRUE,
  dev = c("png", "pdf"),
  cache = TRUE
)

Data whitening is a widely used preprocessing step to remove correlation structure since statistical models often assume independence (Kessy, et al. 2018). The typical procedures transforms the observed data by an inverse square root of the sample correlation matrix (Figure 1). For low dimension data (i.e. $n > p$), this transformation produces transformed data with an identity sample covariance matrix. This procedure assumes either that the true covariance matrix is know, or is well estimated by the sample covariance matrix. Yet the use of the sample covariance matrix for this transformation can be problematic since 1) the complexity is $\mathcal{O}(p^3)$ and 2) it is not applicable to the high dimensional (i.e. $n \ll p$) case since the sample covariance matrix is no longer full rank.

Here we use a probabilistic model of the observed data to apply a whitening transformation. Our Gaussian Inverse Wishart Empirical Bayes (GIW-EB) 1) model substantially reduces computational complexity, and 2) regularizes the eigen-values of the sample covariance matrix to improve out-of-sample performance.

```r$ after centering columns is $UDV^T$, the steps in the transformation are: A) Original data, B) Data rotated along principal components, C) Data rotated and scaled, D) Data rotated, scaled and rotated back to original axes. Green arrows indicate principal axes and lengths indicate eigen-values.", echo=FALSE} library(decorrelate) library(mvtnorm) library(ggplot2) library(cowplot) library(colorRamps) library(latex2exp) set.seed(2)

Sigma <- matrix(c(1, .9, .9, 2), ncol = 2)

Y <- rmvnorm(200, c(0, 0), Sigma)

dcmp <- eigen(cov(Y)) U <- whitening:::makePosDiag(dcmp$vectors) lambda <- dcmp$values

lim <- range(Y)

value <- (Y %*% U)[, 1]

plot_data <- function(Y, rotation, rank, lim) { dcmp <- eigen(cov(Y)) U <- whitening:::makePosDiag(dcmp$vectors) lambda <- dcmp$values

if (rotation == 1) { X <- Y

H <- with(eigen(cov(Y)), vectors %*% diag(values))

} else if (rotation == 2) { X <- Y %*% U

H <- with(eigen(cov(X)), vectors %*% diag(values))

} else if (rotation == 3) { X <- Y %% U %% diag(1 / sqrt(lambda))

H <- matrix(c(1, 0, 0, 1), ncol = 2)

} else if (rotation == 4) { X <- Y %% U %% diag(1 / sqrt(lambda)) %*% t(U)

H <- with(eigen(cov(Y)), vectors)

}

df <- data.frame(X, rank = scale(rank))

H <- whitening:::makePosDiag(H)

ggplot(df, aes(X1, X2, color = rank)) + geom_segment(x = 0, y = -lim[2], xend = 0, yend = lim[2], color = "black") + geom_segment(x = -lim[2], y = 0, xend = lim[2], yend = 0, color = "black") + geom_point(size = .8) + theme_void() + scale_color_gradient2(low = "blue2", mid = "grey60", high = "red2") + theme(aspect.ratio = 1, legend.position = "none", plot.title = element_text(hjust = 0.5)) + xlim(lim) + ylim(lim) + geom_segment(x = 0, y = 0, xend = H[1, 1], yend = H[2, 1], color = "#2fbf2e", arrow = arrow(length = unit(0.03, "npc")), size = 1) + geom_segment(x = 0, y = 0, xend = H[1, 2], yend = H[2, 2], color = "#2fbf2e", arrow = arrow(length = unit(0.03, "npc")), size = 1) }

fig1 <- plot_data(Y, rotation = 1, rank(value), lim) + ggtitle(TeX("$Y$")) fig2 <- plot_data(Y, rotation = 2, rank(value), lim) + ggtitle(TeX("$YV$")) fig3 <- plot_data(Y, rotation = 3, rank(value), lim) + ggtitle(TeX("$YVD^{-1}$")) fig4 <- plot_data(Y, rotation = 4, rank(value), lim) + ggtitle(TeX("$YVD^{-1}V^T$")) plot_grid(fig1, fig2, fig3, fig4, ncol = 4, labels = "AUTO")

# Basic usage
```r
library(decorrelate)
library(Rfast)

n <- 500 # number of samples
p <- 200 # number of features

# create correlation matrix
Sigma <- autocorr.mat(p, .9)

# draw data from correlation matrix Sigma
Y <- rmvnorm(n, rep(0, p), sigma = Sigma * 5.1, seed = 1)
rownames(Y) <- paste0("sample_", 1:n)
colnames(Y) <- paste0("gene_", 1:p)

# eclairs decomposition implements GIW-EB method:
# *E*stimate *c*ovariance/correlation with *l*ow *r*ank and *s*hrinkage
ecl <- eclairs(Y)

# decorrelate data using eclairs decomposition
Y_whitened <- decorrelate(Y, ecl)

# the same whitening can be performed with one command where
# the eigen-value shrinkage is performed internally
Y_whitened2 <- whiten(Y)

Here plot A) the eigen-values of the covariance matrix before and after shrinkage. B) The correlation of the observed and C) whitened data.

oldpar <- par(mfrow = c(1, 3))

# plot shrinkage of eigen-values
plot(ecl)

# correlation between variables in observed data
image(cor(Y), axes = FALSE, main = "Correlation of observed data")

# decorrelate data using eclairs decomposition
image(cor(Y_whitened), axes = FALSE, main = "Correlation of whitened data")
par(oldpar)

Advanced usage

The decorrelate package has advanced features to examine details of the whitening transformation.

Directly compute whitening matrix

While eclairs(), decorrelate(), and whiten() perform the probabilistic whitening transformation efficiently without directly computing the whitening matrix, getWhiteningMatrix() can directly compute the matrix.

# compute whitening matrix from eclairs decomposition
W <- getWhiteningMatrix(ecl)

# transform observed data using whitening matrix
Z <- tcrossprod(Y, W)

# evalute difference between whitened computed 2 ways
max(abs(Z - Y_whitened))

The difference between function is due only to machine precision.

Explicit covariance or correlation

The full covariance for correlation matrix implied by the eclairs() decomposition can be computed explicitly. Note that computing and storing these matries is $O(p^2)$, it may not feasable for large datasets.

# compute correlation matrix from eclairs
getCor(ecl)

# compute covariance matrix from eclairs
getCov(ecl)

Sample from multivariate normal

The form of the eclairs() decomposition can be used to efficiently sample from a multivariage normal distribution with specified covariance.

# draw from multivariate normal
n <- 1000
mu <- rep(0, ncol(Y))

# using eclairs decomposition
X.draw1 <- rmvnorm_eclairs(n, mu, ecl)

Low-rank models

A low rank eclairs() decomposition can be computed more efficiently when $k$ is small relative to $min(n,p)$. Importantly, the emprical Bayes estimate of the shrinkage parameter $\lambda$ can still be computed accurately for sufficiently large $k$. Note that the low rank method trades computational efficientcy for accuracy in the whitening transform.

# use low rank decomposition with 50 components
ecl <- eclairs(Y, k = 60)

# decorrelate data using eclairs decomposition
Y_whitened <- decorrelate(Y, ecl)

oldpar <- par(mfrow = c(1, 3))

# plot shrinkage of eigen-values
plot(ecl)

# decorrelate data using eclairs decomposition
image(cor(Y_whitened), axes = FALSE, main = "Correlation of whitened data")
par(oldpar)

In this case, the low rank whitening produces transformed features that are approximately independent. The approximation improves as the rank increases.

Computing condition number

Compute the condition number (i.e. the ratio between the largest and smallest eigen-value) of the correlation/covariance matrix from the eclairs() decomposition.

kappa(ecl)

Removing correlation vs covariance

By default eclairs() computes the covariance between columns by using the default compute = "covariance". Running decorrelate() using this removes the covariance between columns. Setting compute = "correlation", evaluates and then removes correlation between columns while retaining the variance.

library(clusterGeneration)

# generate covariance matrix, where the diagonals (i.e. variances) vary
Sigma <- genPositiveDefMat(p, rangeVar=c(1, 1e6))$Sigma

Y <- rmvnorm(n, rep(0, p), sigma = Sigma)

# examine variances of the first 5 variables
apply(Y, 2, var)[1:5]

# transform removes covariance between columns
# so variance of transformed features are *approximately* equal
ecl_cov <- eclairs(Y, compute = "covariance")
Z1 <- decorrelate(Y, ecl_cov)

# variance are *approximately* equal
apply(Z1, 2, var)[1:5]

# transform removes **correlation** between columns
# but variables are not scaled
ecl_cor <- eclairs(Y, compute = "correlation")
Z2 <- decorrelate(Y, ecl_cor)

# variances are not standardized
apply(Z2, 2, var)[1:5]