In stephenslab/susieR: Sum of Single Effects Linear Regression

knitr::opts_chunk$set(echo = TRUE)

simulate data

This is Lei's example

set.seed(777)
library(susieR)
X <- matrix(rnorm(1010 * 1000), 1010, 1000)
beta <- rep(0, 1000)
beta[1 : 200] <- 100
y <- X %*% beta + rnorm(1010)
s = susie(X,y,L=200)

plot(coef(s),beta)
s$sigma2

# fit <- lm(y ~ X - 1)
# mlr.p <- log(summary(fit)$coefficients[, 4])
# 
mar.p <- c()
mar.betahat = c()
for (i in 1 : 1000) {
 fit <- lm(y ~ X[, i] - 1)
  mar.p[i] <- log(summary(fit)$coefficients[, 4])
  mar.betahat[i] <- summary(fit)$coefficients[, 1]
}
# 
# pdf("pvalue.pdf", width = 10, height = 5)
# par(mfrow = c(1, 2))
# plot(mlr.p, ylab = "log(p-value)", main = "Multiple Linear Regression")
# abline(h = log(0.05 / 1000), lty = 2, col = "red")
# legend("right", lty = 2, col = "red", "log(0.05/p)")
# 
# plot(mar.p, ylab = "log(p-value)", main = "One-on-One Linear Regression")
# abline(h = log(0.05 / 1000), lty = 2, col = "red")

Notice that the coefficients are monotonic with betahat. Some shrinkage of zero values is evident, but it is not enough... presumably because sigma2 is way over-estimated. And further we see excessive shrinkage of true signals, presumably because sa2 is too small.

plot(coef(s),mar.betahat)

Here we try fixing $L$ and residual variance to true value.

strue = susie(X,y,L=200,residual_variance =1,estimate_residual_variance =FALSE)
plot(coef(strue),beta)
strue$elbo

it works!!

plot(strue$alpha[1,])
plot(strue$alpha[2,])

Try with very small residual variance

s3 = susie(X,y,L=200, residual_variance = 0.01,estimate_residual_variance = FALSE)
plot(coef(s3))
s4 = susie(X,y,s_init = s3)
plot(coef(s4))
s4$elbo

That is weird it goes away from the solution!

Try with estimating prior:

s5 = susie(X,y,s_init = s3, estimate_prior_variance = TRUE)
plot(coef(s5))
s5$elbo
sqrt(s5$sa2)
sqrt(s4$sa2)

much better!

Now try too many effects

s3.300 = susie(X,y,L=300, residual_variance = 0.01,estimate_residual_variance = FALSE)
s5.300 = susie(X,y,s_init = s3.300, estimate_prior_variance = TRUE)
plot(coef(s3.300))
plot(coef(s5.300))
s3.300$elbo
s5.300$elbo
sum(s5.300$sa2>0)

Now try too many effects but just a very small number of iterations for initial case:

s3.300.5 = susie(X,y,L=300, residual_variance = 0.01,estimate_residual_variance = FALSE, max_iter = 5)
s5.300.5 = susie(X,y,s_init = s3.300.5, estimate_prior_variance = TRUE)
plot(coef(s3.300.5))
plot(coef(s5.300.5))
s3.300.5$elbo
s5.300.5$elbo
plot(colSums(s5.300.5$alpha))

Q: does the initial run with small variance gradually find the smaller effects, or does it get them from the first iteration? Could look at that by doing one iteration at a time.

s3.300.1 = susie(X,y,L=300, residual_variance = 0.01,estimate_residual_variance = FALSE, max_iter = 1)
plot(coef(s3.300.1))
s3.300.2 = susie(X,y,s_init=s3.300.1,estimate_residual_variance = FALSE, max_iter = 1)
plot(coef(s3.300.2))
s3.300 = susie(X,y,s_init=s3.300.2,estimate_residual_variance = TRUE)