In dleelab/iasvaExamples: iasva example data package

knitr::opts_chunk$set(comment = "#",collapse = TRUE)

rm(list=ls())
library(iasva)
library(sva)
library(SummarizedExperiment)
library(corrplot)

The reviewer 2's simulation #1 with weak unknown factor.

# Setting up the known and unknown factors. 
set.seed(10000) 
known <- runif(1000) 
unknown <- runif(1000, 0, 0.1) # weak unknown factor 

# Generating Poisson count data from log-normal means. 
ngenes <- 5000 
known.effect <- rnorm(ngenes, sd=2) 
unknown.effect <- rnorm(ngenes, sd=2) 
mat <- outer(known.effect, known) + outer(unknown.effect, unknown) + 2 # to get decent-sized counts 

counts <- matrix(rpois(length(mat), lambda=2^mat), nrow=ngenes) 
library(SummarizedExperiment) 
se <- SummarizedExperiment(list(counts=counts)) 

# Running IA-SVA, version 0.99.3. 
library(iasva) 
design <- model.matrix(~known) 
res <- iasva(se, design, num.sv=5, permute = FALSE) 

cor(res$sv[,1], unknown) # these SVs are not the unknown factor (correlation ~= 0) 
cor(res$sv[,2], unknown) 
cor(res$sv[,3], unknown) 
cor(res$sv[,4], unknown) 
cor(res$sv[,5], unknown) 

cor(res$sv[,1], known) # ... but are instead the known factor (correlation ~= +/-1) 
cor(res$sv[,2], known) 
cor(res$sv[,3], known) 
cor(res$sv[,4], known) 
cor(res$sv[,5], known) 

## Compare to just naively taking the first PC of the residual matrix. 
library(limma) 
resid <- removeBatchEffect(log(assay(se)+1), covariates=known) 
pr.out <- prcomp(t(resid), rank.=1) 
abs(cor(pr.out$x[,1], unknown)) # close to 1 
abs(cor(pr.out$x[,1], known)) # close to zero. 

## Compare to SVA
library(sva)
mod1 <- model.matrix(~known)
mod0 <- cbind(mod1[,1])
sva.res = svaseq(counts,mod1,mod0, n.sv=5)$sv
abs(cor(sva.res[,1], unknown)) # 0.55
abs(cor(sva.res[,1], known)) # close to zero.

hf.mat1 <- cbind(known, unknown, res$sv[,1], pr.out$x[,1], sva.res[,1])
colnames(hf.mat1) <- c("KF","HF","IASVA","PC1_Residual","SVA")
corrplot.mixed(abs(cor(hf.mat1)), tl.pos="lt")

The reviewer 2's simulation #2 with stronger effect size.

set.seed(10000) 

known <- runif(1000) 
unknown <- runif(1000) 

ngenes <- 5000 
known.effect <- rnorm(ngenes, sd=2) 
unknown.effect <- rnorm(ngenes, sd=2) 
mat <- outer(known.effect, known) + outer(unknown.effect, unknown) + 2 # to get decent-sized counts 

counts <- matrix(rpois(length(mat), lambda=2^mat), nrow=ngenes) 
library(SummarizedExperiment) 
se <- SummarizedExperiment(list(counts=counts)) 

library(iasva) 
design <- model.matrix(~known) 
res <- iasva(se, design, num.sv=5, permute = FALSE) 

cor(res$sv[,1], unknown) # first SV is the unknown factor. 
cor(res$sv[,2], unknown) 
cor(res$sv[,3], unknown) 
cor(res$sv[,4], unknown) 
cor(res$sv[,5], unknown) 

cor(res$sv[,1], known) 
cor(res$sv[,2], known) # but SVs 2-5 are correlated with the known factor... 
cor(res$sv[,3], known) 
cor(res$sv[,4], known) 
cor(res$sv[,5], known) 

## Compare to just naively taking the first PC of the residual matrix. 
library(limma) 
resid <- removeBatchEffect(log(assay(se)+1), covariates=known) 
pr.out <- prcomp(t(resid), rank.=1) 
abs(cor(pr.out$x[,1], unknown)) # close to 1 
abs(cor(pr.out$x[,1], known)) # close to zero.

# Compare to SVA
library(sva)
mod1 <- model.matrix(~known)
mod0 <- cbind(mod1[,1])
sva.res = svaseq(counts,mod1,mod0, n.sv=1)$sv
abs(cor(sva.res[,1], unknown)) # close to 1 
abs(cor(sva.res[,1], known)) # close to zero.

hf.mat2 <- cbind(known, unknown, res$sv[,1], pr.out$x[,1], sva.res[,1])
colnames(hf.mat2) <- c("KF","HF","IASVA","PC1_Residual","SVA")
corrplot.mixed(abs(cor(hf.mat2)), tl.pos="lt")

Several issues in the simulation studies suggested by the reviewer #2

We noticed several issues in these simulations. First, SVA or factor analysis based methods including IA-SVA are designed to estimate hidden variables with distinct levels (i.e., factors) or a transformation of factors. However, instead of simulating factors (e.g., a variable with two levels (1 and -1)) as typically done in such analyses1,2 (e.g., see http://jtleek.com/svaseq/simulateData.html and Method section (Simulated data) of https://www.biorxiv.org/content/biorxiv/early/2017/10/27/200345.full.pdf), the reviewer simulated numeric vectors from a uniform distribution (e.g., runif(1000)) and used them as hidden variables. We note that this leads, like IA-SVA, SVA also to perform poorly in the simulation study (r = -0.55) (see https://dleelab.github.io/iasvaExamples/reviewer2_sim.html). Second, IA-SVA does not accept intercept column in the design matrix; therefore instead of design, design[,-1] should have been used in these simulations. Third, the reviewer simulated counts using a Poisson distribution (mean = variance); however, negative binomial or zero-inflated negative binomial is a more appropriate model to simulate high variability (mean << variance) in bulk or single cell RNA-seq data. Fourth, even though only one hidden factor is simulated in the study, IA-SVA was forced to generate five hidden factors by setting the option ‘num.sv’ = 5. This leads IA-SVA to generate SV2-SV5, which are highly correlated SV1. To avoid this, we recommend users using the permutation procedure (by setting the default setting for option ‘permute’ (i.e, permute = TRUE) in ‘iasva’ function) or ‘fast_iasva’ function to determine the number of meaningful hidden factors. Lastly, to simulate a hidden factor with stronger effect sizes, the standard deviation (sd) in “unknown.effect <- rnorm(ngenes, sd=2) ” should be increased (e.g., sd=4); rather increasing the range of simulated factor values from “unknown <- runif(1000, 0, 0.1)” to “unknown <- runif(1000) ”. We have repeated the simulation studies by taking into consideration these points below.

Corrected simulation with low correlation btw known and hidden factors

# Setting up the known and unknown factors. 
set.seed(10000) 

## to reduce the computational burden, we reduced the sample size (# of cells) and # of genes.
sample.size <- 100 
ngenes <- 2000     

# CORRECTEION: Here, we simulate factors with two levels (-1 or 1).
factor.prop <- 0.5
flip.prob <- 0.5 ## to make no correlation btw known and hidden factors
known <- c(rep(-1,each=sample.size*factor.prop),rep(1,each=sample.size-(sample.size*factor.prop)))
coinflip = rbinom(sample.size,size=1,prob=flip.prob)
unknown = known*coinflip + -known*(1-coinflip)
cor(known, unknown)

# Generating Poisson count data from log-normal means. 
# CORRECTION: to make factor with weak effect size, we use 1 as SD here.
known.effect <- rnorm(ngenes, sd=1) 
unknown.effect <- rnorm(ngenes, sd=1)

# To make estimation of the hidden factor more challenging, here we ramdomly select 1500 (75%) and 1900 (95%) genes from each effect size vector and set their effect sizes as 0. That is, only 25% and 5% of genes are affected by the known and hidden factors, respectively. 
known.effect[sample(1:ngenes, 1500, replace=FALSE)] <- 0
unknown.effect[sample(1:ngenes, 1900, replace=FALSE)] <- 0

mat <- outer(known.effect, known) + outer(unknown.effect, unknown) + 2 # to get decent-sized counts 
counts <- matrix(rpois(length(mat), lambda=2^mat), nrow=ngenes)
se <- SummarizedExperiment(assays=counts) 

# Running IA-SVA, version 0.99.3. 
library(iasva) 
design <- model.matrix(~known) 

# CORRECTION: IASVA doesn't accept the constant term of design matrix, so we use design[,-1] instead of design
res <- iasva(se, as.matrix(design[,-1]), threads=8) # iasva detected only one hidden factor.
# fast_iasva used here.
res.fast <- fast_iasva(se, as.matrix(design[,-1]))  # fast_iasva detected only one hidden factor.

#plot(res$sv[,1], unknown) 
abs(cor(res$sv[,1], unknown)) ## absolute value of cor(SV1, unknown factor) = 0.9995
abs(cor(res$sv[,1], known))   ## absolute value of cor(SV1, known factor) close to zero.

#plot(res.fast$sv[,1], unknown) 
abs(cor(res.fast$sv[,1], unknown)) ## absolute value of cor(SV1, unknown factor) = 0.9995
abs(cor(res.fast$sv[,1], known))   ## absolute value of cor(SV1, known factor) close to zero.


# Compare to just naively taking the first PC of the residual matrix. 
library(limma) 
resid <- removeBatchEffect(log(assay(se)+1), covariates=known) 
pr.out <- prcomp(t(resid), rank.=1) 
#plot(pr.out$x[,1], unknown) 
abs(cor(pr.out$x[,1], unknown)) # close to 1 
abs(cor(pr.out$x[,1], known)) # close to zero. 


# Compare to SVA
library(sva)
mod1 <- model.matrix(~known)
mod0 <- cbind(mod1[,1])
sva.res = svaseq(counts,mod1,mod0)$sv
#plot(sva.res[,1], unknown)
abs(cor(sva.res[,1], unknown)) # close to 1 
abs(cor(sva.res[,1], known)) # close to zero. 

hf.mat3 <- cbind(known, unknown, res$sv[,1], pr.out$x[,1], sva.res[,1])
colnames(hf.mat3) <- c("KF","HF","IASVA","PC1_Residual","SVA")
corrplot.mixed(abs(cor(hf.mat3)), tl.pos="lt")

Corrected Simulation with high correlation btw known and hidden factors with stronger effect sizes

Here, we simulated known and hidden factors are strongly correlated (r=0.7) and affect 25% and 5% of genes respectively. We also simulated stronger effect sizes by increasing sd from 1 to 2 when simulating effect sizes from normal distribution,"unknown.effect <- rnorm(ngenes, sd=2)".

# Setting up the known and unknown factors. 
set.seed(10000) 

# to reduce the computational burden, we reduced the sample size (# of cells) and # of genes.
sample.size <- 100 
ngenes <- 2000     

# CORRECTEION: Here, we simulate factors with two levels (-1 or 1).
factor.prop <- 0.5
flip.prob <- 0.8 # to simulate high correlation btw known and hidden factors
known <- c(rep(-1,each=sample.size*factor.prop),rep(1,each=sample.size-(sample.size*factor.prop)))
coinflip = rbinom(sample.size,size=1,prob=flip.prob)
unknown = known*coinflip + -known*(1-coinflip)
cor(known, unknown) # correlation between known and unknown factors = 0.7

# Generating Poisson count data from log-normal means. 
# CORRECTION: to make factor with stronger effect size, we use 2 as SD here.
known.effect <- rnorm(ngenes, sd=2) 
unknown.effect <- rnorm(ngenes, sd=2)

# To make estimation of the hidden factor more challenging, here we ramdomly select 1500 (75%) and 1900 (95%) genes from each effect size vector and set their effect sizes as 0. That is, only 25% and 5% of genes are affected by the known and hidden factors, respectively. 
known.effect[sample(1:ngenes, 1500, replace=FALSE)] <- 0
unknown.effect[sample(1:ngenes, 1900, replace=FALSE)] <- 0

mat <- outer(known.effect, known) + outer(unknown.effect, unknown) + 2 # to get decent-sized counts 
counts <- matrix(rpois(length(mat), lambda=2^mat), nrow=ngenes)
se <- SummarizedExperiment(assays=counts) 

# Running IA-SVA, version 0.99.3. 
library(iasva) 
design <- model.matrix(~known) 

# CORRECTION: IASVA doesn't accept the constant term of design matrix, so we use design[,-1] instead of design
res <- iasva(se, as.matrix(design[,-1]), threads=8) # iasva detected only one hidden factor.
# fast_iasva used here.
res.fast <- fast_iasva(se, as.matrix(design[,-1]))  # fast_iasva detected only one hidden factor.

#plot(res$sv[,1], unknown) 
abs(cor(res$sv[,1], unknown)) ## absolute value of cor(SV1, unknown factor) = 0.9995
abs(cor(res$sv[,1], known))   ## absolute value of cor(SV1, known factor) close to 0.7.

#plot(res.fast$sv[,1], unknown) 
abs(cor(res.fast$sv[,1], unknown)) ## absolute value of cor(SV1, unknown factor) = 0.9995
abs(cor(res.fast$sv[,1], known))   ## absolute value of cor(SV1, known factor) close to 0.7.


# Compare to just naively taking the first PC of the residual matrix.
library(limma) 
resid <- removeBatchEffect(log(assay(se)+1), covariates=known) 
pr.out <- prcomp(t(resid), rank.=1) 
#plot(pr.out$x[,1], unknown) 
abs(cor(pr.out$x[,1], unknown)) # cor = 0.7 
abs(cor(pr.out$x[,1], known)) # close to zero. 


# Compare to SVA
library(sva)
mod1 <- model.matrix(~known)
mod0 <- cbind(mod1[,1])
sva.res = svaseq(counts,mod1,mod0)$sv
#plot(sva.res[,1], unknown)
abs(cor(sva.res[,1], unknown)) # cor = 0.7 
abs(cor(sva.res[,1], known)) # close to zero.

hf.mat4 <- cbind(known, unknown, res$sv[,1], pr.out$x[,1], sva.res[,1])
colnames(hf.mat4) <- c("KF","HF","IASVA","PC1_Residual","SVA")
corrplot.mixed(abs(cor(hf.mat4)), tl.pos="lt")

More challenging scenario.

Here, we simulated the known and hidden factors to affect only 100 (5%) and 20 (1%) genes respectively. We simulated the 20 genes affected by the hidden factor to be also affected by the known factor. Similarly, the two factors are highly correlated (r=0.7).

# Setting up the known and unknown factors. 
set.seed(10000) 

# to reduce the computational burden, we reduced the sample size (# of cells) and # of genes.
sample.size <- 100 
ngenes <- 2000     

# CORRECTEION: Here, we simulate factors with two levels (-1 or 1).
factor.prop <- 0.5
flip.prob <- 0.8 # to simulate high correlation btw known and hidden factors
known <- c(rep(-1,each=sample.size*factor.prop),rep(1,each=sample.size-(sample.size*factor.prop)))
coinflip = rbinom(sample.size,size=1,prob=flip.prob)
unknown = known*coinflip + -known*(1-coinflip)
cor(known, unknown) # correlation between known and unknown factors = 0.7

# Generating Poisson count data from log-normal means. 
# CORRECTION: to make factor with stronger effect size, we use 2 as SD here.
known.effect <- rnorm(ngenes, sd=2) 
unknown.effect <- rnorm(ngenes, sd=2)

# To make estimation of the hidden factor more challenging, here we select 1900 (95%) and 1980 (1%) genes from each effect size vector and set their effect sizes as 0. That is, 5% and 1% of genes are affected by the known and hidden factors, respectively. All 20 genes affected by the hidden factor are affected by the known factor. 

known.effect[1:1900] <- 0
unknown.effect[1:1980] <- 0

mat <- outer(known.effect, known) + outer(unknown.effect, unknown) + 2 # to get decent-sized counts 
counts <- matrix(rpois(length(mat), lambda=2^mat), nrow=ngenes)
se <- SummarizedExperiment(assays=counts) 

# Running IA-SVA, version 0.99.3. 
library(iasva) 
design <- model.matrix(~known) 

# CORRECTION: IASVA doesn't accept the constant term of design matrix, so we use design[,-1] instead of design
res <- iasva(se, as.matrix(design[,-1]), threads=8) # iasva detected only one hidden factor.
# fast_iasva used here.
res.fast <- fast_iasva(se, as.matrix(design[,-1]), pct.cutoff = 2)  # fast_iasva detected only one hidden factor.

#plot(res$sv[,1], unknown) 
abs(cor(res$sv[,1], unknown)) ## absolute value of cor(SV1, unknown factor) = 0.9995
abs(cor(res$sv[,1], known))   ## absolute value of cor(SV1, known factor) close to 0.7.

#plot(res.fast$sv[,1], unknown) 
abs(cor(res.fast$sv[,1], unknown)) ## absolute value of cor(SV1, unknown factor) = 0.9995
abs(cor(res.fast$sv[,1], known))   ## absolute value of cor(SV1, known factor) close to 0.7.


# Compare to just naively taking the first PC of the residual matrix.
library(limma) 
resid <- removeBatchEffect(log(assay(se)+1), covariates=known) 
pr.out <- prcomp(t(resid), rank.=1) 
#plot(pr.out$x[,1], unknown) 
abs(cor(pr.out$x[,1], unknown)) # cor = 0.7 
abs(cor(pr.out$x[,1], known)) # close to zero. 


# Compare to SVA
library(sva)
mod1 <- model.matrix(~known)
mod0 <- cbind(mod1[,1])
sva.res = svaseq(counts,mod1,mod0)$sv
#plot(sva.res[,1], unknown)
abs(cor(sva.res[,1], unknown)) # cor = 0.7 
abs(cor(sva.res[,1], known)) # close to zero.

hf.mat5 <- cbind(known, unknown, res$sv[,1], pr.out$x[,1], sva.res[,1])
colnames(hf.mat5) <- c("KF","HF","IASVA","PC1_Residual","SVA")
corrplot.mixed(abs(cor(hf.mat5)), tl.pos="lt")

#par(oma = c(0, 2, 2, 2))
pdf("output/FigureL2.pdf", width=6,height=9)
par(mfrow=c(3,2))
corrplot.mixed(abs(cor(hf.mat1)), tl.pos="lt")
corrplot.mixed(abs(cor(hf.mat2)), tl.pos="lt")
corrplot.mixed(abs(cor(hf.mat3)), tl.pos="lt")
corrplot.mixed(abs(cor(hf.mat4)), tl.pos="lt")
corrplot.mixed(abs(cor(hf.mat5)), tl.pos="lt")
dev.off()