To make simulation studies more objective, we used a simulation design suggested by the author of svaseq (http://jtleek.com/svaseq/simulateData.html). We slightly modified the original design to simulate realistic single cell RNA sequencing (scRNA-seq) data sets (read counts with a high dropout rate) and multiple correlated hidden factors. We simulated baseline scRNA-seq data using the Polyester R package (https://bioconductor.org/packages/release/bioc/html/polyester.html) by using the zero-inflated negative binomial distribution parameters estimated from a real-world scRNA-seq data obtained from human pancreatic islet samples (Lawlor et. al., 2016). The islet scRNA-seq dataset is included in a R data package ("iasvaExamples") containing data examples for IA-SVA (https://github.com/dleelab/iasvaExamples). To install the 'iasvaExamples' package, follow the instruction provided in the GitHub page.
rm(list=ls()) library(iasva) library(iasvaExamples) library(polyester) library(sva) library(corrplot) library(DescTools) #pcc i.e., Pearson's contingency coefficient library(RColorBrewer) library(SummarizedExperiment) color.vec <- brewer.pal(8, "Set1")
data("Lawlor_Islet_scRNAseq_Read_Counts") data("Lawlor_Islet_scRNAseq_Annotations") ls() counts <- Lawlor_Islet_scRNAseq_Read_Counts anns <- Lawlor_Islet_scRNAseq_Annotations dim(anns) dim(counts) summary(anns) ContCoef(table(anns$Gender, anns$Cell_Type)) ContCoef(table(anns$Phenotype, anns$Cell_Type)) ContCoef(table(anns$Race, anns$Cell_Type)) ContCoef(table(anns$Patient_ID, anns$Cell_Type)) ContCoef(table(anns$Batch, anns$Cell_Type))
plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=color.vec[2])
## Estimate the zero inflated negative binomial parameters #set.seed(12345) set.seed(2018) params = get_params(counts)
sample.size <- 50 num.genes <- 10000 prop.kfactor.genes <- 0.1 #known factor prop.hfactor1.genes <- 0.3 #hidden factor1 prop.hfactor2.genes <- 0.2 #hidden factor2 prop.hfactor3.genes <- 0.1 #hidden factor3 num.kfactor.genes <- num.genes*prop.kfactor.genes num.hfactor1.genes <- num.genes*prop.hfactor1.genes num.hfactor2.genes <- num.genes*prop.hfactor2.genes num.hfactor3.genes <- num.genes*prop.hfactor3.genes factor.prop <- 0.5 flip.prob <- 0.8 # high corrleation between factors #flip.prob <- 0.5 # row correlation between factors # make first 10% of genes affected by the known factor. kfactor = c(rep(-1,each=sample.size*factor.prop),rep(1,each=sample.size-(sample.size*factor.prop))) coinflip = rbinom(sample.size,size=1,prob=flip.prob) hfactor1 = kfactor*coinflip + -kfactor*(1-coinflip) coinflip = rbinom(sample.size,size=1,prob=flip.prob) hfactor2 = kfactor*coinflip + -kfactor*(1-coinflip) coinflip = rbinom(sample.size,size=1,prob=flip.prob) hfactor3 = kfactor*coinflip + -kfactor*(1-coinflip) corrplot.mixed(cor(cbind(kfactor,hfactor1,hfactor2,hfactor3))) hfactor.mat <- cbind(hfactor1,hfactor2,hfactor3) kfcoeffs = c(rnorm(num.kfactor.genes),rep(0,num.genes-num.kfactor.genes)) nullindex= (num.kfactor.genes+1):num.genes hfcoeffs1 <- rep(0,num.genes) hfcoeffs2 <- rep(0,num.genes) hfcoeffs3 <- rep(0,num.genes) ## genes affected by each hidden factor is randomly selected. hfcoeffs1[sample(1:num.genes, num.hfactor1.genes)] <- rnorm(num.hfactor1.genes, sd=1) hfcoeffs2[sample(1:num.genes, num.hfactor2.genes)] <- rnorm(num.hfactor2.genes, sd=1) hfcoeffs3[sample(1:num.genes, num.hfactor3.genes)] <- rnorm(num.hfactor3.genes, sd=1) coeffs = cbind(hfcoeffs1, hfcoeffs2, hfcoeffs3, kfcoeffs) controls = ((hfcoeffs1!=0)|(hfcoeffs2!=0)|(hfcoeffs3!=0))&(kfcoeffs==0) par(mfrow=c(5,1)) plot(kfcoeffs) plot(hfcoeffs1) plot(hfcoeffs2) plot(hfcoeffs3) plot(controls) par(mfrow=c(1,1)) mod = model.matrix(~-1 + hfactor1 + hfactor2 + hfactor3 + kfactor) dat0 = create_read_numbers(params$mu,params$fit, params$p0,beta=coeffs,mod=mod) sum(dat0==0)/length(dat0) filter = apply(dat0, 1, function(x) length(x[x>5])>=3) dat0 = dat0[filter,] sum(dat0==0)/length(dat0) controls <- controls[filter] dim(dat0) dim(mod)
## set null and alternative models mod1 = model.matrix(~kfactor) mod0 = cbind(mod1[,1]) ## Estimate hidden factors with unsuprvised SVA usva.res = svaseq(dat0,mod1,mod0)$sv cor(hfactor1, usva.res[,1]) cor(hfactor2, usva.res[,2]) cor(hfactor3, usva.res[,3]) ## Estimate hidden factors with supervised SVA ssva.res = svaseq(dat0,mod1,mod0,controls=controls)$sv cor(hfactor1, ssva.res[,1]) cor(hfactor2, ssva.res[,2]) cor(hfactor3, ssva.res[,3]) ## Estimate hidden factors with IA-SVA summ_exp <- SummarizedExperiment(assays = dat0) iasva.res <- iasva(summ_exp, as.matrix(mod1[,-1]), num.p = 20)$sv # the default number of permutations in SVA = 20 cor(hfactor1, iasva.res[,1]) cor(hfactor2, iasva.res[,2]) cor(hfactor3, iasva.res[,3]) hf1.mat <- cbind(hfactor1, usva.res[,1], ssva.res[,1], iasva.res[,1]) hf2.mat <- cbind(hfactor2, usva.res[,2], ssva.res[,2], iasva.res[,2]) hf3.mat <- cbind(hfactor3, usva.res[,3], ssva.res[,3], iasva.res[,3]) colnames(hf1.mat) <- c("HF1", "USVA", "SSVA", "IA-SVA") colnames(hf2.mat) <- c("HF2", "USVA", "SSVA", "IA-SVA") colnames(hf3.mat) <- c("HF3", "USVA", "SSVA", "IA-SVA") par(mfrow=c(2,2)) corrplot.mixed(abs(cor(hf1.mat)), tl.pos="lt") corrplot.mixed(abs(cor(hf2.mat)), tl.pos="lt") corrplot.mixed(abs(cor(hf3.mat)), tl.pos="lt")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.