#devtools library(devtools) #iasva devtools::install_github("UcarLab/IA-SVA") #iasvaExamples devtools::install_github("dleelab/iasvaExamples")
rm(list=ls()) library(irlba) # partial SVD, the augmented implicitly restarted Lanczos bidiagonalization algorithm library(iasva) library(iasvaExamples) library(sva) library(polyester) library(corrplot) library(DescTools) #pcc i.e., Pearson's contingency coefficient library(RColorBrewer) library(reshape2) library(ggplot2) library(SummarizedExperiment) color.vec <- brewer.pal(9, "Set1") iasva.tmp <- function(Y, X, intercept=TRUE, num.sv=NULL, permute=TRUE, num.p=100, sig.cutoff= 0.05, threads=1, num.sv.permtest=NULL, tol=1e-10, verbose=FALSE){ cat("IA-SVA running...") sv <- NULL pc.stat.obs <- NULL pval <- NULL wgt.mat <- NULL rsq <- NULL isv <- 0 while(TRUE){ if(!is.null(num.sv)){ if(isv==num.sv){ break } } iasva.res <- iasva.unit.tmp(Y, X, intercept, permute, num.p, threads, num.sv.permtest, tol, verbose) if(iasva.res$pval < sig.cutoff){ sv <- cbind(sv, iasva.res$sv) pc.stat.obs <- cbind(pc.stat.obs, iasva.res$pc.stat.obs) pval <- c(pval, iasva.res$pval) wgt.mat <- cbind(wgt.mat, iasva.res$wgt) X <- cbind(X,iasva.res$sv) } else { break } isv <- isv+1 cat(paste0("\nSV",isv, " Detected!")) } if (isv > 0) { colnames(sv) <- paste0("SV", 1:ncol(sv)) cat(paste0("\n# of significant surrogate variables: ",length(pval))) return(list(sv=sv, pc.stat.obs=pc.stat.obs, pval=pval, n.sv=length(pval), wgt.mat=wgt.mat)) } else { cat ("\nNo significant surrogate variables") } } iasva.unit.tmp <- function(Y, X, intercept=TRUE, permute=TRUE, num.p=100, threads=1, num.sv.permtest=NULL, tol=1e-10, verbose=FALSE){ if(min(Y)<0){ Y <- Y + abs(min(Y)) } lY <- log(Y+1) if(intercept){ fit <- .lm.fit(cbind(1,X), lY) } else { fit <- .lm.fit(X, lY) } resid <- resid(fit) tresid = t(resid) if(verbose) {cat("\n Perform SVD on residuals")} svd_pca <- irlba::irlba(tresid-rowMeans(tresid), 1, tol=tol) if(verbose) {cat("\n Regress residuals on PC1")} fit <- .lm.fit(cbind(1,svd_pca$v[,1]), resid) if(verbose) {cat("\n Get Rsq")} rsq.vec <- calc.rsq(resid, fit) if(verbose) {cat("\n Rsq 0-1 Normalization")} rsq.vec[is.na(rsq.vec)] <- min(rsq.vec, na.rm=TRUE) wgt <- (rsq.vec-min(rsq.vec))/(max(rsq.vec)-min(rsq.vec)) #0-1 normalization if(verbose) {cat("\n Obtain weighted log-transformed read counts")} tlY = t(lY)*wgt # weigh each row (gene) with respect to its Rsq value. if(verbose) {cat("\n Perform SVD on weighted log-transformed read counts")} sv <- irlba::irlba(tlY-rowMeans(tlY), 1, tol=tol)$v[,1] if(permute==TRUE){ if(verbose) {cat("\n Assess the significance of the contribution of SV")} if(is.null(num.sv.permtest)){ svd.res.obs <- svd(tresid - rowMeans(tresid)) } else { svd.res.obs <- irlba::irlba(tresid-rowMeans(tresid), num.sv.permtest, tol=tol) } pc.stat.obs <- svd.res.obs$d[1]^2/sum(svd.res.obs$d^2) if(verbose) {cat("\n PC test statistic value:", pc.stat.obs)} # Generate an empirical null distribution of the PC test statistic. pc.stat.null.vec <- rep(0, num.p) permute.svd <- permute.svd.factory(lY, X, num.sv.permtest, tol, verbose) if (threads > 1) { threads <- min(threads, parallel::detectCores()-1) cl <- parallel::makeCluster(threads) pc.stat.null.vec <- tryCatch(parallel::parSapply(cl, 1:num.p, permute.svd), error=function(err){parallel::stopCluster(cl); stop(err)}) parallel::stopCluster(cl) } else { pc.stat.null.vec <- sapply(1:num.p, permute.svd) } if(verbose) {cat("\n Empirical null distribution of the PC statistic:", sort(pc.stat.null.vec))} pval <- sum(pc.stat.obs <= pc.stat.null.vec)/(num.p+1) if(verbose) {cat("\n Permutation p-value:", pval)} } else { pc.stat.obs <- -1 pval <- -1 } return(list(sv=sv, pc.stat.obs=pc.stat.obs, pval=pval, wgt=wgt)) } calc.rsq <- function(resid, fit) { RSS <- colSums(resid(fit)^2) TSS <- colSums(t(t(resid) - colSums(resid)/ncol(resid)) ^ 2) # vectorized # TSS <- colSums(sweep(resid, 2, mean, "-") ^ 2) # alt-2 return(1-(RSS/(nrow(resid)-2))/(TSS/(nrow(resid)-1))) } permute.svd.factory <- function(lY, X, num.sv.permtest, tol, verbose) { permute.svd <- function(i) { permuted.lY <- apply(t(lY), 1, sample, replace=FALSE) tresid.null <- t(resid(.lm.fit(cbind(1,X), permuted.lY))) if(is.null(num.sv.permtest)) { svd.res.null <- svd(tresid.null) } else { svd.res.null <- irlba::irlba(tresid.null, num.sv.permtest, tol=tol) } return(svd.res.null$d[1]^2/sum(svd.res.null$d^2)) } return(permute.svd) }
data("Lawlor_Islet_scRNAseq_Read_Counts") data("Lawlor_Islet_scRNAseq_Annotations") ls() counts <- Lawlor_Islet_scRNAseq_Read_Counts anns <- Lawlor_Islet_scRNAseq_Annotations dim(anns) dim(counts) summary(anns) ContCoef(table(anns$Gender, anns$Cell_Type)) ContCoef(table(anns$Phenotype, anns$Cell_Type)) ContCoef(table(anns$Race, anns$Cell_Type)) ContCoef(table(anns$Patient_ID, anns$Cell_Type)) ContCoef(table(anns$Batch, anns$Cell_Type))
plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=color.vec[2])
## Estimate the zero inflated negative binomial parameters params = get_params(counts)
Here we compare unsupervised SVA, supervised SVA and IA-SVA
set.seed(1234) #5000 sample.size <- 50 num.genes <- 10000 prop.kfactor.genes <- 0.2 #known factor prop.hfactor1.genes <- 0.1 #hidden factor1 num.kfactor.genes <- num.genes*prop.kfactor.genes num.hfactor1.genes <- num.genes*prop.hfactor1.genes factor.prop <- 0.5 kfactor = c(rep(-1,each=sample.size*factor.prop),rep(1,each=sample.size-(sample.size*factor.prop))) coinflip = rbinom(sample.size,size=1,prob=0.8) hfactor1 = kfactor*coinflip + -kfactor*(1-coinflip) cor(cbind(kfactor,hfactor1)) hfactor.mat <- cbind(hfactor1) kfcoeffs = c(rnorm(num.kfactor.genes),rep(0,num.genes-num.kfactor.genes)) nullindex= (num.kfactor.genes+1):num.genes overlap.prop <- 0.99 #hfcoeffs1 = c(rep(0,num.kfactor.genes*(1-overlap.prop)),rnorm(num.hfactor1.genes,sd=1),rep(0,num.genes-num.kfactor.genes*(1-overlap.prop)-num.hfactor1.genes)) hfcoeffs1 = c(rep(0,num.kfactor.genes-num.hfactor1.genes*overlap.prop),rnorm(num.hfactor1.genes,sd=1),rep(0,num.genes-(num.kfactor.genes-num.hfactor1.genes*overlap.prop)-num.hfactor1.genes)) par(mfrow=c(2,1)) plot(kfcoeffs) plot(hfcoeffs1) par(mfrow=c(1,1)) coeffs = cbind(hfcoeffs1,kfcoeffs) controls = (hfcoeffs1!=0)&(kfcoeffs==0) mod = model.matrix(~-1 + hfactor1 + kfactor) dat0 = create_read_numbers(params$mu,params$fit, params$p0,beta=coeffs,mod=mod) sum(dat0==0)/length(dat0) filter = apply(dat0, 1, function(x) length(x[x>5])>=2) dat0 = dat0[filter,] sum(dat0==0)/length(dat0) controls <- controls[filter] dim(dat0) dim(mod)
We also plot effect sizes of known and hidden factor and estimated weights (R-squared)
## Set null and alternative models mod1 = model.matrix(~kfactor) mod0 = cbind(mod1[,1]) ### iasva hfactors_iasva.res <- iasva.tmp(t(dat0), kfactor, num.sv=1, permute=F) hfactors_iasva <- hfactors_iasva.res$sv cor(hfactor1,hfactors_iasva) filter2 <- hfcoeffs1[filter]!=0 par(mfrow=c(3,1)) plot(kfcoeffs[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Known Factor Effect Size", cex=1.5) plot(hfcoeffs1[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Hidden Factor Effect Size") plot(hfactors_iasva.res$wgt.mat, xlim=c(0,2500), xlab="Gene Index", ylab="R squared") par(mfrow=c(1,1)) pdf("output/GeneOverlap.Fig1.pdf", width = 5, height=8) par(mfrow=c(3,1)) plot(kfcoeffs[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Known Factor Effect Size",cex.lab=1.5,cex.axis=1.5) plot(hfcoeffs1[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Hidden Factor Effect Size",cex.lab=1.5,cex.axis=1.5) plot(hfactors_iasva.res$wgt.mat, xlim=c(0,2500), xlab="Gene Index", ylab="R squared",cex.lab=1.5,cex.axis=1.5) dev.off() par(mfrow=c(1,1))
overlap.hfcoeffs1 <- hfcoeffs1[filter][filter2] overlap.kfcoeffs <- kfcoeffs[filter][filter2] overlap.R2 <- hfactors_iasva.res$wgt.mat[filter2] summary(lm(overlap.R2~overlap.hfcoeffs1+overlap.kfcoeffs)) par(mfrow=c(2,1)) plot(overlap.hfcoeffs1, overlap.R2, xlab="Hidden Factor Effect Size", ylab="R squared") plot(overlap.kfcoeffs, overlap.R2, xlab="Known Factor Effect Size", ylab="R squared") par(mfrow=c(1,1)) pdf("output/GeneOverlap.Fig2.pdf", width = 5, height=8) par(mfrow=c(2,1)) plot(overlap.hfcoeffs1, overlap.R2, xlab="Hidden Factor Effect Size", ylab="R squared") plot(overlap.kfcoeffs, overlap.R2, xlab="Known Factor Effect Size", ylab="R squared") dev.off() par(mfrow=c(1,1))
```{R alternative_hidden_factor_estimation, eval=TRUE}
ldat0 = log(dat0 + 1) hfactors_pca = svd(ldat0 - rowMeans(ldat0))$v[,1] cor(hfactor1,hfactors_pca)
hfactors_usva = svaseq(dat0,mod1,mod0, n.sv=1)$sv cor(hfactor1,hfactors_usva)
hfactors_ssva = svaseq(dat0,mod1,mod0,controls=controls, n.sv=1)$sv cor(hfactor1,hfactors_ssva)
## Plot hidden factor estimates of IA-SVA and existing methods ```r ## Plot the results par(mfrow=c(5,1)) plot(hfactor1,col=color.vec[1],pch=19,main="Hidden Factor1") plot(hfactors_iasva,pch=19,col=color.vec[2],main="IA-SVA") plot(hfactors_pca,pch=19, col=color.vec[3],main="PCA") plot(hfactors_usva,pch=19,col=color.vec[4],main="USVA") plot(hfactors_ssva,pch=19,col=color.vec[5],main="SSVA") par(mfrow=c(1,1))
set.seed(4000) sample.size <- 50 num.genes <- 10000 prop.kfactor.genes <- 0.2 #known factor prop.hfactor1.genes <- 0.1 #hidden factor1 num.kfactor.genes <- num.genes*prop.kfactor.genes num.hfactor1.genes <- num.genes*prop.hfactor1.genes factor.prop <- 0.5 kfactor = c(rep(-1,each=sample.size*factor.prop),rep(1,each=sample.size-(sample.size*factor.prop))) coinflip = rbinom(sample.size,size=1,prob=0.9) hfactor1 = kfactor*coinflip + -kfactor*(1-coinflip) cor(cbind(kfactor,hfactor1)) overlap.prop.vec <- c(0.99,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1,0) data.list <- list() controls.list <- list() hfactor.mat <- cbind(hfactor1) kfbetas <- rnorm(num.kfactor.genes) hfbetas <- rnorm(num.hfactor1.genes) kfcoeffs = c(rnorm(num.kfactor.genes),rep(0,num.genes-num.kfactor.genes)) nullindex= (num.kfactor.genes+1):num.genes pdf("output/gene_overlap.pdf", width = 10, height=5) for(i in 1:length(overlap.prop.vec)){ hfcoeffs1 = c(rep(0,num.kfactor.genes-num.hfactor1.genes*overlap.prop.vec[i]),hfbetas,rep(0,num.genes-(num.kfactor.genes-num.hfactor1.genes*overlap.prop.vec[i])-num.hfactor1.genes)) coeffs = cbind(hfcoeffs1,kfcoeffs) controls = (hfcoeffs1!=0)&(kfcoeffs==0) mod = model.matrix(~-1 + hfactor1 + kfactor) dat0 = create_read_numbers(params$mu,params$fit, params$p0,beta=coeffs,mod=mod) sum(dat0==0)/length(dat0) filter = apply(dat0, 1, function(x) length(x[x>5])>=2) dat0 = dat0[filter,] sum(dat0==0)/length(dat0) controls.list[[i]] <- controls[filter] #dim(dat0) #dim(mod) data.list[[i]] <- dat0 par(mfrow=c(2,1)) print(plot(kfcoeffs)) print(plot(hfcoeffs1)) } dev.off()
## Set null and alternative models mod1 = model.matrix(~kfactor) mod0 = cbind(mod1[,1]) hfactors_iasva_list <- list() hfactors_pca_list <- list() hfactors_usva_list <- list() hfactors_ssva_list <- list() for(i in 1:length(overlap.prop.vec)){ dat0 <- data.list[[i]] ### iasva summ_exp <- SummarizedExperiment(assays = dat0) hfactors_iasva_list[[i]] <- iasva(summ_exp, as.matrix(kfactor), num.sv=1, permute = F)$sv ## Estimate batch with pca ldat0 = log(dat0 + 1) hfactors_pca_list[[i]] = svd(ldat0 - rowMeans(ldat0))$v[,1] ## Estimate batch with svaseq (unsupervised) hfactors_usva_list[[i]] = svaseq(dat0,mod1,mod0,n.sv=1)$sv[,1] ## Estimate batch with svaseq (supervised) hfactors_ssva_list[[i]] = svaseq(dat0,mod1,mod0,n.sv=1,controls=controls.list[[i]])$sv[,1] cat(i,"\n") }
iasva.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_iasva_list)))) pca.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_pca_list)))) usva.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_usva_list)))) ssva.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_ssva_list)))) cor.df <- data.frame(overlap.pct = 100*overlap.prop.vec, IASVA = iasva.cor.vec, PCA = pca.cor.vec, USVA = usva.cor.vec, SSVA = ssva.cor.vec) melt.cor.df <- melt(cor.df, id.var=c("overlap.pct")) p <- ggplot(melt.cor.df, aes(x=overlap.pct, y=value, group=variable)) p <- p + geom_line(aes(col=variable), size=1) p <- p + geom_point(aes(col=variable), size=3) p <- p + xlab("Gene Set Overlap Percentage") p <- p + ylab("Absolute Correlation Coefficient") #p <- p + scale_color_manual(values=color.vec) p <- p + scale_color_brewer(palette = "Spectral", name="Method") p <- p + theme_bw() p <- p + theme(panel.border = element_blank(), panel.grid.major = element_blank(), #panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) p ggsave("output/absCor_gene_overlap_pct.pdf", width=8, height=5)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.