In dleelab/iasvaExamples: iasva example data package

Install packages

#devtools
library(devtools)
#iasva
devtools::install_github("UcarLab/IA-SVA")
#iasvaExamples  
devtools::install_github("dleelab/iasvaExamples")

Load packages

rm(list=ls())
library(irlba) # partial SVD, the augmented implicitly restarted Lanczos bidiagonalization algorithm
library(iasva)
library(iasvaExamples)
library(sva)
library(polyester)
library(corrplot)
library(DescTools) #pcc i.e., Pearson's contingency coefficient
library(RColorBrewer)
library(reshape2)
library(ggplot2)
library(SummarizedExperiment)

color.vec <- brewer.pal(9, "Set1")

iasva.tmp <- function(Y, X, intercept=TRUE, num.sv=NULL, permute=TRUE, num.p=100, sig.cutoff= 0.05, threads=1, num.sv.permtest=NULL, tol=1e-10, verbose=FALSE){
  cat("IA-SVA running...")
  sv <- NULL
  pc.stat.obs <- NULL
  pval <- NULL
  wgt.mat <- NULL
  rsq <- NULL
  isv <- 0
  while(TRUE){
    if(!is.null(num.sv)){
      if(isv==num.sv){ 
        break
      }
    }
    iasva.res <- iasva.unit.tmp(Y, X, intercept, permute, num.p, threads, num.sv.permtest, tol, verbose)
    if(iasva.res$pval < sig.cutoff){
      sv <- cbind(sv, iasva.res$sv)
      pc.stat.obs <- cbind(pc.stat.obs, iasva.res$pc.stat.obs)
      pval <- c(pval, iasva.res$pval)
      wgt.mat <- cbind(wgt.mat, iasva.res$wgt)
      X <- cbind(X,iasva.res$sv)
    } else { break }
    isv <- isv+1
    cat(paste0("\nSV",isv, " Detected!"))
  }
  if (isv > 0) {
    colnames(sv) <- paste0("SV", 1:ncol(sv))
    cat(paste0("\n# of significant surrogate variables: ",length(pval)))
    return(list(sv=sv, pc.stat.obs=pc.stat.obs, pval=pval, n.sv=length(pval), wgt.mat=wgt.mat))
  } else {
    cat ("\nNo significant surrogate variables")
  }
}

iasva.unit.tmp <- function(Y, X, intercept=TRUE, permute=TRUE, num.p=100, threads=1, num.sv.permtest=NULL, tol=1e-10, verbose=FALSE){
  if(min(Y)<0){ Y <- Y + abs(min(Y)) }
  lY <- log(Y+1)
  if(intercept){
    fit <- .lm.fit(cbind(1,X), lY)
  } else {
    fit <- .lm.fit(X, lY)
  }
  resid <- resid(fit)
  tresid = t(resid)

  if(verbose) {cat("\n Perform SVD on residuals")}
  svd_pca <- irlba::irlba(tresid-rowMeans(tresid), 1, tol=tol)

  if(verbose) {cat("\n Regress residuals on PC1")}
  fit <- .lm.fit(cbind(1,svd_pca$v[,1]), resid)

  if(verbose) {cat("\n Get Rsq")}
  rsq.vec <- calc.rsq(resid, fit)

  if(verbose) {cat("\n Rsq 0-1 Normalization")}
  rsq.vec[is.na(rsq.vec)] <- min(rsq.vec, na.rm=TRUE)
  wgt <- (rsq.vec-min(rsq.vec))/(max(rsq.vec)-min(rsq.vec)) #0-1 normalization

  if(verbose) {cat("\n Obtain weighted log-transformed read counts")}
  tlY = t(lY)*wgt # weigh each row (gene) with respect to its Rsq value.

  if(verbose) {cat("\n Perform SVD on weighted log-transformed read counts")}
  sv <- irlba::irlba(tlY-rowMeans(tlY), 1, tol=tol)$v[,1]

  if(permute==TRUE){
    if(verbose) {cat("\n Assess the significance of the contribution of SV")}
    if(is.null(num.sv.permtest)){
      svd.res.obs <- svd(tresid - rowMeans(tresid))
    } else {
      svd.res.obs <- irlba::irlba(tresid-rowMeans(tresid), num.sv.permtest, tol=tol)
    }

    pc.stat.obs <- svd.res.obs$d[1]^2/sum(svd.res.obs$d^2)
    if(verbose) {cat("\n PC test statistic value:", pc.stat.obs)}

    # Generate an empirical null distribution of the PC test statistic.
    pc.stat.null.vec <- rep(0, num.p)
    permute.svd <- permute.svd.factory(lY, X, num.sv.permtest, tol, verbose)
    if (threads > 1) {
      threads <- min(threads, parallel::detectCores()-1)
      cl <- parallel::makeCluster(threads)
      pc.stat.null.vec <- tryCatch(parallel::parSapply(cl, 1:num.p, permute.svd), error=function(err){parallel::stopCluster(cl); stop(err)})
      parallel::stopCluster(cl)
    } else {
      pc.stat.null.vec <- sapply(1:num.p, permute.svd)
    }
    if(verbose) {cat("\n Empirical null distribution of the PC statistic:", sort(pc.stat.null.vec))}
    pval <- sum(pc.stat.obs <= pc.stat.null.vec)/(num.p+1)
    if(verbose) {cat("\n Permutation p-value:", pval)}
  } else {
    pc.stat.obs <- -1
    pval <- -1
  }
  return(list(sv=sv, pc.stat.obs=pc.stat.obs, pval=pval, wgt=wgt))
}

calc.rsq <- function(resid, fit) {
  RSS <- colSums(resid(fit)^2)
  TSS <- colSums(t(t(resid) - colSums(resid)/ncol(resid)) ^ 2)  # vectorized
  # TSS <- colSums(sweep(resid, 2, mean, "-") ^ 2)  # alt-2
  return(1-(RSS/(nrow(resid)-2))/(TSS/(nrow(resid)-1)))
}

permute.svd.factory <- function(lY, X, num.sv.permtest, tol, verbose) {

  permute.svd <- function(i) {
    permuted.lY <- apply(t(lY), 1, sample, replace=FALSE)
    tresid.null <- t(resid(.lm.fit(cbind(1,X), permuted.lY)))
    if(is.null(num.sv.permtest)) {
      svd.res.null <- svd(tresid.null)
    } else {
      svd.res.null <- irlba::irlba(tresid.null, num.sv.permtest, tol=tol)
    }
    return(svd.res.null$d[1]^2/sum(svd.res.null$d^2))
  }

  return(permute.svd)
}

Load the islet single cell RNA-Seq data

data("Lawlor_Islet_scRNAseq_Read_Counts")
data("Lawlor_Islet_scRNAseq_Annotations")
ls()
counts <- Lawlor_Islet_scRNAseq_Read_Counts
anns <- Lawlor_Islet_scRNAseq_Annotations
dim(anns)
dim(counts)

summary(anns)
ContCoef(table(anns$Gender, anns$Cell_Type))
ContCoef(table(anns$Phenotype, anns$Cell_Type))
ContCoef(table(anns$Race, anns$Cell_Type))
ContCoef(table(anns$Patient_ID, anns$Cell_Type))
ContCoef(table(anns$Batch, anns$Cell_Type))

Look at mean variance relationship

plot(rowMeans(log(counts+1)),rowVars(log(counts+1)),pch=19,col=color.vec[2])

Estimate zero inflated negative binomial parameters from the islet data

## Estimate the zero inflated negative binomial parameters
params = get_params(counts)

Generate a data set affected by two factors (known and unknown).

Here we compare unsupervised SVA, supervised SVA and IA-SVA

set.seed(1234) #5000
sample.size <- 50
num.genes <- 10000
prop.kfactor.genes <- 0.2    #known factor
prop.hfactor1.genes <- 0.1   #hidden factor1

num.kfactor.genes <- num.genes*prop.kfactor.genes
num.hfactor1.genes <- num.genes*prop.hfactor1.genes

factor.prop <- 0.5
kfactor = c(rep(-1,each=sample.size*factor.prop),rep(1,each=sample.size-(sample.size*factor.prop)))
coinflip = rbinom(sample.size,size=1,prob=0.8)
hfactor1 = kfactor*coinflip + -kfactor*(1-coinflip)

cor(cbind(kfactor,hfactor1))

hfactor.mat <- cbind(hfactor1)
kfcoeffs = c(rnorm(num.kfactor.genes),rep(0,num.genes-num.kfactor.genes))
nullindex= (num.kfactor.genes+1):num.genes

overlap.prop <- 0.99

#hfcoeffs1 = c(rep(0,num.kfactor.genes*(1-overlap.prop)),rnorm(num.hfactor1.genes,sd=1),rep(0,num.genes-num.kfactor.genes*(1-overlap.prop)-num.hfactor1.genes))
hfcoeffs1 = c(rep(0,num.kfactor.genes-num.hfactor1.genes*overlap.prop),rnorm(num.hfactor1.genes,sd=1),rep(0,num.genes-(num.kfactor.genes-num.hfactor1.genes*overlap.prop)-num.hfactor1.genes))

par(mfrow=c(2,1))
plot(kfcoeffs)
plot(hfcoeffs1)
par(mfrow=c(1,1))

coeffs = cbind(hfcoeffs1,kfcoeffs)
controls = (hfcoeffs1!=0)&(kfcoeffs==0)
mod = model.matrix(~-1 + hfactor1 + kfactor)

dat0 = create_read_numbers(params$mu,params$fit,
                                     params$p0,beta=coeffs,mod=mod)
sum(dat0==0)/length(dat0)
filter = apply(dat0, 1, function(x) length(x[x>5])>=2)
dat0 = dat0[filter,]
sum(dat0==0)/length(dat0)
controls <- controls[filter]

dim(dat0)
dim(mod)

Estimate hidden factor using IA-SVA

We also plot effect sizes of known and hidden factor and estimated weights (R-squared)

## Set null and alternative models
mod1 = model.matrix(~kfactor)
mod0 = cbind(mod1[,1])

### iasva 
hfactors_iasva.res <- iasva.tmp(t(dat0), kfactor, num.sv=1, permute=F)
hfactors_iasva <- hfactors_iasva.res$sv
cor(hfactor1,hfactors_iasva)
filter2 <- hfcoeffs1[filter]!=0

par(mfrow=c(3,1))
plot(kfcoeffs[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Known Factor Effect Size", cex=1.5)
plot(hfcoeffs1[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Hidden Factor Effect Size")
plot(hfactors_iasva.res$wgt.mat, xlim=c(0,2500), xlab="Gene Index", ylab="R squared")
par(mfrow=c(1,1))


pdf("output/GeneOverlap.Fig1.pdf", width = 5, height=8)
par(mfrow=c(3,1))
plot(kfcoeffs[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Known Factor Effect Size",cex.lab=1.5,cex.axis=1.5)
plot(hfcoeffs1[filter], xlim=c(0,2500), xlab="Gene Index", ylab="Hidden Factor Effect Size",cex.lab=1.5,cex.axis=1.5)
plot(hfactors_iasva.res$wgt.mat, xlim=c(0,2500), xlab="Gene Index", ylab="R squared",cex.lab=1.5,cex.axis=1.5)
dev.off()
par(mfrow=c(1,1))

R-squared (Weights) as a function of hidden and known factors' effect size

overlap.hfcoeffs1 <- hfcoeffs1[filter][filter2]
overlap.kfcoeffs <- kfcoeffs[filter][filter2]
overlap.R2 <- hfactors_iasva.res$wgt.mat[filter2]

summary(lm(overlap.R2~overlap.hfcoeffs1+overlap.kfcoeffs))

par(mfrow=c(2,1))
plot(overlap.hfcoeffs1, overlap.R2, xlab="Hidden Factor Effect Size", ylab="R squared")
plot(overlap.kfcoeffs, overlap.R2, xlab="Known Factor Effect Size", ylab="R squared")
par(mfrow=c(1,1))

pdf("output/GeneOverlap.Fig2.pdf", width = 5, height=8)
par(mfrow=c(2,1))
plot(overlap.hfcoeffs1, overlap.R2, xlab="Hidden Factor Effect Size", ylab="R squared")
plot(overlap.kfcoeffs, overlap.R2, xlab="Known Factor Effect Size", ylab="R squared")
dev.off()
par(mfrow=c(1,1))

Hidden factor estimation using existing methods

```{R alternative_hidden_factor_estimation, eval=TRUE}

Estimate batch with pca

ldat0 = log(dat0 + 1) hfactors_pca = svd(ldat0 - rowMeans(ldat0))$v[,1] cor(hfactor1,hfactors_pca)

Estimate batch with svaseq (unsupervised)

hfactors_usva = svaseq(dat0,mod1,mod0, n.sv=1)$sv cor(hfactor1,hfactors_usva)

Estimate batch with svaseq (supervised)

hfactors_ssva = svaseq(dat0,mod1,mod0,controls=controls, n.sv=1)$sv cor(hfactor1,hfactors_ssva)

## Plot hidden factor estimates of IA-SVA and existing methods
```r
## Plot the results
par(mfrow=c(5,1))
plot(hfactor1,col=color.vec[1],pch=19,main="Hidden Factor1")
plot(hfactors_iasva,pch=19,col=color.vec[2],main="IA-SVA")
plot(hfactors_pca,pch=19, col=color.vec[3],main="PCA")
plot(hfactors_usva,pch=19,col=color.vec[4],main="USVA")
plot(hfactors_ssva,pch=19,col=color.vec[5],main="SSVA")
par(mfrow=c(1,1))

Multiple simulations where hidden and known factors affect overlapping sets of genes

set.seed(4000)
sample.size <- 50
num.genes <- 10000
prop.kfactor.genes <- 0.2    #known factor
prop.hfactor1.genes <- 0.1   #hidden factor1

num.kfactor.genes <- num.genes*prop.kfactor.genes
num.hfactor1.genes <- num.genes*prop.hfactor1.genes

factor.prop <- 0.5
kfactor = c(rep(-1,each=sample.size*factor.prop),rep(1,each=sample.size-(sample.size*factor.prop)))
coinflip = rbinom(sample.size,size=1,prob=0.9)
hfactor1 = kfactor*coinflip + -kfactor*(1-coinflip)

cor(cbind(kfactor,hfactor1))

overlap.prop.vec <- c(0.99,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1,0)

data.list <- list()
controls.list <- list()

hfactor.mat <- cbind(hfactor1)
kfbetas <- rnorm(num.kfactor.genes)
hfbetas <- rnorm(num.hfactor1.genes)  

kfcoeffs = c(rnorm(num.kfactor.genes),rep(0,num.genes-num.kfactor.genes))
nullindex= (num.kfactor.genes+1):num.genes

pdf("output/gene_overlap.pdf", width = 10, height=5)
for(i in 1:length(overlap.prop.vec)){

  hfcoeffs1 = c(rep(0,num.kfactor.genes-num.hfactor1.genes*overlap.prop.vec[i]),hfbetas,rep(0,num.genes-(num.kfactor.genes-num.hfactor1.genes*overlap.prop.vec[i])-num.hfactor1.genes))

  coeffs = cbind(hfcoeffs1,kfcoeffs)
  controls = (hfcoeffs1!=0)&(kfcoeffs==0)
  mod = model.matrix(~-1 + hfactor1 + kfactor)

  dat0 = create_read_numbers(params$mu,params$fit,
                                       params$p0,beta=coeffs,mod=mod)
  sum(dat0==0)/length(dat0)
  filter = apply(dat0, 1, function(x) length(x[x>5])>=2)
  dat0 = dat0[filter,]
  sum(dat0==0)/length(dat0)
  controls.list[[i]] <- controls[filter]

  #dim(dat0)
  #dim(mod)
  data.list[[i]] <- dat0
  par(mfrow=c(2,1))
  print(plot(kfcoeffs))
  print(plot(hfcoeffs1))
}
dev.off()

Estimation of hidden factors using IA-SVA and existing methods

## Set null and alternative models
mod1 = model.matrix(~kfactor)
mod0 = cbind(mod1[,1])
hfactors_iasva_list <- list()
hfactors_pca_list <- list()
hfactors_usva_list <- list()
hfactors_ssva_list <- list()
for(i in 1:length(overlap.prop.vec)){
  dat0 <- data.list[[i]]
  ### iasva 
  summ_exp <- SummarizedExperiment(assays = dat0)
  hfactors_iasva_list[[i]] <- iasva(summ_exp, as.matrix(kfactor), num.sv=1, permute = F)$sv
  ## Estimate batch with pca
  ldat0 = log(dat0 + 1)
  hfactors_pca_list[[i]] = svd(ldat0 - rowMeans(ldat0))$v[,1]
  ## Estimate batch with svaseq (unsupervised)
  hfactors_usva_list[[i]] = svaseq(dat0,mod1,mod0,n.sv=1)$sv[,1]
  ## Estimate batch with svaseq (supervised)
  hfactors_ssva_list[[i]] = svaseq(dat0,mod1,mod0,n.sv=1,controls=controls.list[[i]])$sv[,1]
  cat(i,"\n")
}

Compare the accuracy

iasva.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_iasva_list))))
pca.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_pca_list))))
usva.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_usva_list))))
ssva.cor.vec <- as.vector(abs(cor(hfactor1, do.call(cbind, hfactors_ssva_list))))

cor.df <- data.frame(overlap.pct = 100*overlap.prop.vec,
                     IASVA = iasva.cor.vec,
                     PCA = pca.cor.vec,
                     USVA = usva.cor.vec,
                     SSVA = ssva.cor.vec)

melt.cor.df <- melt(cor.df, id.var=c("overlap.pct"))

p <- ggplot(melt.cor.df, aes(x=overlap.pct, y=value, group=variable))
p <- p + geom_line(aes(col=variable), size=1)
p <- p + geom_point(aes(col=variable), size=3)
p <- p + xlab("Gene Set Overlap Percentage")
p <- p + ylab("Absolute Correlation Coefficient")
#p <- p + scale_color_manual(values=color.vec)
p <- p + scale_color_brewer(palette = "Spectral", name="Method")
p <- p + theme_bw()
p <- p + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
               #panel.grid.minor = element_blank(), 
               axis.line = element_line(colour = "black"))
p

ggsave("output/absCor_gene_overlap_pct.pdf", width=8, height=5)

dleelab/iasvaExamples documentation built on Aug. 16, 2022, 4:21 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

dleelab/iasvaExamples
iasva example data package

In dleelab/iasvaExamples: iasva example data package

Install packages

Load packages

Load the islet single cell RNA-Seq data

Look at mean variance relationship

Estimate zero inflated negative binomial parameters from the islet data

Generate a data set affected by two factors (known and unknown).

Estimate hidden factor using IA-SVA

R-squared (Weights) as a function of hidden and known factors' effect size

Hidden factor estimation using existing methods

Estimate batch with pca

Estimate batch with svaseq (unsupervised)

Estimate batch with svaseq (supervised)

Multiple simulations where hidden and known factors affect overlapping sets of genes

Estimation of hidden factors using IA-SVA and existing methods

Compare the accuracy

R Package Documentation

Browse R Packages

We want your feedback!

dleelab/iasvaExamples iasva example data package

In dleelab/iasvaExamples: iasva example data package

Install packages

Load packages

Load the islet single cell RNA-Seq data

Look at mean variance relationship

Estimate zero inflated negative binomial parameters from the islet data

Generate a data set affected by two factors (known and unknown).

Estimate hidden factor using IA-SVA

R-squared (Weights) as a function of hidden and known factors' effect size

Hidden factor estimation using existing methods

Estimate batch with pca

Estimate batch with svaseq (unsupervised)

Estimate batch with svaseq (supervised)

Multiple simulations where hidden and known factors affect overlapping sets of genes

Estimation of hidden factors using IA-SVA and existing methods

Compare the accuracy

R Package Documentation

Browse R Packages

We want your feedback!

dleelab/iasvaExamples
iasva example data package