variancePartition.R
In variancePartition: Quantify and interpret divers of variation in multilevel gene expression experiments

## ----knitr, echo=FALSE, results='hide'----------------------------------------
library("knitr")
opts_chunk$set(tidy=FALSE,dev="pdf",fig.show="hide",
               fig.width=4,fig.height=4.5,
               message=FALSE, warning=FALSE)


## ----style, eval=TRUE, echo=FALSE, results="asis"---------------------------------------
BiocStyle::latex(width=90)

## ----options, results="hide", echo=FALSE--------------------------------------
options(digits=3, width=80, prompt=" ", continue=" ")

## ----simResult, cache=TRUE, results='hide', fig.height=4, fig.width=4---------
# load library
library('variancePartition')

# load simulated data:
# geneExpr: matrix of gene expression values
# info: information/metadata about each sample
data(varPartData)

# Specify variables to consider
# Age is continuous so model it as a fixed effect
# Individual and Tissue are both categorical, 
# so model them as random effects
# Note the syntax used to specify random effects
form <- ~ Age + (1|Individual) + (1|Tissue) + (1|Batch) 

# Fit model and extract results
# 1) fit linear mixed model on gene expression
# If categorical variables are specified, 
#     a linear mixed model is used
# If all variables are modeled as fixed effects, 
#		a linear model is used
# each entry in results is a regression model fit on a single gene
# 2) extract variance fractions from each model fit
# for each gene, returns fraction of variation attributable 
#		to each variable 
# Interpretation: the variance explained by each variables 
# after correcting for all other variables
# Note that geneExpr can either be a matrix, 
# and EList output by voom() in the limma package, 
# or an ExpressionSet
varPart <- fitExtractVarPartModel( geneExpr, form, info )

# sort variables (i.e. columns) by median fraction 
#		of variance explained
vp <- sortCols( varPart )

# Figure 1a
# Bar plot of variance fractions for the first 10 genes
plotPercentBars( vp[1:10,] )

#
# Figure 1b
# violin plot of contribution of each variable to total variance
plotVarPart( vp )

## ----accessResults, cache=TRUE, warning=FALSE---------------------------------
# Access first entries
head(varPart)

# Access first entries for Individual
head(varPart$Individual)

# sort genes based on variance explained by Individual
head(varPart[order(varPart$Individual, decreasing=TRUE),])

## ----savePlot, cache=TRUE, eval=FALSE-----------------------------------------
#  fig <- plotVarPart( vp )
#  ggsave(file, fig)

## ----plotStratify, cache=TRUE, warning=FALSE, fig.height=4, fig.width=4-------
# get gene with the highest variation across Tissues
# create data.frame with expression of gene i and Tissue 
#		type for each sample
i <- which.max( varPart$Tissue )
GE <- data.frame( Expression = geneExpr[i,], Tissue = info$Tissue)

# Figure 2a 
# plot expression stratified by Tissue 
plotStratify( Expression ~ Tissue, GE, main=rownames(geneExpr)[i])
#
# get gene with the highest variation across Individuals
# create data.frame with expression of gene i and Tissue 
#		type for each sample
i <- which.max( varPart$Individual )
GE <- data.frame( Expression = geneExpr[i,], 
	Individual = info$Individual)

# Figure 2b
# plot expression stratified by Tissue 
label <- paste("Individual:", format(varPart$Individual[i]*100, 
	digits=3), "%")
main <- rownames(geneExpr)[i]
plotStratify(  Expression ~ Individual, GE, colorBy=NULL, 
	text=label, main=main)

## ----cache=TRUE---------------------------------------------------------------
library('lme4')

# fit regression model for the first gene 
form_test <- geneExpr[1,] ~ Age + (1|Individual) + (1|Tissue)
fit <- lmer(form_test, info, REML=FALSE )

# extract variance statistics
calcVarPart(fit)

## ----canCorPairs, cache=TRUE, results='hide', fig.width=5, fig.height=5-------
form <- ~ Individual + Tissue + Batch + Age + Height

# Compute Canonical Correlation Analysis (CCA)
# between all pairs of variables
# returns absolute correlation value  
C = canCorPairs( form, info)

# Plot correlation matrix
plotCorrMatrix( C )

## ----simResult-omit, cache=TRUE, results='hide'-------------------------------

form <- ~ Age + (1|Individual) + (1|Tissue) + (1|Batch)

# Fit model
results <- fitVarPartModel( geneExpr, form, info )

# Extract results
varPart <- extractVarPart( results )

## ----simResult-fast-two-step, cache=TRUE, results='hide'----------------------
# Fit model and run summary() function on each model fit 
vpSummaries <- fitVarPartModel( geneExpr, form, info, fxn=summary )

## ----simResult-fast-two-step1, cache=TRUE-------------------------------------
# Show results of summary() for the first gene
vpSummaries[[1]]

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
form <- ~ (1|Tissue) + (1|Individual) + (1|Batch) + Age
varPart <- fitExtractVarPartModel( geneExpr, form, info )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library('limma')
# subtract out effect of Batch
fit <- lmFit( geneExpr, model.matrix(~ Batch, info))
res <- residuals( fit, geneExpr)

# fit model on residuals
form <- ~ (1|Tissue) + (1|Individual) + Age

varPartResid <- fitExtractVarPartModel( res, form, info )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
# subtract out effect of Batch with linear mixed model
modelFit <- fitVarPartModel( geneExpr, ~ (1|Batch), info )
res <- residuals( modelFit )

# fit model on residuals
form <- ~ (1|Tissue) + (1|Individual) + Age

varPartResid <- fitExtractVarPartModel( res, form, info )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
# extract residuals directly without storing intermediate results
residList <- fitVarPartModel( geneExpr, ~ (1|Batch), info, 
	fxn=residuals )

# convert list to matrix
residMatrix = do.call(rbind, residList)

## ----withinTissue, echo=TRUE, cache=TRUE, results='hide', fig.height=5, fig.width=4----
# specify formula to model within/between individual variance 
# separately for each tissue
# Note that including +0 ensures each tissue is modeled explicitly
# Otherwise, the first tissue would be used as baseline
form <- ~ (Tissue+0|Individual) + Age + (1|Tissue) + (1|Batch)

# fit model and extract variance percents
varPart <- fitExtractVarPartModel( geneExpr, form, info, showWarnings=FALSE )

# violin plot
plotVarPart( sortCols(varPart), label.angle=60 )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
form <- ~ (1|Individual) + (1|Tissue) + Age + Height

# fit model
res <- fitVarPartModel( geneExpr[1:4,], form, info )

## ----echo=TRUE, cache=TRUE----------------------------------------------------
# evaluate the collinearity score on the first model fit
# this reports the correlation matrix between coefficient estimates
# for fixed effects
# the collinearity score is the maximum absolute correlation value
# If the collinearity score > .99 then the variance partition 
# estimates may be problematic
# In that case, a least one variable should be omitted
colinearityScore( res[[1]] )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------

form <- ~ (1|Individual) + (1|Tissue) + Age + Height

# Specify custom weights
# In this example the weights are simulated from a 
# uniform distribution and are not meaningful.
weights <- matrix(runif(length(geneExpr)), nrow=nrow(geneExpr))

# Specify custom weights
res <- fitExtractVarPartModel( geneExpr[1:4,], form, info, 
	weightsMatrix=weights[1:4,] )

## ----vpInteraction, echo=TRUE, cache=TRUE, results='hide', fig.width=4, fig.height=4----
form <- ~ (1|Individual) + Age + Height + (1|Tissue) + (1|Batch) + 
  (1|Batch:Tissue)

# fit model
vpInteraction <- fitExtractVarPartModel( geneExpr, form, info )

plotVarPart( sortCols( vpInteraction ) )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library('limma')
library('edgeR')

# identify genes that pass expression cutoff
isexpr <- rowSums(cpm(geneCounts)>1) >= 0.5 * ncol(geneCounts)

# create data structure with only expressed genes
gExpr <- DGEList(counts=geneCounts[isexpr,])

# Perform TMM normalization
gExpr <- calcNormFactors(gExpr)

# Specify variables to be included in the voom() estimates of 
# uncertainty.
# Recommend including variables with a small number of categories 
# that explain a substantial amount of variation
design <- model.matrix( ~ Batch, info)

# Estimate precision weights for each gene and sample
# This models uncertainty in expression measurements
vobjGenes <- voom(gExpr, design )

# Define formula
form <- ~ (1|Individual) + (1|Tissue) + (1|Batch) + Age

# variancePartition seamlessly deals with the result of voom()
# by default, it seamlessly models the precision weights
# This can be turned off with useWeights=FALSE
varPart <- fitExtractVarPartModel( vobjGenes, form, info )

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library('DESeq2')

# create DESeq2 object from gene-level counts and metadata
dds <- DESeqDataSetFromMatrix(countData = geneCounts,
	colData = info,
	design = ~ 1)

# Estimate library size correction scaling factors
dds <- estimateSizeFactors(dds)

# identify genes that pass expression cutoff
isexpr <- rowSums(fpm(dds)>1) >= 0.5 * ncol(dds)

# compute log2 Fragments Per Million
# Alternatively, fpkm(), vst() or rlog() could be used
quantLog <- log2( fpm( dds )[isexpr,] + 1)

# Define formula
form <- ~ (1|Individual) + (1|Tissue) + (1|Batch) + Age

# Run variancePartition analysis
varPart <- fitExtractVarPartModel( quantLog, form, info)

## ----echo=TRUE, cache=TRUE, results='hide', eval=FALSE------------------------
#  library('tximportData')
#  library('tximport')
#  library('readr')
#  
#  # Get data from folder where tximportData is installed
#  dir <- system.file("extdata", package = "tximportData")
#  samples <- read.table(file.path(dir, "samples.txt"), header = TRUE)
#  files <- file.path(dir, "kallisto", samples$run, "abundance.tsv")
#  names(files) <- paste0("sample", 1:6)
#  
#  tx2gene <- read.csv(file.path(dir, "tx2gene.csv"))
#  
#  # reads results from kallisto
#  txi <- tximport(files, type = "kallisto", tx2gene = tx2gene,
#  	countsFromAbundance = "lengthScaledTPM")
#  
#  # define metadata (usually read from external source)
#  info_tximport <- data.frame( Sample = sprintf("sample%d", 1:6),
#  	Disease=c("case", "control")[c(rep(1, 3), rep(2, 3) )] )
#  
#  # Extract counts from kallisto
#  y <- DGEList( txi$counts )
#  
#  # compute library size normalization
#  y <- calcNormFactors(y)
#  
#  # apply voom to estimate precision weights
#  design <- model.matrix( ~ Disease, data = info_tximport)
#  vobj <- voom(y, design)
#  
#  # define formula
#  form <- ~ (1|Disease)
#  
#  # Run variancePartition analysis (on only 10 genes)
#  varPart_tx <- fitExtractVarPartModel( vobj[1:10,], form,
#  	info_tximport)
#  

## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library('ballgown')

# Get data from folder where ballgown is installed
data_directory <- system.file('extdata', package='ballgown') 

# Load results of Cufflinks/Tablemaker
bg <- ballgown(dataDir=data_directory, samplePattern='sample', 
	meas='all')

# extract gene-level FPKM quantification
# Expression can be convert to log2-scale if desired
gene_expression <- gexpr(bg)

# extract transcript-level FPKM quantification
# Expression can be convert to log2-scale if desired
transcript_fpkm <- texpr(bg, 'FPKM')

# define metadata (usually read from external source)
info_ballgown <- data.frame( Sample = sprintf("sample%02d", 1:20), 
	Batch = rep(letters[1:4], 5), 
	Disease=c("case", "control")[c(rep(1, 10), rep(2, 10) )] )

# define formula
form <- ~ (1|Batch) + (1|Disease)

# Run variancePartition analysis 
# Gene-level analysis
varPart_gene <- fitExtractVarPartModel( gene_expression, form, 
	info_ballgown)

# Transcript-level analysis
varPart_transcript <- fitExtractVarPartModel( transcript_fpkm, form, 
	info_ballgown)

## ----echo=FALSE, cache=TRUE, results='hide'-----------------------------------
library('variancePartition')

sim_data = function( n, p, var_indiv, var_site){

  info = data.frame(ID = paste("s", 1:n, sep=''), 
    Individual = factor(rep( round(seq(1, n/6, length.out=n/6)),6)), 
    Tissue = factor(sort(rep(toupper(letters[1:3]), n/3))),
    Age = rpois(n, 50))

  geneExpr = matrix(0, nrow=p, ncol=n)
  for( i in 1:p){
    beta_indiv = rnorm(nlevels(info$Individual), 0, sqrt(var_indiv))
    beta_site = rnorm(nlevels(info$Tissue), 0, sqrt(var_site))
    eta = model.matrix( ~ Individual, info) %*% beta_indiv + model.matrix( ~ Tissue+0, info) %*% beta_site

    noise = rbeta(1, 10, 100)
    errVar = var(eta) * (noise) / (1-noise)

    geneExpr[i,] = eta + rnorm(n, 0, sqrt(errVar))
  }
  colnames(geneExpr) = paste("s", 1:ncol(geneExpr), sep='')
  rownames(geneExpr) = paste("gene", 1:nrow(geneExpr), sep='')

  return( list(info=info, geneExpr=geneExpr))
}

plotVar = function( geneExpr, info ){

  form =  ~ (1|Individual) + (1|Tissue)
  varPart = fitExtractVarPartModel( geneExpr, form, info )
  plotVarPart( varPart, col=rainbow(8)[1:3] )
}

plotVarCross = function( geneExpr, info, label.angle=30 ){

  form =  ~ (Tissue+0|Individual) + (1|Tissue)

  #res = fitVarPartModel( geneExpr, form, info )
  #varPart = extractVarPart( res )

  varPart = fitExtractVarPartModel( geneExpr, form, info, showWarnings=FALSE )

  plotVarPart( varPart, label.angle=label.angle )
}

plotPCA = function(geneExpr, col){

  dcmp = prcomp(t(geneExpr))

  par(mar =  c(4, 4, 1, 1) + 0.1)
  plot(dcmp$x[,1:2], col=col)

}

plotTree = function(geneExpr, col){

  hc = hclust(dist(t(geneExpr)))
  library('dendextend')
  hcd = as.dendrogram(hc)
  labels_colors(hcd) <- col[match(labels(hcd), names(col))]

  par(mar =  c(4, 4, 1, 1) + 0.1, cex=.6)
  plot( hcd, yaxt='n', horiz=TRUE )
}

## ----siteDominant, echo=FALSE, cache=TRUE, results='hide', fig.height=5, fig.width=5----
set.seed(1)

n = 60
p = 200

data = sim_data( n, p, 1, 4)

colTissue = rainbow(nlevels(data$info$Tissue))[data$info$Tissue]
names(colTissue) = data$info$ID

colIndiv = rainbow(nlevels(data$info$Individual))[data$info$Individual]
names(colIndiv) = data$info$ID

plotPCA( data$geneExpr, colTissue)

plotPCA( data$geneExpr, colIndiv)

plotTree( data$geneExpr, colTissue)
legend("topleft", legend=levels(data$info$Tissue), fill=rainbow(nlevels(data$info$Tissue))[1:nlevels(data$info$Tissue)], title="Tissue")

plotTree( data$geneExpr, colIndiv)
legend("topleft", legend=levels(data$info$Individual), fill=rainbow(nlevels(data$info$Individual))[1:nlevels(data$info$Individual)], title="Individual")

plotVar( data$geneExpr, data$info )

plotVarCross( data$geneExpr, data$info, label.angle=60 )

## ----IndivDominant, echo=FALSE, cache=TRUE, results='hide', fig.height=5, fig.width=5----
set.seed(1)

n = 60
p = 200

data = sim_data( n, p, 3, 1)

colTissue = rainbow(nlevels(data$info$Tissue))[data$info$Tissue]
names(colTissue) = data$info$ID

colIndiv = rainbow(nlevels(data$info$Individual))[data$info$Individual]
names(colIndiv) = data$info$ID

plotPCA( data$geneExpr, colTissue)

plotPCA( data$geneExpr, colIndiv)

plotTree( data$geneExpr, colTissue)
legend("topleft", legend=levels(data$info$Tissue), fill=rainbow(nlevels(data$info$Tissue))[1:nlevels(data$info$Tissue)], title="Tissue")

plotTree( data$geneExpr, colIndiv)
legend("topleft", legend=levels(data$info$Individual), fill=rainbow(nlevels(data$info$Individual))[1:nlevels(data$info$Individual)], title="Individual")

plotVar( data$geneExpr, data$info )

plotVarCross( data$geneExpr, data$info, label.angle=60 )

## ----echo=FALSE, cache=FALSE--------------------------------------------------
# if( "cluster" %in% class(cl) ){
if( exists("cl") ){
	library('doParallel')
	
	# stop cluster and catch warning if invalid
	res = tryCatch( {stopCluster(cl)}, warning = function(x) {
	}, error = function(x) {
	}, finally={
	})
	# warning("STOP CLUSTER!!!!!!!!!!!!!!!!!\n")
}

## ----DE, echo=FALSE, cache=TRUE, fig.height=5, fig.width=5--------------------
set.seed(1) 

library('variancePartition')

n = 500
data = data.frame(Sex = factor(c(rep('F', n), rep('M', n))))
data$expression = rnorm(2*n, (as.integer(data$Sex)-1)*2, .3)
data$Sex = factor(data$Sex)

fit = lm(expression ~ Sex, data)
# calcVarPart(fit)
# coef(fit)[2]

plotStratify( expression ~ Sex, data, ylim=c(-6, 9)) +
 annotate("text", x = 0.5, y = 9, hjust=0, label = paste("fold change:", format(coef(fit)[2], digits=3)), size=5.5) +
 annotate("text", x = 0.5, y = 8.2, hjust=0, label = paste("% variance of expression:", format(calcVarPart(fit)[1]*100, digits=3), "%"), size=5.5)


n = 500
data = data.frame(Sex = factor(c(rep('F', n), rep('M', n))))
data$expression = rnorm(2*n, (as.integer(data$Sex)-1)*2, 2.01)
data$Sex = factor(data$Sex)

fit = lm(expression ~ Sex, data)
# calcVarPart(fit)
# coef(fit)[2] 

plotStratify( expression ~ Sex, data, ylim=c(-6, 9)) +
 annotate("text", x = 0.5, y = 9, hjust=0, label = paste("fold change:", format(coef(fit)[2], digits=3)), size=5.5) +
 annotate("text", x = 0.5, y = 8.2, hjust=0, label = paste("% variance of expression:", format(calcVarPart(fit)[1]*100, digits=3), "%"), size=5.5)

## ----sessInfo, results="asis", echo=FALSE-------------------------------------
toLatex(sessionInfo())

## ----resetOptions, results="hide", echo=FALSE---------------------------------
options(prompt="> ", continue="+ ")
Any scripts or data that you put into this service are public.
variancePartition documentation built on Nov. 8, 2020, 5:18 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
variancePartition
Quantify and interpret divers of variation in multilevel gene expression experiments

inst/doc/variancePartition.R
In variancePartition: Quantify and interpret divers of variation in multilevel gene expression experiments

Try the variancePartition package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

variancePartition Quantify and interpret divers of variation in multilevel gene expression experiments

inst/doc/variancePartition.R In variancePartition: Quantify and interpret divers of variation in multilevel gene expression experiments

Try the variancePartition package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

variancePartition
Quantify and interpret divers of variation in multilevel gene expression experiments

inst/doc/variancePartition.R
In variancePartition: Quantify and interpret divers of variation in multilevel gene expression experiments