Nothing
## ----libraries,echo=FALSE, error=FALSE,echo=TRUE,results='hide',warning=FALSE----
suppressPackageStartupMessages({
library(ggplot2)
library(GGally)
library(GSEABase)
library(limma)
library(reshape2)
library(data.table)
library(knitr)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(stringr)
library(NMF)
library(rsvd)
library(RColorBrewer)
library(MAST)
})
#options(mc.cores = detectCores() - 1) #if you have multiple cores to spin
options(mc.cores = 1)
knitr::opts_chunk$set(message = FALSE,error = FALSE,warning = FALSE,cache = FALSE,fig.width=8,fig.height=6)
## ----MASToptions--------------------------------------------------------------
freq_expressed <- 0.2
FCTHRESHOLD <- log2(1.5)
## ----data,results='hide'------------------------------------------------------
data(maits, package='MAST')
dim(maits$expressionmat)
head(maits$cdat)
head(maits$fdat)
## ----createSca----------------------------------------------------------------
scaRaw <- FromMatrix(t(maits$expressionmat), maits$cdat, maits$fdat)
## ----heatmap, dpi = 36,dev="png"----------------------------------------------
aheatmap(assay(scaRaw[1:1000,]), labRow='', annCol=as.data.frame(colData(scaRaw)[,c('condition', 'ourfilter')]), distfun='spearman')
## ----pca, dpi = 36------------------------------------------------------------
set.seed(123)
plotPCA <- function(sca_obj){
projection <- rpca(t(assay(sca_obj)), retx=TRUE, k=4)$x
colnames(projection)=c("PC1","PC2","PC3","PC4")
pca <- data.table(projection, as.data.frame(colData(sca_obj)))
print(ggpairs(pca, columns=c('PC1', 'PC2', 'PC3', 'libSize', 'PercentToHuman', 'nGeneOn', 'exonRate'),
mapping=aes(color=condition), upper=list(continuous='blank')))
invisible(pca)
}
plotPCA(scaRaw)
filterCrit <- with(colData(scaRaw), pastFastqc=="PASS"& exonRate >0.3 & PercentToHuman>0.6 & nGeneOn> 4000)
## ----filter_outlying_cell,results='hide'--------------------------------------
sca <- subset(scaRaw,filterCrit)
eid <- select(TxDb.Hsapiens.UCSC.hg19.knownGene,keys = mcols(sca)$entrez,keytype ="GENEID",columns = c("GENEID","TXNAME"))
ueid <- unique(na.omit(eid)$GENEID)
sca <- sca[mcols(sca)$entrez %in% ueid,]
## Remove invariant genes
sca <- sca[sample(which(freq(sca)>0), 6000),]
## ---- fig.width=4, fig.height=4-----------------------------------------------
cdr2 <-colSums(assay(sca)>0)
qplot(x=cdr2, y=colData(sca)$nGeneOn) + xlab('New CDR') + ylab('Old CDR')
## -----------------------------------------------------------------------------
colData(sca)$cngeneson <- scale(cdr2)
## ----pcaFilter, dpi = 36------------------------------------------------------
plotPCA(sca)
## ----distribution, fig.width=6, fig.height=6----------------------------------
scaSample <- sca[sample(which(freq(sca)>.1), 20),]
flat <- as(scaSample, 'data.table')
ggplot(flat, aes(x=value))+geom_density() +facet_wrap(~symbolid, scale='free_y')
## ----threshold, results='hide', fig.width=6-----------------------------------
thres <- thresholdSCRNACountMatrix(assay(sca), nbins = 20, min_per_bin = 30)
par(mfrow=c(5,4))
plot(thres)
## ----assignThresh-------------------------------------------------------------
assays(sca, withDimnames = FALSE) <- list(thresh=thres$counts_threshold, tpm=assay(sca))
expressed_genes <- freq(sca) > freq_expressed
sca <- sca[expressed_genes,]
## ----zlm----------------------------------------------------------------------
cond<-factor(colData(sca)$condition)
cond<-relevel(cond,"Unstim")
colData(sca)$condition<-cond
zlmCond <- zlm(~condition + cngeneson, sca)
# The following are equivalent
## lrt <- lrTest(zlm, "condition")
## lrt <- lrTest(zlm, CoefficientHypothesis('conditionStim'))
# This would test if 2*cngeneson=conditionStim
# This is sheer nonsense biologically and statistically, but gives an example of the flexibility.
## lrt <- lrTest(zlm, Hypothesis('2*cngeneson-conditionStim'))
## ----zlmSummary---------------------------------------------------------------
#only test the condition coefficient.
summaryCond <- summary(zlmCond, doLRT='conditionStim')
#print the top 4 genes by contrast using the logFC
print(summaryCond, n=4)
## by discrete Z-score
print(summaryCond, n=4, by='D')
## by continuous Z-score
print(summaryCond, n=4, by='C')
## -----------------------------------------------------------------------------
summaryDt <- summaryCond$datatable
fcHurdle <- merge(summaryDt[contrast=='conditionStim' & component=='H',.(primerid, `Pr(>Chisq)`)], #hurdle P values
summaryDt[contrast=='conditionStim' & component=='logFC', .(primerid, coef, ci.hi, ci.lo)], by='primerid') #logFC coefficients
fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, 'fdr')]
fcHurdleSig <- merge(fcHurdle[fdr<.05 & abs(coef)>FCTHRESHOLD], as.data.table(mcols(sca)), by='primerid')
setorder(fcHurdleSig, fdr)
## ---- fig.width=8,fig.height=8, dpi = 36--------------------------------------
entrez_to_plot <- fcHurdleSig[1:50,primerid]
symbols_to_plot <- fcHurdleSig[1:50,symbolid]
flat_dat <- as(sca[entrez_to_plot,], 'data.table')
ggbase <- ggplot(flat_dat, aes(x=condition, y=thresh,color=condition)) + geom_jitter()+facet_wrap(~symbolid, scale='free_y')+ggtitle("DE Genes in Activated MAIT Cells")
ggbase+geom_violin()
## ---- dpi = 36----------------------------------------------------------------
flat_dat[,lmPred:=lm(thresh~cngeneson + condition)$fitted, key=symbolid]
ggbase +aes(x=cngeneson) + geom_line(aes(y=lmPred), lty=1) + xlab('Standardized Cellular Detection Rate')
## ---- dpi = 36----------------------------------------------------------------
## This is all rather kludgy at the moment
MM <- model.matrix(~condition,unique(colData(sca)[,c("condition"),drop=FALSE]))
rownames(MM) <- str_extract(rownames(MM), 'Stim|Unstim')
predicted <- predict(zlmCond,modelmatrix=MM)
## Avert your eyes...
predicted[, primerid:=as.character(primerid)]
predicted_sig <- merge(mcols(sca), predicted[primerid%in%entrez_to_plot], by='primerid')
predicted_sig <- as.data.table(predicted_sig)
## plot with inverse logit transformed x-axis
ggplot(predicted_sig)+aes(x=invlogit(etaD),y=muC,xse=seD,yse=seC,col=sample)+
facet_wrap(~symbolid,scales="free_y")+theme_linedraw()+
geom_point(size=0.5)+scale_x_continuous("Proportion expression")+
scale_y_continuous("Estimated Mean")+
stat_ell(aes(x=etaD,y=muC),level=0.95, invert='x')
## ---- fig.height=8------------------------------------------------------------
mat_to_plot <- assay(sca[entrez_to_plot,])
rownames(mat_to_plot) <- symbols_to_plot
aheatmap(mat_to_plot,annCol=colData(sca)[,"condition"],main="DE genes",col=rev(colorRampPalette(colors = brewer.pal(name="PiYG",n=10))(20)))
## -----------------------------------------------------------------------------
table(colData(sca)$beta, exclude=NULL)
#Note that we currently throw an uninformative error if a covariate is `NA`
scaHasBeta <- subset(sca, !is.na(beta))
## ----residuals----------------------------------------------------------------
scaDE <- sca[entrez_to_plot,]
zlmResidDE <- zlm(~condition + cngeneson, scaDE, hook=combined_residuals_hook)
residDE <- zlmResidDE@hookOut
residDEMatrix <- do.call(rbind, residDE)
## ----addResiduals, dpi = 36---------------------------------------------------
assays(scaDE, withDimnames = FALSE) <- c(assays(scaDE), list(resid=residDEMatrix))
scaResidFlat <- as(scaDE, 'data.table')
scaResidFlat[1:4,]
ggplot(scaResidFlat, aes(x=ngeneson, y=resid))+geom_point(aes(col=condition))+geom_smooth()+facet_wrap(~symbolid)
## ----boots, eval=TRUE---------------------------------------------------------
# bootstrap, resampling cells
# R should be set to >50 if you were doing this for real.
boots <- bootVcov1(zlmCond, R = 4)
## ----setupBTM, dependson="data;zlm;boots",results='hide'----------------------
module <- "BTM"
min_gene_in_module <- 5
packageExt <- system.file("extdata", package='MAST')
module_file <- list.files(packageExt, pattern = module, full.names = TRUE)
gene_set <- getGmt(module_file)
gene_ids <- geneIds(gene_set)
gene_ids <- gene_ids[!names(gene_ids)%like%"TBA"&!names(gene_ids)%like%"B cell"]
sets_indices <- limma::ids2indices(gene_ids, mcols(sca)$symbolid)
# Only keep modules with at least min_gene_in_module
sets_indices <- sets_indices[sapply(sets_indices, length) >= min_gene_in_module]
## ----gsea---------------------------------------------------------------------
gsea <- gseaAfterBoot(zlmCond, boots, sets_indices, CoefficientHypothesis("conditionStim"))
z_stat_comb <- summary(gsea, testType='normal')
## ----gseaView-----------------------------------------------------------------
sigModules <- z_stat_comb[combined_adj<.01]
gseaTable <- melt(sigModules[,.(set, disc_Z, cont_Z, combined_Z)], id.vars='set')
ggplot(gseaTable, aes(y=set, x=variable, fill=value))+geom_raster() + scale_fill_distiller(palette="PiYG")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.