knitr::opts_chunk$set(echo = TRUE)
This script reads in the data output to the feather file and uses it to calculate and plot some QC information for Supplementary figure 1. It also performs a reclustering of excitatory cells to show that the PT (now called ET) cluster does not get split into further clusters using less stringent parameters.
If needed, set the working directory first: e.g., setwd("C:/Users/jeremym/Desktop/VEN_TEST")
.
suppressPackageStartupMessages({ library(VENcelltypes) library(dplyr); library(feather) library(gplots) library(dplyr) library(ggplot2) library(ggbeeswarm) library(scrattch.hicat) library(Rtsne) }) options(stringsAsFactors=FALSE)
These files were created in the previous code file, which must be run prior to running this code block.
inputFolder = "FI/" Samp.dat <- read_feather(paste(inputFolder,"anno.feather",sep="")) Expr.dat <- feather(paste(inputFolder,"data.feather",sep="")) # FPKM Expr.dat <- Expr.dat[match(Samp.dat$sample_id,Expr.dat$sample_id),] # Make sure the expression matches the sample information datIn <- as.matrix(Expr.dat[,names(Expr.dat)!="sample_id"]) rownames(datIn) <- Expr.dat$sample_id datIn <- t(datIn) load(paste0(inputFolder,"clusterInfo.rda")) load(paste0(inputFolder,"dend.rda")) kpSamp <- is.element(Samp.dat$cluster_type_label,c("inh","exc","glia")) cl3 <- factor(Samp.dat$cluster_label,levels = clusterInfo$cluster_label) names(cl3) <- colnames(datIn) clustersF <- droplevels(cl3[kpSamp]) clusts <- levels(clustersF) datKp <- datIn[,kpSamp]
types = Samp.dat$cluster_type_label[kpSamp] cols = c("blue","purple","orange") names(cols) = unique(types) numGenes <- c(0,2,5,10,20,50,100,200,500) yy = NULL for (i in 1:length(numGenes)){ maxCPM = colSums(datKp>numGenes[i]) yTmp = NULL for (ty in unique(types)) yTmp = c(yTmp,median(maxCPM[types==ty])) yy = cbind(yy,round(yTmp)) } rownames(yy) = unique(types) data.frame(t(rbind(numGenes,yy)))
There are a median of 10,339 genes detected in excitatory cells with CPM>1 (and 9,426 in inhibitory cells, and 6,6146 in glia).
print(paste("We find a median of",median(Samp.dat$mappedReads_label),"reads per nucleus."))
We next want to plot some QC statistics (genes detected, # mapped reads, % exonic reads, and % intronic reads) at the level of clusters. To do this we first need to prepare some variables.
cluster_anno <- Samp.dat %>% select(cluster_id, cluster_label, cluster_color) %>% unique() cluster_anno <- cluster_anno[match(labels(dend),cluster_anno$cluster_label),] # Note, the plots format the labels weird and we shorten the names just to check that they are ordered correctly # They are and therefore the names are corrected in Adobe Illustrator cluster_anno$cluster_label <- substr(cluster_anno$cluster_label,nchar(cluster_anno$cluster_label)-6,nchar(cluster_anno$cluster_label)) anno = as.data.frame(Samp.dat)
Next we output these plots to a file.
qcPlot(anno,"genecountsGTzeroFPKM_label",scaleLimits = c(-4000, 16056), scaleBreaks = seq(0, 16000, 2000), scaleLabels = seq(0,16,2),ylab="Genes>0 (thousands)",width=4,height=2) qcPlot(anno,"mappedReads_label",scaleLimits = c(-3750000, 15000000), scaleBreaks = seq(0, 15000000, 5000000), scaleLabels = seq(0,15,5),ylab="Mapped reads (millions)",width=4,height=2) qcPlot(anno,"PCT_INTRONIC_BASES_label",scaleLimits = c(-0.25,1), scaleBreaks = seq(0, 1, 0.25), scaleLabels = seq(0,100,25),ylab="% intronic reads",width=4,height=2) qcPlot(anno,"PCT_HUMAN_EXONS_BASES_label",scaleLimits = c(-0.25,1), scaleBreaks = seq(0, 1, 0.25), scaleLabels = seq(0,100,25),ylab="% exonic reads",width=4,height=2)
We noticed that cells from one donor tended to be of higher quality and higher pass rate than cells from the second donor and want to see whether this introduced a bias in our sampling. First, let's look at the class level to see if we have a comparable fraction.
cellClass <- substr(anno$cluster_label,1,3) cellClass[!is.element(cellClass,c("Exc","Inh"))] = "Glia" dat2 <- table(factor(cellClass,levels=c("Inh","Exc","Glia")),anno$patient_id_label) dat2 <- round(100*t(t(dat2)/colSums(dat2))) par(mar=c(14,4,2,1)) barplot(t(dat2/100),las=2,beside=TRUE,ylab="Fraction in specimen") abline(h=0) pdf("FigS1_class_barplot.pdf",height=5,width=2.5) par(mar=c(14,4,2,1)) barplot(t(dat2/100),las=2,beside=TRUE,ylab="Fraction in specimen") abline(h=0) dev.off()
We do find a dearth of exictatory cells in one of the donors relative tot he other donor and relative to expectations. What about at the level of cell types, if we control for this difference in cells per class?
datScale <- table(factor(anno$cluster_label,levels=labels(dend)),anno$patient_id_label) i = 1:5; datScale[i,] <- t(t(datScale[i,])/colSums(datScale[i,])) i = 6:18; datScale[i,] <- t(t(datScale[i,])/colSums(datScale[i,])) i = 19:22; datScale[i,] <- t(t(datScale[i,])/colSums(datScale[i,])) par(mar=c(14,4,2,1)) barplot(t(datScale),las=2,beside=TRUE,ylab="Fraction in class",legend=TRUE,xlim=c(0,85)) abline(v=c(15.5,54.5)) abline(h=0) pdf("FigS1_cluster_barplot.pdf",height=5,width=12) par(mar=c(14,4,2,1)) barplot(t(datScale),las=2,beside=TRUE,ylab="Fraction in class",legend=TRUE,xlim=c(0,85)) abline(v=c(15.5,54.5)) abline(h=0) dev.off()
Most cell types have a relatively even distribution in the two donors, after you scale for the number of cells from each class.
Since the primary focus of this manuscript is the Exc FEZF2 GABRQ cluster, we want check that this cannot be further divided into subclusters with the available resolution of the current data set, and using reasonably statistically sound methods. To do so, we first adjust our scrattch.hicat
clustering pipeline to use parameters that are much more lenient that recommended and repeat the clustering and see how the results change.
isExc <- is.element(Samp.dat$cluster_type_label,"exc") datFI <- log2(datIn[,isExc]+1) datFI <- datFI[rowSums(datFI)>=1,] annoFI <- Samp.dat[isExc,] de.param <- de_param(padj.th = 0.1, # 0.05, # Recommended or default parameters commented out lfc.th = 0.2, # 1, low.th = 1, # 1, q1.th = 0.2, # 0.5, q.diff.th = 0.3, # 0.7 de.score.th = 5, # 40 min.cells = 3) # 4 cluster_new <- iter_clust(datFI,dim.method = "pca", de.param = de.param, split.size = 4) (tab <- table(annoFI$cluster_label,as.numeric(as.character(cluster_new$cl))))
Plot the results as compared with the initial clustering.
cl22 <- as.numeric(factor(cluster_new$cl,levels = as.numeric(sort(names(apply(tab,2,which.max)))))) names(cl22) <- annoFI$sample_id ref.cl <- setNames(factor(annoFI$cluster_id), annoFI$sample_id) ref.cl.df <- as.data.frame(unique(annoFI[,c("cluster_label", "cluster_id")])) ref.cl.df <- ref.cl.df[order(ref.cl.df$cluster_id),] rownames(ref.cl.df) <- ref.cl.df$cluster_id compare.result <- compare_annotate(cl22, ref.cl, ref.cl.df) compare.result$g ggsave("FigS1_clusterComparisonLenientParams.pdf", width = 7, height = 4)
Several studies have identified genes (or proteins) expressed in VENs using a variety of strategies. Here we will use these genes as a starting point to attempt to split the VEN-associated cluster into multiple cell types using a supervised approach. First, let's see which of these genes are available for plotting in our study.
literature <- c("VAT1L","CHST8","LYPD1","SULF2","BCL11B","FEZF2","SLC18A2","GABRQ","ADRA1A","ATF3","IL4R", "NMB","DISC1","VGF","ABAT","CBLN2","CHRM1","CHRNA4","CNR1","CRYM","DLD","GABBR1","GABRR2", "GABRA","GABRB2","GABRB3","GABRD","GLRB","GLS","GOT1","GOT2","GRIA1","GRIA2","GRIA3","GRIK2", "GRIN1","GRIN2A","GRIN2B","GRIN3A","HRH3","HTR3B","NEFH","NNAT","SCG2","SYT2","TAC1","NPY", "IGFBP2","MEF2C","CPLX1","MBP","PLP1","RNA5SP352","ACA64","ALKBH3","FABP6","PLP1","METRNL", "LINC00982","RNU6-1240P","RGMA","PRR5","HAPLN4","SOHLH1","HTR2B","DRD3","GRP","LMO4") print(paste("Genes not in our data set:",paste(setdiff(literature,rownames(datKp)),collapse=", "))) literature <- intersect(literature,rownames(normDat))
Next, calculate PCS of the VEN-associated cluster using this set of genes, and 100 random sets of genes for comparison.
kpVEN <- clustersF=="Exc FEZF2 GABRQ" datVEN <- t(datKp[literature,kpVEN]) datVEN <- log2(datVEN[,colSums(datVEN)>0]+1) pcVEN <- prcomp(datVEN, center = TRUE, scale = TRUE) veVEN <- (summary(pcVEN))[[6]][2,1] datRAN <- t(datKp[,kpVEN][rowSums(datKp[,kpVEN])>0,]) veRAN <- NULL for (i in 1:100){ gnRAN <- subsampleCells(rep(0,dim(datRAN)[2]),dim(datVEN)[2],seed=i) pcRAN <- prcomp(datRAN[,gnRAN], center = TRUE, scale = TRUE) veRAN <- c(veRAN,(summary(pcRAN))[[6]][2,1]) } hist(veRAN,main=paste("P =",mean(veRAN>veVEN)), xlab="Variance explained by PC1") abline(v=veVEN,lwd=3,col="green") pdf("FigS1_PC1varianceExplained.pdf") hist(veRAN,main=paste("P =",mean(veRAN>veVEN)), xlab="Variance explained by PC1") abline(v=veVEN,lwd=3,col="green") dev.off()
The variance explained by PC1 is not significantly different from chance.
Finally, show a TNSE plot of the results.
pdf("FigS1_rtsne_plot.pdf",height=8,width=8) par(mfrow=c(2,2)) for (p in c(4:7)){ tsne <- Rtsne(datVEN,perplexity=p)$Y plot(tsne,xlab="TSNE 1",ylab="TSNE2",main=paste("Perplexity =",p),pch=19) } dev.off() plot(tsne,xlab="TSNE 1",ylab="TSNE2",main=paste("Perplexity =",p),pch=19)
There are not obvious subclusters using these genes.
Output session information.
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.