In AllenInstitute/L5_VEN:

knitr::opts_chunk$set(echo = TRUE)

Overview

This script reads in the data output to the feather file and uses it to calculate and plot some QC information for Supplementary figure 1. It also performs a reclustering of excitatory cells to show that the PT (now called ET) cluster does not get split into further clusters using less stringent parameters.

If needed, set the working directory first: e.g., setwd("C:/Users/jeremym/Desktop/VEN_TEST").

Load the relevant libraries

suppressPackageStartupMessages({
  library(VENcelltypes)
  library(dplyr);
  library(feather)
  library(gplots)
  library(dplyr)
  library(ggplot2)
  library(ggbeeswarm)
  library(scrattch.hicat)
  library(Rtsne)
})
options(stringsAsFactors=FALSE)

Read in the data and metadata

These files were created in the previous code file, which must be run prior to running this code block.

inputFolder = "FI/"

Samp.dat   <- read_feather(paste(inputFolder,"anno.feather",sep="")) 
Expr.dat   <- feather(paste(inputFolder,"data.feather",sep=""))   # FPKM
Expr.dat   <- Expr.dat[match(Samp.dat$sample_id,Expr.dat$sample_id),] # Make sure the expression matches the sample information
datIn      <- as.matrix(Expr.dat[,names(Expr.dat)!="sample_id"])
rownames(datIn) <- Expr.dat$sample_id
datIn      <- t(datIn)
load(paste0(inputFolder,"clusterInfo.rda"))
load(paste0(inputFolder,"dend.rda"))

kpSamp     <- is.element(Samp.dat$cluster_type_label,c("inh","exc","glia"))
cl3        <- factor(Samp.dat$cluster_label,levels = clusterInfo$cluster_label)
names(cl3) <- colnames(datIn)
clustersF  <- droplevels(cl3[kpSamp])
clusts     <- levels(clustersF)
datKp      <- datIn[,kpSamp]

Calculate genes detected by broad class.

types = Samp.dat$cluster_type_label[kpSamp]
cols  = c("blue","purple","orange")
names(cols) = unique(types)
numGenes <- c(0,2,5,10,20,50,100,200,500)

yy = NULL
for (i in 1:length(numGenes)){
  maxCPM = colSums(datKp>numGenes[i])
  yTmp = NULL
  for (ty in unique(types)) yTmp = c(yTmp,median(maxCPM[types==ty]))
  yy = cbind(yy,round(yTmp))
}
rownames(yy) = unique(types)

data.frame(t(rbind(numGenes,yy)))

There are a median of 10,339 genes detected in excitatory cells with CPM>1 (and 9,426 in inhibitory cells, and 6,6146 in glia).

print(paste("We find a median of",median(Samp.dat$mappedReads_label),"reads per nucleus."))

Make jitterplots for other QC statistics

We next want to plot some QC statistics (genes detected, # mapped reads, % exonic reads, and % intronic reads) at the level of clusters. To do this we first need to prepare some variables.

cluster_anno <- Samp.dat %>%
  select(cluster_id, cluster_label, cluster_color) %>%
  unique()
cluster_anno <- cluster_anno[match(labels(dend),cluster_anno$cluster_label),]

# Note, the plots format the labels weird and we shorten the names just to check that they are ordered correctly
# They are and therefore the names are corrected in Adobe Illustrator
cluster_anno$cluster_label <- substr(cluster_anno$cluster_label,nchar(cluster_anno$cluster_label)-6,nchar(cluster_anno$cluster_label))

anno = as.data.frame(Samp.dat)

Next we output these plots to a file.

qcPlot(anno,"genecountsGTzeroFPKM_label",scaleLimits = c(-4000, 16056), scaleBreaks = seq(0, 16000, 2000), 
  scaleLabels = seq(0,16,2),ylab="Genes>0 (thousands)",width=4,height=2)
qcPlot(anno,"mappedReads_label",scaleLimits = c(-3750000, 15000000), scaleBreaks = seq(0, 15000000, 5000000), 
  scaleLabels = seq(0,15,5),ylab="Mapped reads (millions)",width=4,height=2)
qcPlot(anno,"PCT_INTRONIC_BASES_label",scaleLimits = c(-0.25,1), scaleBreaks = seq(0, 1, 0.25), 
  scaleLabels = seq(0,100,25),ylab="% intronic reads",width=4,height=2)
qcPlot(anno,"PCT_HUMAN_EXONS_BASES_label",scaleLimits = c(-0.25,1), scaleBreaks = seq(0, 1, 0.25), 
  scaleLabels = seq(0,100,25),ylab="% exonic reads",width=4,height=2)

Plot the fraction of cells from each donor

We noticed that cells from one donor tended to be of higher quality and higher pass rate than cells from the second donor and want to see whether this introduced a bias in our sampling. First, let's look at the class level to see if we have a comparable fraction.

cellClass <- substr(anno$cluster_label,1,3)
cellClass[!is.element(cellClass,c("Exc","Inh"))] = "Glia"
dat2 <- table(factor(cellClass,levels=c("Inh","Exc","Glia")),anno$patient_id_label)
dat2 <- round(100*t(t(dat2)/colSums(dat2)))

par(mar=c(14,4,2,1))
barplot(t(dat2/100),las=2,beside=TRUE,ylab="Fraction in specimen")
abline(h=0)

pdf("FigS1_class_barplot.pdf",height=5,width=2.5)
par(mar=c(14,4,2,1))
barplot(t(dat2/100),las=2,beside=TRUE,ylab="Fraction in specimen")
abline(h=0)
dev.off()

We do find a dearth of exictatory cells in one of the donors relative tot he other donor and relative to expectations. What about at the level of cell types, if we control for this difference in cells per class?

datScale <- table(factor(anno$cluster_label,levels=labels(dend)),anno$patient_id_label)

i = 1:5;
datScale[i,] <- t(t(datScale[i,])/colSums(datScale[i,]))
i = 6:18;
datScale[i,] <- t(t(datScale[i,])/colSums(datScale[i,]))
i = 19:22;
datScale[i,] <- t(t(datScale[i,])/colSums(datScale[i,]))

par(mar=c(14,4,2,1))
barplot(t(datScale),las=2,beside=TRUE,ylab="Fraction in class",legend=TRUE,xlim=c(0,85))
abline(v=c(15.5,54.5))
abline(h=0)

pdf("FigS1_cluster_barplot.pdf",height=5,width=12)
par(mar=c(14,4,2,1))
barplot(t(datScale),las=2,beside=TRUE,ylab="Fraction in class",legend=TRUE,xlim=c(0,85))
abline(v=c(15.5,54.5))
abline(h=0)
dev.off()

Most cell types have a relatively even distribution in the two donors, after you scale for the number of cells from each class.

Recluster excitatory cells.

Since the primary focus of this manuscript is the Exc FEZF2 GABRQ cluster, we want check that this cannot be further divided into subclusters with the available resolution of the current data set, and using reasonably statistically sound methods. To do so, we first adjust our scrattch.hicat clustering pipeline to use parameters that are much more lenient that recommended and repeat the clustering and see how the results change.

isExc  <- is.element(Samp.dat$cluster_type_label,"exc")
datFI  <- log2(datIn[,isExc]+1)
datFI  <- datFI[rowSums(datFI)>=1,]
annoFI <- Samp.dat[isExc,] 

de.param <- de_param(padj.th     = 0.1, # 0.05, # Recommended or default parameters commented out
                     lfc.th      = 0.2, # 1, 
                     low.th      = 1, # 1,
                     q1.th       = 0.2, # 0.5, 
                     q.diff.th   = 0.3, # 0.7
                     de.score.th = 5,   # 40
                     min.cells   = 3)   # 4

cluster_new <- iter_clust(datFI,dim.method = "pca", de.param = de.param, split.size = 4)
(tab <- table(annoFI$cluster_label,as.numeric(as.character(cluster_new$cl))))

Plot the results as compared with the initial clustering.

cl22 <- as.numeric(factor(cluster_new$cl,levels = as.numeric(sort(names(apply(tab,2,which.max))))))
names(cl22) <- annoFI$sample_id

ref.cl    <- setNames(factor(annoFI$cluster_id), annoFI$sample_id)
ref.cl.df <- as.data.frame(unique(annoFI[,c("cluster_label", "cluster_id")]))
ref.cl.df <- ref.cl.df[order(ref.cl.df$cluster_id),]
rownames(ref.cl.df) <- ref.cl.df$cluster_id
compare.result <- compare_annotate(cl22, ref.cl, ref.cl.df)
compare.result$g
ggsave("FigS1_clusterComparisonLenientParams.pdf", width = 7, height = 4)

Supervised analysis of VEN-associated cluster

Several studies have identified genes (or proteins) expressed in VENs using a variety of strategies. Here we will use these genes as a starting point to attempt to split the VEN-associated cluster into multiple cell types using a supervised approach. First, let's see which of these genes are available for plotting in our study.

literature <- c("VAT1L","CHST8","LYPD1","SULF2","BCL11B","FEZF2","SLC18A2","GABRQ","ADRA1A","ATF3","IL4R",
                "NMB","DISC1","VGF","ABAT","CBLN2","CHRM1","CHRNA4","CNR1","CRYM","DLD","GABBR1","GABRR2",
                "GABRA","GABRB2","GABRB3","GABRD","GLRB","GLS","GOT1","GOT2","GRIA1","GRIA2","GRIA3","GRIK2",
                "GRIN1","GRIN2A","GRIN2B","GRIN3A","HRH3","HTR3B","NEFH","NNAT","SCG2","SYT2","TAC1","NPY",
                "IGFBP2","MEF2C","CPLX1","MBP","PLP1","RNA5SP352","ACA64","ALKBH3","FABP6","PLP1","METRNL",
                "LINC00982","RNU6-1240P","RGMA","PRR5","HAPLN4","SOHLH1","HTR2B","DRD3","GRP","LMO4")
print(paste("Genes not in our data set:",paste(setdiff(literature,rownames(datKp)),collapse=", ")))
literature <- intersect(literature,rownames(normDat))

Next, calculate PCS of the VEN-associated cluster using this set of genes, and 100 random sets of genes for comparison.

kpVEN  <- clustersF=="Exc FEZF2 GABRQ"
datVEN <- t(datKp[literature,kpVEN])
datVEN <- log2(datVEN[,colSums(datVEN)>0]+1)
pcVEN  <- prcomp(datVEN, center = TRUE, scale = TRUE)
veVEN  <- (summary(pcVEN))[[6]][2,1]

datRAN <- t(datKp[,kpVEN][rowSums(datKp[,kpVEN])>0,])
veRAN  <- NULL
for (i in 1:100){
  gnRAN <- subsampleCells(rep(0,dim(datRAN)[2]),dim(datVEN)[2],seed=i)
  pcRAN <- prcomp(datRAN[,gnRAN], center = TRUE, scale = TRUE)
  veRAN <- c(veRAN,(summary(pcRAN))[[6]][2,1])
}

hist(veRAN,main=paste("P =",mean(veRAN>veVEN)), xlab="Variance explained by PC1")
abline(v=veVEN,lwd=3,col="green")

pdf("FigS1_PC1varianceExplained.pdf")
hist(veRAN,main=paste("P =",mean(veRAN>veVEN)), xlab="Variance explained by PC1")
abline(v=veVEN,lwd=3,col="green")
dev.off()

The variance explained by PC1 is not significantly different from chance.

Finally, show a TNSE plot of the results.

pdf("FigS1_rtsne_plot.pdf",height=8,width=8)
par(mfrow=c(2,2))
for (p in c(4:7)){
  tsne <- Rtsne(datVEN,perplexity=p)$Y
  plot(tsne,xlab="TSNE 1",ylab="TSNE2",main=paste("Perplexity =",p),pch=19)
}
dev.off()
plot(tsne,xlab="TSNE 1",ylab="TSNE2",main=paste("Perplexity =",p),pch=19)