knitr::opts_chunk$set(echo = TRUE)
This script reads in the data and assigned clusters, organizes the clusters into a dendrogram, renames the clusters to include class information and informative genes, and then plots the summary plots for inclusion in Figure 1.
If needed, set the working directory first: e.g. setwd("C:/Users/jeremym/Desktop/VEN_TEST")
.
suppressPackageStartupMessages({ library(VENcelltypes) library(WGCNA) # For vectorizeMatrix library(edgeR) library(matrixStats) library(feather) library(dendextend) library(ggplot2) library(dplyr) library(gplots) }) options(stringsAsFactors=FALSE)
These files are included as part of the R package installation.
data(FI_layer5_count_data) data(FI_layer5_sample_information) exprData <- FI_layer5_count_data Samp.dat <- FI_layer5_sample_information
First subset the data only only included non-outlier clusters.
clusterType = Samp.dat$cluster_type_label cl = Samp.dat$cluster_id names(cl) = colnames(exprData) includeClas = c("inh","exc","glia") excludeClas = sort(setdiff(clusterType,includeClas)) kpSamp = !is.element(clusterType,excludeClas)
Get median expression per cluster and the proportions.
normDat = log2(cpm(exprData)+1) colnames(normDat) = colnames(exprData) exprThresh = 1 medianExpr = do.call("cbind", tapply(names(cl), cl, function(x) rowMedians(normDat[,x]))) propExpr = do.call("cbind", tapply(names(cl), cl, function(x) rowMeans(normDat[,x]>exprThresh))) rownames(medianExpr) <- rownames(propExpr) <- rownames(normDat) clusterOrd = as.character(sort(unique(Samp.dat$cluster_id))) clusterKp = as.character(sort(unique(Samp.dat$cluster_id[kpSamp]))) medianExpr = medianExpr[,clusterOrd] propExpr = propExpr[,clusterOrd] presentGn = apply(medianExpr[,clusterKp],1,max)>0
Identify uninformative genes to exclude from the cluster names. We also exclude a set of known mitochondrial and ribosomal genes. Note that all genes are use for all marker gene (and other) computations.
shinyFail = (grepl("\\-",rownames(medianExpr)))|is.element(substr(rownames(medianExpr),1,1),0:9) # -'s don't plot correctly, numbers add an "X" excludeGenes = sort(rownames(medianExpr)[grepl("LOC",rownames(medianExpr))|grepl("LINC",rownames(medianExpr))| grepl("FAM",rownames(medianExpr))|grepl("ORF",rownames(medianExpr))|grepl("KIAA",rownames(medianExpr))| grepl("FLJ",rownames(medianExpr))|grepl("DKFZ",rownames(medianExpr))|grepl("RNF",rownames(medianExpr))| grepl("RPS",rownames(medianExpr))|grepl("RPL",rownames(medianExpr))|shinyFail]) # Also exclude ribosomal genes data(mito_genes) excludeGenes = sort(unique(c(excludeGenes,mito_genes))) keepGenes = setdiff(rownames(medianExpr)[presentGn],excludeGenes)
Identify the most specific genes for each cluster for naming.
topGenes <- getTopMarkersByProp(propExpr[keepGenes,clusterKp],1,NULL, medianExpr[keepGenes,clusterKp],fcThresh=0.5,minProp=0.5) topGenes
This is the reference: https://www.biorxiv.org/content/10.1101/384826v1 (UPDATE)
First we need to build the cluster info table
clusterInfoAll = updateAndOrderClusters(Samp.dat,topGenes=topGenes, classLevels=c(includeClas,excludeClas),regionNameColumn=NULL,newColorNameColumn="cluster_color",layerNameColumn=NULL) sampleInfo = as.data.frame(Samp.dat[kpSamp,]) clusterInfo = clusterInfoAll[is.element(clusterInfoAll$cluster_type_label,includeClas),] clusterInfo
Next, we rename the clusters. This involves appending the top specific genes calculated above, with the cluster class type, and some pre-defined markers for cell classes (defined just below).
### IMPORTANT: Genes used for broad class definitions. ### majorGenes = c("GAD2","SLC17A7","C3","AQP4","CSPG4","OPALIN") majorLabels = c("Inh","Exc","Micro","Astro","OPC","Oligo") broadGenes = c("C3","AQP4","CSPG4","OPALIN","LAMP5","VIP","SST","PVALB","LHX6","LINC00507","RORB","THEMIS","FEZF2","CTGF") # Rename the clusters keepGenes2 = c(keepGenes,majorGenes,broadGenes) clusterKp = clusterInfo$cluster_id newCluster = renameClusters(sampleInfo,clusterInfo, propExpr[keepGenes2,clusterKp], medianExpr[keepGenes2,clusterKp], propDiff = 0, propMin=0.4, medianFC = 1, layerNameColumn=NULL, excludeGenes = excludeGenes, majorGenes=majorGenes, majorLabels=majorLabels, broadGenes=broadGenes) # Cluster name clean-up newCluster <- gsub(" "," ",newCluster) # Remove double spaces newCluster <- gsub("VIP VIP","VIP",newCluster) # Don't need same gene twice newCluster <- gsub("SST SST","SST",newCluster) # Don't need same gene twice clusterInfo$cluster_label = newCluster[clusterInfo[,1]] # Add top genes to cluster information clustSplit <- strsplit(clusterInfo$cluster_label," ") clusterInfo$broadGene <- as.character(lapply(clustSplit,function(x) return(x[2]))) clusterInfo$topGene <- as.character(lapply(clustSplit,function(x) return(x[length(x)]))) # Update sample info with new cluster labels clusterId <- clusterInfo$cluster_id names(clusterId) <- clusterLabel <- clusterInfo$cluster_label names(clusterLabel) <- clusterInfo$old_cluster_label sampleInfo$cluster_label = clusterLabel[sampleInfo$cluster_label]
We want the cluster order to match that from the MTG paper. Since cluster order does not affect computation, we will do that here by matching each FI cluster to each MTG cluster by correlating cluster medians, and then setting the order annd color of FI colors to match these matched types. Note that the previously numbered script must be run to covert the MTG data to the appropriate feather files before this section of the code will work.
# Load the cluster info for MTG clusters data("clusterInfoMTG.RData") # Read in and format MTG medians mediansMTG <- read_feather("MTG/medians.feather") mediansMTG <- as.data.frame(mediansMTG) rownames(mediansMTG) <- mediansMTG$gene mediansMTG <- mediansMTG[,colnames(mediansMTG)!="gene"] ord <- match(clusterInfoMTG$cluster_id,colnames(mediansMTG)) colnames(mediansMTG) <- clusterInfoMTG$cluster_label[ord] mediansMTG <- log2(mediansMTG+1) # Format FI medians mediansFI <- medianExpr mediansFI <- mediansFI[,clusterInfo$cluster_id] ord <- match(clusterInfo$cluster_id,colnames(mediansFI)) colnames(mediansFI) <- clusterInfo$cluster_label[ord] # Take all genes differentially expressed across clusters (FC>100) in both data sets as input dif <- 100 geneMTG <- rownames(mediansMTG)[apply(mediansMTG,1,function(x) diff(range(x)))>log2(dif)] geneFI <- rownames(mediansFI)[apply(mediansFI,1,function(x) diff(range(x)))>log2(dif)] geneKp <- sort(intersect(geneMTG,geneFI)) # Assign an MTG cluster to each FI cell based on correlation of DEX genes corFM <- cor(normDat[geneKp,],mediansMTG[geneKp,]) clusterMTGa <- colnames(corFM)[apply(corFM,1,which.max)] clusterMTG <- clusterMTGa[kpSamp] clusterFI <- sampleInfo$cluster_label fclusterMTG <- factor(clusterMTG,levels=intersect(colnames(mediansMTG),clusterMTG)) fclusterFI <- factor(clusterFI,levels=intersect(colnames(mediansFI),clusterFI)) # Match the MTG and FI clusters and plot the results tabMF <- t(table(fclusterMTG,fclusterFI)) ord <- order(-apply(tabMF,1,function(x){ x = x>=(max(x)/1.5); mean(cumsum(x/sum(x))); } )) tabMF <- tabMF[ord,] tabMF <- tabMF/apply(tabMF,1,max) celMF <- round(100*tabMF/apply(tabMF,1,sum)) celMF[tabMF<1] = "" arealMatch <- colnames(tabMF)[apply(tabMF,1,function(x) which.max(x)[1])] names(arealMatch) <- rownames(tabMF) mediansMTG_match <- mediansMTG[,arealMatch] colnames(mediansMTG_match) <- names(arealMatch) # Update cluster info to order and color clusters as with the MTG data set clusterInfo <- clusterInfo[match(rownames(tabMF),clusterInfo$cluster_label),] clusterInfo$lrank <- 1:dim(clusterInfo)[1] clusterInfo$cellmap_label <- colnames(tabMF)[apply(tabMF,1,function(x) which.max(x)[1])] colMTG <- clusterInfoMTG$cluster_color[match(colnames(tabMF),clusterInfoMTG$cluster_label)] colorFI <- apply(tabMF,1, function(x,col) mixColors(col,2,weights=x), colMTG) clusterInfo$cluster_color <- colorFI # Update sample info (cluster colors and cellmap info) # NEED TO CHANGE TO SAMP.DAT ord <- match(sampleInfo$cluster_label,clusterInfo$cluster_label) sampleInfo$cluster_color <- clusterInfo$cluster_color[ord] ord <- match(clusterMTG,clusterInfoMTG$cluster_label) sampleInfo$cellmap_color <- clusterInfoMTG$cluster_color[ord] sampleInfo$cellmap_label <- clusterInfoMTG$cluster_label[ord] sampleInfo$cellmap_id <- clusterInfoMTG$cluster_id[ord]
Plot the comparison between FI and MTG clusters. NOTE: This figure is currently not included in the paper. Instead we compare four data sets together using Seurat in figure 3, and perform a more focused FI vs. MTG comparison in figure 4.
heatmap.2(tabMF,trace="none",dendrogram="none",Rowv=FALSE,Colv=FALSE,margins=c(12,12),cellnote=celMF,notecol="black")
As with the MTG paper, we are only going to consider the top 1200 binary genes by beta for building the tree.
# Determine a score for cell type specificity in non-outlier clusters (marker/beta score) specificityScoreRank <- getBetaScore(propExpr[presentGn,clusterKp],FALSE) topNgenes <- 1200 betaGenes <- names(specificityScoreRank)[specificityScoreRank<=topNgenes] # Build and reorder the dendrogram medianExpr2 <- medianExpr[betaGenes,match(clusterInfo$cluster_id,colnames(medianExpr))] colnames(medianExpr2) <- clusterInfo$cluster_label dend <- getDend(medianExpr2) l.rank <- setNames(as.integer(clusterInfo$lrank), clusterInfo$cluster_label) dend <- reorder.dend(dend,l.rank) dend <- collapse_branch(dend, 0.01) dend <- dend %>% set("leaves_pch", 19) %>% set("leaves_cex", 2)
Plot the result and save it to a pdf file.
main = paste("Dendrogram based on top",topNgenes,"specificity genes, correlation distance") label_color2 = clusterInfo$cluster_color[match(dend %>% labels,clusterInfo$cluster_label)] rankCompleteDendPlot(dend=dend,label_color=label_color2,main=main,node_size=4) pdf("clusterDendrogramFinal.pdf",height=10,width=12) label_color2 = clusterInfo$cluster_color[match(dend %>% labels,clusterInfo$cluster_label)] rankCompleteDendPlot(dend=dend,label_color=label_color2,main=main,node_size=4) dev.off()
The violin and heatmap functions require data to be in a specific feather format, and this section ensures that the outputted data is in the correct format.
## Create a folder for these data newInputFolder = "FI" dir.create(newInputFolder) ## Output the dendrogram save(dend, file=paste0(newInputFolder,"/dend.rda")) ## Format and output the data exprOut <- cpm(exprData) colnames(exprOut) <- Samp.dat$sample_id exprOut <- exprOut[,sampleInfo$sample_id] exprOut <- as.data.frame(t(exprOut)) exprOut$sample_id <- sampleInfo$sample_id write_feather(exprOut,paste0(newInputFolder,"/data.feather")) # CPM ## Format and output the cluster information clusterInfo = clusterInfo[match(labels(dend),clusterInfo$cluster_label),] clusterInfo$cluster_id = 1:dim(clusterInfo)[1] save(clusterInfo,file=paste0(newInputFolder,"/clusterInfo.rda")) ## Format and output the annotation information ord <- match(sampleInfo$cluster_label,clusterInfo$cluster_label) sampleInfo$cluster_id <- clusterInfo$cluster_id[ord] write_feather(sampleInfo,paste0(newInputFolder,"/anno.feather")) ## Format and output a descriptor file for the annotations cn <- colnames(sampleInfo)[grepl("_label",colnames(sampleInfo))] type <- rep("cat",length(cn)) for (i in 1:length(cn)) if(is.numeric(as.data.frame(Samp.dat[,cn[i]])[,1])) type[i] = "num" cn <- gsub("_label","",cn) desc <- data.frame(base=cn,name=cn,type = type) write_feather(desc,paste0(newInputFolder,"/desc.feather"))
all_clusters <- clusterInfo$cluster_id broadGenes <- c("GAD2","LAMP5","VIP","LHX6","SST","PVALB","SLC17A7","LINC00507", "RORB","THEMIS","CTGF","FEZF2","SLC1A3","CSPG4","AQP4","OPALIN","C3") broad_plot <- group_heatmap_plot(data_source = newInputFolder, genes = broadGenes, group_by = "cluster", clusters = all_clusters, calculation = "median", # "trimmed_mean" in MTG paper labelheight = 40, showcounts = F) broad_plot ggsave("Fig1_broad_markers.pdf",broad_plot,height = 4, width = 6)
We need to calculate a set of genes relatively enriched in each cluster. As a first step, we will exclude genes that have wildly high values in off-target cell types.
# Get maximum expression per cluster (note medians and proportions previously calculated) maxExpr <- do.call("cbind", tapply(names(cl), cl, function(x) rowMax(normDat[,x]))) rownames(maxExpr) <- rownames(propExpr) maxExpr <- maxExpr[,colnames(propExpr)] # Only consider genes where no cells have 1.5x higher expression that the highest expression in the most common cluster maxExpr2 <- maxExpr[,clusterKp] for (i in 1:length(maxExpr2[,1])) maxExpr2[i,which.max(propExpr[i,clusterKp])] = maxExpr2[i,which.max(propExpr[i,clusterKp])]*1.5 bestGenes <- apply(maxExpr2,1,which.max)==apply(propExpr[,clusterKp],1,which.max)
Next, find the top marker genes for each excitatory cluster.
kpEx <- as.character(cl[match(clusterInfo$old_cluster_label,Samp.dat$cluster_label)]) kpEx <- intersect(kpEx,cl[clusterType=="exc"]) topGenesExc <- getTopMarkersByProp(propExpr[bestGenes,kpEx],4,NULL,medianExpr[bestGenes,kpEx],fcThresh=0.5,minProp=0.5) topGenesExc <- vectorizeMatrix(t(topGenesExc)) topGenesExc <- formatPlotGenes(topGenesExc) # Format genes to be compatible with plotting function
Next, make the plots!
kp <- clusterInfo$cluster_type_label=="exc" exc_plot <- group_violin_plot(data_source = newInputFolder, clusters=all_clusters[kp], genes=topGenesExc,labelheight = 15) exc_plot
Finally, save the plots!
ggsave("Fig1_exc_violinPlots_specificGenes.pdf", width = 5, height = 15)
In our study we valided expression of several genes in layer 5 of FI. Here is the expression of those genes in excitatory types.
valGenes <- c("SLC17A7","FEZF2","GABRQ","ADRA1A","POU3F1","ITGA4","BMP3","RORB","HTR2C","GABRE","GABRG1","ADRA2A","ALDH1A1") kp <- clusterInfo$cluster_type_label=="exc" val_plot <- group_violin_plot(data_source = newInputFolder, clusters=all_clusters[kp], genes=valGenes,labelheight = 25) val_plot
Save the plots!
ggsave("Fig2_violinPlots_validatedGenes.pdf", width = 5, height = 6)
Several studies have identified genes (or proteins) expressed in VENs using a variety of strategies. Here we will plot the expression of these genes in our data set to get a sense of how specific these genes are to a specific cluster. First, let's see which of these genes are available for plotting in our study.
literature <- c("VAT1L","CHST8","LYPD1","SULF2","BCL11B","FEZF2","SLC18A2","GABRQ","ADRA1A","ATF3","IL4R", "NMB","DISC1","VGF","ABAT","CBLN2","CHRM1","CHRNA4","CNR1","CRYM","DLD","GABBR1","GABRR2", "GABRA","GABRB2","GABRB3","GABRD","GLRB","GLS","GOT1","GOT2","GRIA1","GRIA2","GRIA3","GRIK2", "GRIN1","GRIN2A","GRIN2B","GRIN3A","HRH3","HTR3B","NEFH","NNAT","SCG2","SYT2","TAC1","NPY", "IGFBP2","MEF2C","CPLX1","MBP","PLP1","RNA5SP352","ACA64","ALKBH3","FABP6","PLP1","METRNL", "LINC00982","RNU6-1240P","RGMA","PRR5","HAPLN4","SOHLH1","HTR2B","DRD3","GRP","LMO4") print(paste("Genes not in our data set:",paste(setdiff(literature,rownames(normDat)),collapse=", "))) literature <- intersect(literature,rownames(normDat))
Second, roughly order these genes by their specificity in the VEN cluster (Exc FEZF2 GABRQ).
val2 <- 2*propExpr[literature,match("20",colnames(propExpr))] - apply(propExpr[literature,match(clusterInfo$cluster_id,colnames(propExpr))][,kp],1,quantile,0.75) plotGenes <- names(sort(-(val2)))
Next, make the plots!
kp <- clusterInfo$cluster_type_label=="exc" lit_plot <- group_violin_plot(data_source = newInputFolder, clusters=all_clusters[kp], genes=plotGenes,labelheight = 15) lit_plot
Finally, save the plots!
ggsave("FigS3_violinPlots_literatureVENgenes_excClusters.pdf", width = 5, height = 18)
Next, make the plots!
lit_plot <- group_violin_plot(data_source = newInputFolder, clusters=all_clusters, genes=plotGenes,labelheight = 15) lit_plot
Finally, save the plots!
ggsave("FigS3_violinPlots_literatureVENgenes_allClusters.pdf", width = 7, height = 18)
Output session information.
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.