knitr::opts_chunk$set(echo = TRUE)
This script performs alignment of deep excitatory neurons in human frontoinsula, human middle temporal gyrus, mouse primary visual cortex, and mouse anterior lateral motor cortex. It also identifies genes with common and distinct patterning across these data sets.
If needed, set the working directory first: e.g. setwd("C:/Users/jeremym/Desktop/VEN_TEST")
.
suppressPackageStartupMessages({ library(dplyr) library(feather) library(gplots) library(Matrix) library(cowplot) library(Seurat) library(pheatmap) library(VENcelltypes) }) options(stringsAsFactors=FALSE)
These files were created in the previous code file, which must be run prior to running this code block. First, let's read in the data from this manuscript (human FI). We will subset these data to only include excitatory neurons.
## Read in the data inputFolder = "FI/" Expr.dat <- feather(paste(inputFolder,"data.feather",sep="")) annoFI <- read_feather(paste(inputFolder,"anno.feather",sep="")) exprData <- as.matrix(Expr.dat[,colnames(Expr.dat)[colnames(Expr.dat)!="sample_id"]]) rownames(exprData) = Expr.dat$sample_id datFI <- t(exprData) datFI <- log2(datFI+1) load(paste(inputFolder,"clusterInfo.rda",sep="")) infoFI <- clusterInfo ## Only include excitatory nuclei clusterType <- annoFI$cluster_type_label datFI <- datFI[,clusterType=="exc"] annoFI <- annoFI[clusterType=="exc",]
Second, let's read in the data from human MTG. We will subset these data to only include excitatory neurons from clusters found in layers 4 and 5, and will subset to a maximum of 50 nuclei per cell type to roughly match the number of nuclei in the two data sets. For compatibility with the original analysis, we set the seed below. We have tried the analysis multiple time and get qualitatively and nearly quantitatively matched results with different cell subsets, although the resolution value in Suerat (below) may need minor adjustments depending on the seed.
## Read in the data mtgFolder <- "MTG/" annoMTG <- read_feather(paste(mtgFolder,"anno.feather",sep="")) Expr.datp <- feather(paste(mtgFolder,"data.feather",sep="")) annoMTG <- annoMTG[match(Expr.datp$sample_id,annoMTG$sample_id),] datMTG <- as.matrix(Expr.datp[,names(Expr.datp)!="sample_id"]) rownames(datMTG) <- annoMTG$sample_id datMTG <- t(datMTG) datMTG <- log2(datMTG+1) ## Set the seed for subsampling for reproducibility seed <- 6 ## Only include excitatory nuclei from clusters found in layers 4 and 5 fracL <- table(annoMTG$cluster_label,annoMTG$brain_subregion_label) cntL5 <- fracL[,"L5"] fracL <- t(t(fracL)*colSums(fracL)) fracL5 <- ceiling(100*fracL[,"L5"]/rowSums(fracL)) kpL5 <- names(fracL5)[pmax(fracL5,cntL5)>10] kpMTG <- is.element(annoMTG$brain_subregion_label,c("L5","L4"))&is.element(annoMTG$cluster_label,kpL5)& subsampleCells(annoMTG$cluster_label,50,seed)&(substr(annoMTG$cluster_label,1,3)=="Exc") # (The above step also subsets to 50 nuclei per cluster) datMTG <- datMTG[,kpMTG] annoMTG <- annoMTG[kpMTG,]
Third, let's read in the data from mouse VISp. For mouse data we will work at the subclass level, which for excitatory types lists the predominant layer of origin and measured projection targets, since these two pieces of information are our primary interest, and since we have a relatively small sample size and the brain regions across species aren't perfectly matched. Within each subclass, we will select a maximum of 100 randomly chosen cells per subclass to roughly match the number of nuclei in the two data sets.
## Read in the data vispFolder<- "VISp/" annoVISp <- read_feather(paste(vispFolder,"anno.feather",sep="")) exprVISp <- feather(paste(vispFolder,"data.feather",sep="")) annoVISp <- annoVISp[match(exprVISp$sample_id,annoVISp$sample_id),] ## Select cells from VISp-specific excitatory subclasses kpVISp <- (annoVISp$class_label=="Glutamatergic")&(annoVISp$cluster_label!="CR Lhx5")& (!grepl("ALM",annoVISp$cluster_label)) # Remove any cells from ALM-predominant clusters kpVISp[kpVISp][!subsampleCells(annoVISp[kpVISp,]$subclass_label,100,seed)] = FALSE annoVISp <- annoVISp[kpVISp,] datVISp <- as.matrix(exprVISp[kpVISp,names(exprVISp)!="sample_id"]) rownames(datVISp) <- annoVISp$sample_id datVISp <- t(datVISp) datVISp <- log2(datVISp+1)
Fourth, let's read and format the data from mouse ALM, as with VISp above.
almFolder <- "ALM/" annoALM <- read_feather(paste(almFolder,"anno.feather",sep="")) exprALM <- feather(paste(almFolder,"data.feather",sep="")) annoALM <- annoALM[match(exprALM$sample_id,annoALM$sample_id),] ## Second ALM-specific excitatory clusters kpALM <- (annoALM$class_label=="Glutamatergic")&(annoALM$cluster_label!="CR Lhx5")& (!grepl("VISp",annoALM$cluster_label)) # Remove any cells from VISp-predominant clusters kpALM[kpALM][!subsampleCells(annoALM[kpALM,]$subclass_label,100,seed)] = FALSE annoALM <- annoALM[kpALM,] datALM <- as.matrix(exprALM[kpALM,names(exprALM)!="sample_id"]) rownames(datALM) <- annoALM$sample_id datALM <- t(datALM) datALM <- log2(datALM+1)
Finally, find the intersection of all mouse and human genes available for analysis. We will exclude sex and mitochondrial genes, as with previous studies.
## Select the genes data(mito_genes) data(sex_genes) excludeGenes <- sort(unique(c(sex_genes,mito_genes))) rownames(datVISp) <- rownames(datALM) <- toupper(rownames(datALM)) kpGenes <- intersect(rownames(datALM),intersect(rownames(datMTG),rownames(datFI))) kpGenes <- sort(setdiff(kpGenes,excludeGenes)) ## Subset the data datVISp <- datVISp[kpGenes,] datALM <- datALM[kpGenes,] datFI <- datFI[kpGenes,] datMTG <- datMTG[kpGenes,]
Data integration has been shown to allow matching of very disparate data sets that measure the same (or comparable things), including previous work from our group on comparing human MTG to mouse VISp and ALM (https://www.biorxiv.org/content/early/2018/08/05/384826). This part of the analysis pipeline directly follows the vignette provided as part of the Satija lab, here:https://satijalab.org/seurat/pancreas_integration_label_transfer.html.
First we need to set up the required data and meta-data files for input into the the analysis.
## Prepare brain.data and brain.metadata (e.g., combine all data sets) brain.data <- cbind(datVISp,datALM,datFI,datMTG) brain.metadata <- data.frame(set=c(rep("mouseVISp",dim(datVISp)[2]), rep("mouseALM",dim(datALM)[2]),rep("humanFI",dim(datFI)[2]),rep("humanMTG",dim(datMTG)[2])), celltype = c(paste("mouse VISp",annoVISp$subclass_label),paste("mouse ALM",annoALM$subclass_label), annoFI$cluster_label,annoMTG$cluster_label)) rownames(brain.metadata) <- colnames(brain.data) ## Construct reference brain <- CreateSeuratObject(brain.data, meta.data = brain.metadata) brain.list <- SplitObject(object = brain, split.by = "set") ## Gene selection using variance stabilizing transformation (vst) for (i in 1:length(x = brain.list)) { brain.list[[i]] <- NormalizeData(object = brain.list[[i]], verbose = FALSE) brain.list[[i]] <- FindVariableFeatures(object = brain.list[[i]], selection.method = "vst", nfeatures = 2000, verbose = FALSE) }
Next, find the integration anchors and run the main integration / alignment analysis, using all the parameters shown in the above vignette. Note: in order for the "RunUMAP" function to work, you need to have Python installed with numpy, which is done outside of R. I used pip install --user numpy==1.15.1
to do this. At the time of writing, the current version of numpy (1.15.4) does not work with RunMAP.
## Set up a reference integrated data set using all four data sets brain.anchors <- FindIntegrationAnchors(object.list = brain.list, dims = 1:30) brain.integrated <- IntegrateData(anchorset = brain.anchors, dims = 1:30) ## Run the main integration analysis DefaultAssay(object = brain.integrated) <- "integrated" brain.integrated <- ScaleData(object = brain.integrated, verbose = FALSE) brain.integrated <- RunPCA(object = brain.integrated, npcs = 30, verbose = FALSE) brain.integrated <- RunUMAP(object = brain.integrated, reduction = "pca", dims = 1:30)
Define clusters on the intergrated data set using Louvain community detection. We use resolution of 0.3 instead of the default 0.6 here.
resolution <- 0.3 FindNeighbors.Seurat <- VENcelltypes::FindNeighbors.Seurat ## REQUIRED FOR CONSISTENT OUTPUT IN WINDOWS AND UNIX brain.integrated <- FindNeighbors(object = brain.integrated, dims = 1:30) brain.integrated <- FindClusters(object = brain.integrated, resolution = resolution) eval(parse(text=paste0("brain.integrated@meta.data$clusterCall <- brain.integrated@meta.data$integrated_snn_res.",resolution)))
Plot the fraction of each cell type in each Seurat cluster. This is used to determine the cluster names moving forward.
compare = table(brain.integrated@meta.data$celltype,brain.integrated@meta.data$clusterCall) compare = round(100*compare/rowSums(compare)) heatmap.2(compare, dendrogram="row", trace="none", Colv=FALSE, margins = c(5, 15), colsep=1:100, rowsep=1:100, RowSideColors=rep("black",48))
Rename clusters accordingly.
cellClasses <- c("3. L5 IT","2. L4* IT","1. L2/3 IT","4. L6 IT","8. NP","7. L6b","5. L5 PT","6. L6 CT") names(cellClasses) <- colnames(compare) brain.integrated@meta.data$className <- cellClasses[brain.integrated@meta.data$clusterCall]
How many genes from each data set are in each class?
table(brain.integrated@meta.data$className,brain.integrated@meta.data$set)
The number of cells from each data set in each type is very close to the expected number based on the sampling, confirming that these are meaningful clusters.
Plot a dendrogram of cell classes.
heat.colors <- colorRampPalette(c("grey99", "orange", "red"))(100) combined.cl <- brain.integrated@meta.data %>% mutate(ref.cl = celltype) ref.cl <- setNames(combined.cl$celltype, rownames(brain.integrated@meta.data)) seurat.cl <- setNames(combined.cl$className, rownames(brain.integrated@meta.data)) comb.cl.anno <- combined.cl %>% group_by(celltype) %>% summarise(set = dplyr::first(set)) ct <- comb.cl.anno$celltype comb.cl.anno <- as.data.frame(comb.cl.anno[,-1]) rownames(comb.cl.anno) <- ct cols <- list(set=scales::hue_pal()(4)) # Colors used in ggplot2 names(cols[["set"]]) <- unique(comb.cl.anno$set) cl.conf <- compareClusterCalls(ref.cl, seurat.cl, comb.cl.anno, heat.colors=heat.colors, annotation_colors = cols, file="Fig3_dendrogramComparison_cortexIntegrated.pdf",height=10,width=5) cl.conf$ph # NOTE, this only displays to the screen if you copy and paste into the console
Plot the data in UMAP space.
p1 <- DimPlot(object = brain.integrated, group.by = "set", reduction = "umap", do.return = TRUE, pt.size = 1) p2 <- DimPlot(object = brain.integrated, group.by = "className", reduction = "umap", do.return = TRUE, pt.size = 1, label=TRUE, label.size = 6) plot_grid(p1, p2) ggsave("Fig3_UMAP_cortexIntegrated.pdf",height=6,width=14)
Data integration only ensures that the cells are matched, but does not say anything about agreement in gene expression. While the method does adjust gene expression levels, in our experience these adjusted levels are often too divorced from the initial values to be of use. We next seek to identify common marker genes across data sets using the log2(CPM+1) data as a starting point.
Identify cluster averages and gene specificity across data sets, using the Seurat cluster calls as the subclass definitions.
## Use cell type calls from Seurat as the subclass labels ctVISp <- brain.integrated@active.ident[colnames(datVISp)] ctALM <- brain.integrated@active.ident[colnames(datALM)] ctFI <- brain.integrated@active.ident[colnames(datFI)] ctMTG <- brain.integrated@active.ident[colnames(datMTG)] ## Calculate trimmed averages across data sets dats <- list(2^datVISp-1,2^datALM-1,2^datFI-1,2^datMTG-1) # CONVERT BACK TO LINEAR SPACE cts <- list(ctVISp,ctALM,ctFI,ctMTG) avg <- list() for (i in 1:4){ avg[[i]] <- do.call("cbind", tapply(names(cts[[i]]), cts[[i]], function(x) apply(dats[[i]][,x],1,mean,trim=0.25))) rownames(avg[[i]]) <- rownames(dats[[i]]) avg[[i]] <- avg[[i]][,names(sort(cellClasses))] colnames(avg[[i]]) <- sort(cellClasses) } ## Calculate a specificity index (lower means expressed in a smaller number of types) spec <- avg[[1]][,1]*0+1 for (i in 1:4){ tmp2 <- avg[[i]]/rowSums(avg[[i]]+0.00000000001) tmp2 <- t(apply(tmp2,1,function(x) cumsum(sort(x)))) tmp2 <- rowSums(tmp2) tmp2[tmp2==0] = 8 spec <- pmax(spec,tmp2) }
Next, find the correlation between data sets and identify genes that are both specific to a small number of subtypes and also correlated across all data sets.
## Calculate correlations across data sets corMin <- avg[[1]][,1]*0+1 for (i in 2:4) for(j in 1:(i-1)){ datTmp <- cbind(avg[[i]],avg[[j]]) corMin <- pmin(corMin,apply(datTmp,1,function(x) cor(x[1:8],x[9:16]))) } corMin[is.na(corMin)]=-1 ## Identify genes that are expressed relatively specifically to a subtype and highly correlated across all data sets plotGn <- names(corMin)[(corMin>0.75)&(tmp2<2.75)]
Plot the common marker genes across all four data sets. These plots are showing the trimmed mean expression (trim=0.25) of each gene in each cluster. This ensures that marker genes are expressed in at least 25% of cells in the "on" cluster in all four species.
## Order the genes and plot the results in all four data sets tmp <- avg[[1]][plotGn,] + avg[[2]][plotGn,] + avg[[3]][plotGn,] + avg[[4]][plotGn,] tmp <- tmp/rowSums(tmp) ord <- order(-apply(tmp,1,which.max)*10,rowMeans(t(apply(tmp,1,cumsum)))) mains <- c("Mouse VISp","Mouse ALM","Human FI","Human MTG") for (i in 1:4){ plotVal <- avg[[i]][plotGn,][ord,] plotVal <- plotVal/apply(plotVal,1,max) heatmap(plotVal,Rowv=NA,Colv=NA, main=mains[i],scale="none"); }
Now plot to the file
pdf("FigS2_commonGenes_heatmap.pdf",height=10,width=10) for (i in 1:4){ plotVal <- avg[[i]][plotGn,][ord,] plotVal <- plotVal/apply(plotVal,1,max) heatmap(plotVal,Rowv=NA,Colv=NA, main=mains[i],scale="none"); } heatmap.2(plotVal/apply(plotVal,1,max),Rowv=NA,Colv=NA, main="Plot for color bar only") dev.off()
In additional to making nice plots, we also would like to know how many (and which) genes are good markers for each subtype in each data set. This will allow us to ask a few relevant questions. Given the morphological diversity of the ET cluster in FI, do we find some comparable measure of transcriptomic diversity? For example, does this type have more marker genes than other types? Is the transcriptional profile particularly different in FI? Is it more distinct as compared with other the other cell types? The next couple of analyses provided different strategies for assessing differential expression between subtypes.
Top marker genes for each type using the built in FindMarkers
function in Seurat.
nSub <- 1000 iters <- 1 numOkay <- 1 tests <- c ("bimod") csMarks <- list() samples <- names(brain.integrated$className) for (ct in levels(brain.integrated)){ for (set in sort(unique(brain.integrated$set))){ mark = NULL for (i in 1:iters){ test.use <- tests[(i %% length(tests))+1] class <- brain.integrated$className class[brain.integrated$set != set] = "no" set.seed(1); subs2 <- nSub kpSam <- samples[subsampleCells(class,subs2,seed = i)&(class!="no")] brTmp <- subset(brain.integrated,cells=kpSam) out <- FindMarkers(brTmp, ident.1 = ct, verbose=0, min.pct = 0.5, only.pos=TRUE, test.use=test.use, assay = "RNA") mark <- c(mark, rownames(out)) } tab <- table(mark) csMarks[[ct]][[set]] <- sort(names(tab)[tab>=numOkay]) } } names(csMarks) <- as.character(cellClasses[levels(brain.integrated)]) csMarks <- csMarks[colnames(avg[[1]])]
out <- matrix(0, nrow=length(colnames(avg[[1]])),ncol=length(mains)) colnames(out) <- sort(unique(brain.integrated$set)) rownames(out) <- colnames(avg[[1]]) for (i in cellClasses) { print(i) print(table(table(c(csMarks[[i]][[1]],csMarks[[i]][[2]],csMarks[[i]][[3]],csMarks[[i]][[4]])))) print("-------------------------") print(" ") for (j in sort(unique(brain.integrated$set))){ out[i,j] <- length(csMarks[[i]][[j]]) } } out
We find that the distinct types in the UMAP have the most distinct markers based on Seurat, which makes sense. By this measure there are more ET-associated genes in MTG than FI. Note that this method is looking at gene expression in ET as compared with the average of all other types and may not be finding the most selective genes.
To account for the differences in scales of these data sets, we will look for genes defined as DEX using ranks rather than absolute expression levels, and we will subsample each subtype to the same number of cells multiple times. As such, this analysis is comparing the difference in ranked average expression values rather than any absolute measure of expression.
ranks <- scales <- list() for (i in 1:4) { scales[[i]] <- avg[[i]]/(apply(avg[[i]],1,mean)+0.00000001) ranks[[i]] <- apply(-scales[[i]],2,rank,ties.method="first") ranks[[i]][ranks[[i]]>7219] <- 7220 # 50% of all genes ranks[[i]] <- 7219-ranks[[i]] } dexs <- list() for (i in 1:4) { dexs[[i]] <- ranks[[i]]*0 for (j in 1:dim(ranks[[i]])[2]){ dexs[[i]][,j] <- ranks[[i]][,j] - apply(ranks[[i]][,-j],1,max) } dexs[[i]] <- pmax(dexs[[i]],0) } mcpm = 5 rdex = 3609.5 # 25% of all genes out <- NULL for (i in 1:4){ out <- rbind(out,colSums((dexs[[i]]>rdex)&(avg[[i]]>=mcpm))) } rownames(out) <- mains mouse_common <- (colSums((((dexs[[1]]>rdex)&(avg[[1]]>=mcpm))+ ((dexs[[2]]>rdex)&(avg[[2]]>=mcpm)))==2)) human_common <- (colSums((((dexs[[3]]>rdex)&(avg[[3]]>=mcpm))+ ((dexs[[4]]>rdex)&(avg[[4]]>=mcpm)))==2)) out <- rbind(out,mouse_common,human_common) out
While the specific counts differ, the same results hold here, with many ET markers in all data sets, and the most in MTG.
Next, let's directly look for markers of the ET subclasses using the method to be described in the next code documents. The details don't matter here other than that it also looks at the proportion of cells expressed in each class.
meanFI <- log2(avg[[3]]+1) meanMTG <- log2(avg[[4]]+1) ## proportions exprThresh <- 1 propFI <- do.call("cbind", tapply(names(ctFI), ctFI, function(x) rowMeans(datFI[,x]>exprThresh))) propMTG <- do.call("cbind", tapply(names(ctMTG), ctMTG, function(x) rowMeans(datMTG[,x]>exprThresh))) colnames(propFI) <- colnames(propMTG) <- cellClasses propFI <- propFI[,colnames(meanFI)] propMTG <- propMTG[,colnames(meanMTG)] # FI dex pti <- 5 wme <- apply(meanFI,1,which.max) fce <- apply(meanFI,1,function(x) diff(range(-sort(-x)[1:2]))) VEe <- apply(meanFI,1,function(x) x[pti]-mean(x[c(1:(pti-1),(pti+1):length(x))])) p2e <- apply(propFI,1,function(x) -sort(-x)[2]) p1e <- apply(propFI,1,max) pce <- p1e-p2e topVENgeneFI <- names(fce)[(fce>1)&(wme==pti)&(pce>0.25)&(p1e>0.5)&(p2e<0.5)] topVENgeneFI <- topVENgeneFI[order(-fce[(topVENgeneFI)])] # MTG dex ptib <- 5 wmeb <- apply(meanMTG,1,which.max) fce <- apply(meanMTG,1,function(x) diff(range(-sort(-x)[1:2]))) VEe <- apply(meanMTG,1,function(x) x[ptib]-mean(x[c(1:(ptib-1),(ptib+1):length(x))])) p2e <- apply(propMTG,1,function(x) -sort(-x)[2]) p1e <- apply(propMTG,1,max) pme <- apply(propMTG,1,mean) pce <- p1e-p2e topVENgeneMTG <- names(fce)[(fce>1)&(wmeb==ptib)&(pce>0.25)&(p1e>0.5)&(p2e<0.5)] topVENgeneMTG <- topVENgeneMTG[order(-fce[(topVENgeneMTG)])] print(paste(length(topVENgeneFI),"FI genes in the ET cluster.")) print(paste(length(topVENgeneMTG),"MTG genes in the ET cluster.")) print(paste(length(intersect(topVENgeneFI,topVENgeneMTG)),"genes in the ET cluster common to both regions."))
We find roughly the same number of genes and the same result here as with the previous two methods.
In addition to identifying common genes, we also want to know how gene expression patterns of different clusters diverge between data sets. One goal of this is to see whether the ET type diverges more betwen data sets, and if so, whether FI (which should contain VENs) shows the biggest gene gain. To do so, we perform Spearman correlation of cluster medians across each pair of data sets. Cell types with lower correlation would have more differential expression. This method allows us to properly control for technical differences in a way that is not possible by directly identifying specific genes.
First, we calculate the subtype medians. To account for the different number of cells in each subtype, we subsample equally from each subtype 10 times and then find the average and range of correlations at the end.
## Use cell type calls from Seurat as the subclass labels ctVISp <- brain.integrated@active.ident[colnames(datVISp)] ctALM <- brain.integrated@active.ident[colnames(datALM)] ctFI <- brain.integrated@active.ident[colnames(datFI)] ctMTG <- brain.integrated@active.ident[colnames(datMTG)] cts <- list(ctVISp,ctALM,ctFI,ctMTG) ## Calculate median expression and marker genes within a data set dats <- list(datVISp,datALM,datFI,datMTG) # log2 space meds <- list() # CLUSTER MEDIANS iter <- 10 subs <- 10 for (seed in 1:iter){ meds[[seed]] <- list() for (i in 1:4){ kp <- subsampleCells(cts[[i]],subs,seed=seed) meds[[seed]][[i]] <- do.call("cbind", tapply(names(cts[[i]])[kp], cts[[i]][kp], function(x) apply(dats[[i]][,kp][,x],1,median))) meds[[seed]][[i]] <- meds[[seed]][[i]][,names(sort(cellClasses))] colnames(meds[[seed]][[i]]) <- as.character(sort(cellClasses)) } }
Next, we calculate and output the average correlations (as well as the range).
## Calculate and output the results for correlation (e.g., differential marker genes) corsL <- list() for (seed in 1:iter){ cors <- rn <- NULL nm <- c("VISp","ALM","FI","MTG") for (i in 2:4) for (j in 1:(i-1)){ kp <- rowSums(pmax(meds[[seed]][[i]],meds[[seed]][[j]]))>=0 rn <- c(rn,paste(nm[i],"|",nm[j])) corTmp <- NULL for (k in 1:dim(meds[[seed]][[i]])[2]){ # kp <- pmax(meds[[seed]][[i]][,k],meds[[seed]][[j]][,k])>0 corTmp <- c(corTmp,cor(meds[[seed]][[i]][kp,k],meds[[seed]][[j]][kp,k],method="s")) } names(corTmp)<-colnames(meds[[seed]][[i]]) cors <- rbind(cors,corTmp) } rownames(cors) <- rn average <- colMeans(cors) cors <- rbind(cors,average) corsL[[seed]] <- cors } corsAvg <- corsMin <- corsMax <- corsL[[1]] for(seed in 2:iter){ corsAvg <- corsAvg + corsL[[seed]] corsMin <- pmin(corsMin, corsL[[seed]]) corsMax <- pmax(corsMax, corsL[[seed]]) } round(100000*corsAvg/length(iter))/1000000 round(1000000*corsMax-1000000*corsMin)/1000000
We find that the ET subtype has the highest correlation between each pair of data sets, suggesting that this type does not diverge dramatically in human FI. This cannot be explained by the transcriptionally distinctness of the ET subtype as the NP subtype (which also has many common marker genes) has the lowest correlation between almost all pairs of sets. The L4* IT type also has low correlation between data sets, consistent with expectations since FI and ALM don't contain a layer 4, while VISp and MTG do. Together these results suggest that the distinctness of ET types is captured transcriptomically more than any species or regional differences due to morphological specialization.
As a complementary method to Seurat we will perform a simple, correlation-based mapping of human FI to mouse VISp and ALM to assess whether human and mouse ET cells align when using a less sophisticated method. The basic idea is to scale the mouse and human data to match, and then compare each human cell to the cluster means of each mouse cell types using the genes most highly enriched or depleted in the mouse ET clusters.
# Calculate means and ordered PT genes in mouse meanALM <- findFromGroups(t(datALM),annoALM$subclass_label,mean) meanVISp <- findFromGroups(t(datVISp),annoVISp$subclass_label,mean) kpGn <- (rowSums(meanALM)>0)&(rowSums(meanVISp)>0)&(rowSums(datFI)>0) inetALM <- meanALM[kpGn,"L5 PT"] - rowMeans(meanALM[kpGn,]) inetVISp <- meanVISp[kpGn,"L5 PT"] - rowMeans(meanVISp[kpGn,]) # Calculate top correlated types using these genes and check how many map to PT for (nGenes in c(75,100,150,250,550)){ gnALM <- names(c(head(sort(inetALM),nGenes),head(sort(-inetALM),nGenes))) corALM <- cor(datFI[gnALM,]/apply(datFI[gnALM,],1,max),meanALM[gnALM,]/apply(meanALM[gnALM,],1,max)) mapALM <- colnames(meanALM)[apply(corALM,1,which.max)] gnVISp <- names(c(head(sort(inetVISp),nGenes),head(sort(-inetVISp),nGenes))) corVISp <- cor(datFI[gnVISp,]/apply(datFI[gnVISp,],1,max),meanVISp[gnVISp,]/apply(meanVISp[gnVISp,],1,max)) mapVISp <- colnames(meanVISp)[apply(corVISp,1,which.max)] print(paste("Using",nGenes,"genes:")) print(table(mapALM=="L5 PT",mapVISp=="L5 PT",annoFI$cluster_label=="Exc FEZF2 GABRQ")) }
Using different numbers of DEX genes, we find that all 23 ET cells are most highly correlated with the mouse PT clusters for both VISp and ALM. A few additional cells from other clusters also map to the ET cluster in ALM, but the mapping in VISp is perfectly consistent with our clustering and with the Seurat mapping. It is important to note that this method is only confirmatory; Seurat (or one of the other related alignment strategies) are more robust for whole data set mapping.
Output session information.
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.