In AllenInstitute/L5_VEN:

knitr::opts_chunk$set(echo = TRUE)

Overview

This script performs alignment of deep excitatory neurons in human frontoinsula, human middle temporal gyrus, mouse primary visual cortex, and mouse anterior lateral motor cortex. It also identifies genes with common and distinct patterning across these data sets.

If needed, set the working directory first: e.g. setwd("C:/Users/jeremym/Desktop/VEN_TEST").

Load the relevant libraries

suppressPackageStartupMessages({
  library(dplyr)
  library(feather)
  library(gplots)
  library(Matrix)
  library(cowplot)
  library(Seurat)
  library(pheatmap)
  library(VENcelltypes)  
})
options(stringsAsFactors=FALSE)

Read in the data and metadata, and subset to match data

These files were created in the previous code file, which must be run prior to running this code block. First, let's read in the data from this manuscript (human FI). We will subset these data to only include excitatory neurons.

## Read in the data
inputFolder = "FI/"
Expr.dat <- feather(paste(inputFolder,"data.feather",sep=""))
annoFI   <- read_feather(paste(inputFolder,"anno.feather",sep="")) 
exprData <- as.matrix(Expr.dat[,colnames(Expr.dat)[colnames(Expr.dat)!="sample_id"]])
rownames(exprData) = Expr.dat$sample_id
datFI    <- t(exprData)
datFI    <- log2(datFI+1)

load(paste(inputFolder,"clusterInfo.rda",sep="")) 
infoFI   <- clusterInfo

## Only include excitatory nuclei
clusterType <- annoFI$cluster_type_label 
datFI       <- datFI[,clusterType=="exc"]
annoFI      <- annoFI[clusterType=="exc",]

Second, let's read in the data from human MTG. We will subset these data to only include excitatory neurons from clusters found in layers 4 and 5, and will subset to a maximum of 50 nuclei per cell type to roughly match the number of nuclei in the two data sets. For compatibility with the original analysis, we set the seed below. We have tried the analysis multiple time and get qualitatively and nearly quantitatively matched results with different cell subsets, although the resolution value in Suerat (below) may need minor adjustments depending on the seed.

## Read in the data
mtgFolder <- "MTG/"
annoMTG   <- read_feather(paste(mtgFolder,"anno.feather",sep="")) 
Expr.datp <- feather(paste(mtgFolder,"data.feather",sep=""))
annoMTG   <- annoMTG[match(Expr.datp$sample_id,annoMTG$sample_id),] 
datMTG    <- as.matrix(Expr.datp[,names(Expr.datp)!="sample_id"])
rownames(datMTG) <- annoMTG$sample_id
datMTG    <- t(datMTG)
datMTG    <- log2(datMTG+1)

## Set the seed for subsampling for reproducibility
seed      <- 6

## Only include excitatory nuclei from clusters found in layers 4 and 5
fracL     <- table(annoMTG$cluster_label,annoMTG$brain_subregion_label) 
cntL5     <- fracL[,"L5"]    
fracL     <- t(t(fracL)*colSums(fracL))
fracL5    <- ceiling(100*fracL[,"L5"]/rowSums(fracL))
kpL5      <- names(fracL5)[pmax(fracL5,cntL5)>10]
kpMTG     <- is.element(annoMTG$brain_subregion_label,c("L5","L4"))&is.element(annoMTG$cluster_label,kpL5)&
           subsampleCells(annoMTG$cluster_label,50,seed)&(substr(annoMTG$cluster_label,1,3)=="Exc")
# (The above step also subsets to 50 nuclei per cluster)

datMTG    <- datMTG[,kpMTG]
annoMTG   <- annoMTG[kpMTG,]

Third, let's read in the data from mouse VISp. For mouse data we will work at the subclass level, which for excitatory types lists the predominant layer of origin and measured projection targets, since these two pieces of information are our primary interest, and since we have a relatively small sample size and the brain regions across species aren't perfectly matched. Within each subclass, we will select a maximum of 100 randomly chosen cells per subclass to roughly match the number of nuclei in the two data sets.

## Read in the data
vispFolder<- "VISp/"  
annoVISp  <- read_feather(paste(vispFolder,"anno.feather",sep="")) 
exprVISp  <- feather(paste(vispFolder,"data.feather",sep=""))
annoVISp  <- annoVISp[match(exprVISp$sample_id,annoVISp$sample_id),] 

## Select cells from VISp-specific excitatory subclasses
kpVISp    <- (annoVISp$class_label=="Glutamatergic")&(annoVISp$cluster_label!="CR Lhx5")&
             (!grepl("ALM",annoVISp$cluster_label))  # Remove any cells from ALM-predominant clusters
kpVISp[kpVISp][!subsampleCells(annoVISp[kpVISp,]$subclass_label,100,seed)] = FALSE

annoVISp  <- annoVISp[kpVISp,]
datVISp   <- as.matrix(exprVISp[kpVISp,names(exprVISp)!="sample_id"])
rownames(datVISp) <- annoVISp$sample_id
datVISp   <- t(datVISp)
datVISp   <- log2(datVISp+1)

Fourth, let's read and format the data from mouse ALM, as with VISp above.

almFolder <- "ALM/"  
annoALM   <- read_feather(paste(almFolder,"anno.feather",sep="")) 
exprALM   <- feather(paste(almFolder,"data.feather",sep=""))
annoALM   <- annoALM[match(exprALM$sample_id,annoALM$sample_id),] 

## Second ALM-specific excitatory clusters
kpALM     <- (annoALM$class_label=="Glutamatergic")&(annoALM$cluster_label!="CR Lhx5")&
             (!grepl("VISp",annoALM$cluster_label))  # Remove any cells from VISp-predominant clusters
kpALM[kpALM][!subsampleCells(annoALM[kpALM,]$subclass_label,100,seed)] = FALSE

annoALM   <- annoALM[kpALM,]
datALM    <- as.matrix(exprALM[kpALM,names(exprALM)!="sample_id"])
rownames(datALM) <- annoALM$sample_id
datALM    <- t(datALM)
datALM    <- log2(datALM+1)

Finally, find the intersection of all mouse and human genes available for analysis. We will exclude sex and mitochondrial genes, as with previous studies.

## Select the genes
data(mito_genes)  
data(sex_genes)  
excludeGenes <- sort(unique(c(sex_genes,mito_genes)))  
rownames(datVISp) <- rownames(datALM) <- toupper(rownames(datALM))
kpGenes <- intersect(rownames(datALM),intersect(rownames(datMTG),rownames(datFI)))
kpGenes <- sort(setdiff(kpGenes,excludeGenes))

## Subset the data
datVISp <- datVISp[kpGenes,]
datALM  <- datALM[kpGenes,] 
datFI   <- datFI[kpGenes,]
datMTG  <- datMTG[kpGenes,]

Integrate mouse and human data sets

Data integration has been shown to allow matching of very disparate data sets that measure the same (or comparable things), including previous work from our group on comparing human MTG to mouse VISp and ALM (https://www.biorxiv.org/content/early/2018/08/05/384826). This part of the analysis pipeline directly follows the vignette provided as part of the Satija lab, here:https://satijalab.org/seurat/pancreas_integration_label_transfer.html.

First we need to set up the required data and meta-data files for input into the the analysis.

## Prepare brain.data and brain.metadata (e.g., combine all data sets)
brain.data     <- cbind(datVISp,datALM,datFI,datMTG)
brain.metadata <- data.frame(set=c(rep("mouseVISp",dim(datVISp)[2]),
 rep("mouseALM",dim(datALM)[2]),rep("humanFI",dim(datFI)[2]),rep("humanMTG",dim(datMTG)[2])),
 celltype = c(paste("mouse VISp",annoVISp$subclass_label),paste("mouse ALM",annoALM$subclass_label),
 annoFI$cluster_label,annoMTG$cluster_label))
rownames(brain.metadata) <- colnames(brain.data)

## Construct reference
brain <- CreateSeuratObject(brain.data, meta.data = brain.metadata)
brain.list <- SplitObject(object = brain, split.by = "set")

## Gene selection using variance stabilizing transformation (vst)
for (i in 1:length(x = brain.list)) {
    brain.list[[i]] <- NormalizeData(object = brain.list[[i]], verbose = FALSE)
    brain.list[[i]] <- FindVariableFeatures(object = brain.list[[i]], 
        selection.method = "vst", nfeatures = 2000, verbose = FALSE)
}

Next, find the integration anchors and run the main integration / alignment analysis, using all the parameters shown in the above vignette. Note: in order for the "RunUMAP" function to work, you need to have Python installed with numpy, which is done outside of R. I used pip install --user numpy==1.15.1 to do this. At the time of writing, the current version of numpy (1.15.4) does not work with RunMAP.

## Set up a reference integrated data set using all four data sets
brain.anchors    <- FindIntegrationAnchors(object.list = brain.list, dims = 1:30)
brain.integrated <- IntegrateData(anchorset = brain.anchors, dims = 1:30)

## Run the main integration analysis
DefaultAssay(object = brain.integrated) <- "integrated"
brain.integrated <- ScaleData(object = brain.integrated, verbose = FALSE)
brain.integrated <- RunPCA(object = brain.integrated, npcs = 30, verbose = FALSE)
brain.integrated <- RunUMAP(object = brain.integrated, reduction = "pca", dims = 1:30)

Define clusters on the intergrated data set using Louvain community detection. We use resolution of 0.3 instead of the default 0.6 here.

resolution       <- 0.3
FindNeighbors.Seurat <- VENcelltypes::FindNeighbors.Seurat  ## REQUIRED FOR CONSISTENT OUTPUT IN WINDOWS AND UNIX
brain.integrated <- FindNeighbors(object = brain.integrated, dims = 1:30)
brain.integrated <- FindClusters(object = brain.integrated, resolution = resolution)
eval(parse(text=paste0("brain.integrated@meta.data$clusterCall <- brain.integrated@meta.data$integrated_snn_res.",resolution)))

Plot the fraction of each cell type in each Seurat cluster. This is used to determine the cluster names moving forward.

compare = table(brain.integrated@meta.data$celltype,brain.integrated@meta.data$clusterCall)
compare = round(100*compare/rowSums(compare))
heatmap.2(compare, dendrogram="row", trace="none", Colv=FALSE, margins = c(5, 15), 
  colsep=1:100, rowsep=1:100, RowSideColors=rep("black",48))

Rename clusters accordingly.

cellClasses <- c("3. L5 IT","2. L4* IT","1. L2/3 IT","4. L6 IT","8. NP","7. L6b","5. L5 PT","6. L6 CT")
names(cellClasses) <- colnames(compare)
brain.integrated@meta.data$className <- cellClasses[brain.integrated@meta.data$clusterCall]

How many genes from each data set are in each class?

table(brain.integrated@meta.data$className,brain.integrated@meta.data$set)

The number of cells from each data set in each type is very close to the expected number based on the sampling, confirming that these are meaningful clusters.

Plot a dendrogram of cell classes.

heat.colors  <- colorRampPalette(c("grey99", "orange", "red"))(100)
combined.cl  <- brain.integrated@meta.data %>% mutate(ref.cl = celltype)
ref.cl       <- setNames(combined.cl$celltype, rownames(brain.integrated@meta.data))
seurat.cl    <- setNames(combined.cl$className, rownames(brain.integrated@meta.data))
comb.cl.anno <- combined.cl %>% group_by(celltype) %>% summarise(set = dplyr::first(set))
ct           <- comb.cl.anno$celltype
comb.cl.anno <- as.data.frame(comb.cl.anno[,-1])
rownames(comb.cl.anno)  <- ct

cols <- list(set=scales::hue_pal()(4))  # Colors used in ggplot2
names(cols[["set"]]) <- unique(comb.cl.anno$set)
cl.conf <- compareClusterCalls(ref.cl, seurat.cl, comb.cl.anno, heat.colors=heat.colors, annotation_colors = cols,
 file="Fig3_dendrogramComparison_cortexIntegrated.pdf",height=10,width=5)

cl.conf$ph # NOTE, this only displays to the screen if you copy and paste into the console

Plot the data in UMAP space.

p1 <- DimPlot(object = brain.integrated, group.by = "set", reduction = "umap", 
      do.return = TRUE, pt.size = 1)
p2 <- DimPlot(object = brain.integrated, group.by = "className",  reduction = "umap", 
      do.return = TRUE, pt.size = 1, label=TRUE, label.size = 6)
plot_grid(p1, p2)
ggsave("Fig3_UMAP_cortexIntegrated.pdf",height=6,width=14)

Identify common marker genes across data sets

Data integration only ensures that the cells are matched, but does not say anything about agreement in gene expression. While the method does adjust gene expression levels, in our experience these adjusted levels are often too divorced from the initial values to be of use. We next seek to identify common marker genes across data sets using the log2(CPM+1) data as a starting point.

Identify cluster averages and gene specificity across data sets, using the Seurat cluster calls as the subclass definitions.

## Use cell type calls from Seurat as the subclass labels
ctVISp <- brain.integrated@active.ident[colnames(datVISp)]
ctALM  <- brain.integrated@active.ident[colnames(datALM)]
ctFI   <- brain.integrated@active.ident[colnames(datFI)]
ctMTG  <- brain.integrated@active.ident[colnames(datMTG)]

## Calculate trimmed averages across data sets
dats <- list(2^datVISp-1,2^datALM-1,2^datFI-1,2^datMTG-1)  # CONVERT BACK TO LINEAR SPACE
cts  <- list(ctVISp,ctALM,ctFI,ctMTG)
avg  <- list()
for (i in 1:4){
  avg[[i]] <- do.call("cbind", tapply(names(cts[[i]]), cts[[i]], function(x) apply(dats[[i]][,x],1,mean,trim=0.25))) 
  rownames(avg[[i]]) <- rownames(dats[[i]])
  avg[[i]] <- avg[[i]][,names(sort(cellClasses))]
  colnames(avg[[i]]) <- sort(cellClasses)
}

## Calculate a specificity index (lower means expressed in a smaller number of types)
spec <- avg[[1]][,1]*0+1
for (i in 1:4){
  tmp2 <- avg[[i]]/rowSums(avg[[i]]+0.00000000001)
  tmp2 <- t(apply(tmp2,1,function(x) cumsum(sort(x))))
  tmp2 <- rowSums(tmp2)
  tmp2[tmp2==0] = 8
  spec <- pmax(spec,tmp2)
}

Next, find the correlation between data sets and identify genes that are both specific to a small number of subtypes and also correlated across all data sets.

## Calculate correlations across data sets
corMin <- avg[[1]][,1]*0+1
for (i in 2:4) for(j in 1:(i-1)){
 datTmp <- cbind(avg[[i]],avg[[j]])
 corMin <- pmin(corMin,apply(datTmp,1,function(x) cor(x[1:8],x[9:16])))
}
corMin[is.na(corMin)]=-1

## Identify genes that are expressed relatively specifically to a subtype and highly correlated across all data sets
plotGn <- names(corMin)[(corMin>0.75)&(tmp2<2.75)]

Plot the common marker genes across all four data sets. These plots are showing the trimmed mean expression (trim=0.25) of each gene in each cluster. This ensures that marker genes are expressed in at least 25% of cells in the "on" cluster in all four species.

## Order the genes and plot the results in all four data sets
tmp <- avg[[1]][plotGn,] + avg[[2]][plotGn,] + avg[[3]][plotGn,] + avg[[4]][plotGn,]
tmp <- tmp/rowSums(tmp)
ord <- order(-apply(tmp,1,which.max)*10,rowMeans(t(apply(tmp,1,cumsum))))
mains <- c("Mouse VISp","Mouse ALM","Human FI","Human MTG")

for (i in 1:4){
 plotVal <- avg[[i]][plotGn,][ord,]
 plotVal <- plotVal/apply(plotVal,1,max)
 heatmap(plotVal,Rowv=NA,Colv=NA, main=mains[i],scale="none");
}

Now plot to the file

pdf("FigS2_commonGenes_heatmap.pdf",height=10,width=10)
for (i in 1:4){
 plotVal <- avg[[i]][plotGn,][ord,]
 plotVal <- plotVal/apply(plotVal,1,max)
 heatmap(plotVal,Rowv=NA,Colv=NA, main=mains[i],scale="none");
}
heatmap.2(plotVal/apply(plotVal,1,max),Rowv=NA,Colv=NA, main="Plot for color bar only")
dev.off()

Alternate strategies of identifying DEX genes in each data set

In additional to making nice plots, we also would like to know how many (and which) genes are good markers for each subtype in each data set. This will allow us to ask a few relevant questions. Given the morphological diversity of the ET cluster in FI, do we find some comparable measure of transcriptomic diversity? For example, does this type have more marker genes than other types? Is the transcriptional profile particularly different in FI? Is it more distinct as compared with other the other cell types? The next couple of analyses provided different strategies for assessing differential expression between subtypes.

Top marker genes for each type using the built in FindMarkers function in Seurat.

nSub     <- 1000
iters    <- 1
numOkay  <- 1
tests    <- c ("bimod")
csMarks  <- list()
samples  <- names(brain.integrated$className)

for (ct in levels(brain.integrated)){
  for (set in sort(unique(brain.integrated$set))){
    mark = NULL
    for (i in 1:iters){
      test.use <- tests[(i %% length(tests))+1]
      class <- brain.integrated$className
      class[brain.integrated$set != set] = "no"
      set.seed(1); 
      subs2 <- nSub
      kpSam <- samples[subsampleCells(class,subs2,seed = i)&(class!="no")]
      brTmp <- subset(brain.integrated,cells=kpSam)
      out   <- FindMarkers(brTmp, ident.1 = ct, verbose=0, min.pct = 0.5, 
                           only.pos=TRUE, test.use=test.use, assay = "RNA")
      mark  <- c(mark, rownames(out))
    }
    tab <- table(mark)
    csMarks[[ct]][[set]] <- sort(names(tab)[tab>=numOkay])
  }
}
names(csMarks) <- as.character(cellClasses[levels(brain.integrated)])
csMarks <- csMarks[colnames(avg[[1]])]

out <- matrix(0, nrow=length(colnames(avg[[1]])),ncol=length(mains))
colnames(out) <- sort(unique(brain.integrated$set))
rownames(out) <- colnames(avg[[1]])

for (i in cellClasses) {
  print(i)
  print(table(table(c(csMarks[[i]][[1]],csMarks[[i]][[2]],csMarks[[i]][[3]],csMarks[[i]][[4]]))))
  print("-------------------------")
  print(" ")
  for (j in sort(unique(brain.integrated$set))){
    out[i,j] <- length(csMarks[[i]][[j]])
  }
}
out

We find that the distinct types in the UMAP have the most distinct markers based on Seurat, which makes sense. By this measure there are more ET-associated genes in MTG than FI. Note that this method is looking at gene expression in ET as compared with the average of all other types and may not be finding the most selective genes.

To account for the differences in scales of these data sets, we will look for genes defined as DEX using ranks rather than absolute expression levels, and we will subsample each subtype to the same number of cells multiple times. As such, this analysis is comparing the difference in ranked average expression values rather than any absolute measure of expression.

ranks <- scales <- list()
for (i in 1:4) {
  scales[[i]] <- avg[[i]]/(apply(avg[[i]],1,mean)+0.00000001)
  ranks[[i]]  <- apply(-scales[[i]],2,rank,ties.method="first")
  ranks[[i]][ranks[[i]]>7219] <- 7220  # 50% of all genes
  ranks[[i]]  <- 7219-ranks[[i]]
}

dexs <- list()
for (i in 1:4) {
  dexs[[i]] <- ranks[[i]]*0
  for (j in 1:dim(ranks[[i]])[2]){
    dexs[[i]][,j] <- ranks[[i]][,j] - apply(ranks[[i]][,-j],1,max)
  }
  dexs[[i]] <- pmax(dexs[[i]],0)
}

mcpm = 5
rdex = 3609.5  # 25% of all genes
out <- NULL
for (i in 1:4){
  out <- rbind(out,colSums((dexs[[i]]>rdex)&(avg[[i]]>=mcpm)))
}
rownames(out) <- mains
mouse_common <- (colSums((((dexs[[1]]>rdex)&(avg[[1]]>=mcpm))+
               ((dexs[[2]]>rdex)&(avg[[2]]>=mcpm)))==2))
human_common <- (colSums((((dexs[[3]]>rdex)&(avg[[3]]>=mcpm))+
               ((dexs[[4]]>rdex)&(avg[[4]]>=mcpm)))==2))
out <- rbind(out,mouse_common,human_common)
out

While the specific counts differ, the same results hold here, with many ET markers in all data sets, and the most in MTG.

Next, let's directly look for markers of the ET subclasses using the method to be described in the next code documents. The details don't matter here other than that it also looks at the proportion of cells expressed in each class.

meanFI   <- log2(avg[[3]]+1)
meanMTG  <- log2(avg[[4]]+1)

## proportions
exprThresh <- 1
propFI   <- do.call("cbind", tapply(names(ctFI), ctFI, function(x) rowMeans(datFI[,x]>exprThresh))) 
propMTG  <- do.call("cbind", tapply(names(ctMTG), ctMTG, function(x) rowMeans(datMTG[,x]>exprThresh))) 
colnames(propFI) <- colnames(propMTG) <- cellClasses
propFI   <- propFI[,colnames(meanFI)]
propMTG  <- propMTG[,colnames(meanMTG)]

# FI dex
pti     <- 5
wme <- apply(meanFI,1,which.max)
fce <- apply(meanFI,1,function(x) diff(range(-sort(-x)[1:2])))
VEe <- apply(meanFI,1,function(x) x[pti]-mean(x[c(1:(pti-1),(pti+1):length(x))]))
p2e <- apply(propFI,1,function(x) -sort(-x)[2])
p1e <- apply(propFI,1,max)
pce <- p1e-p2e

topVENgeneFI <- names(fce)[(fce>1)&(wme==pti)&(pce>0.25)&(p1e>0.5)&(p2e<0.5)]
topVENgeneFI <- topVENgeneFI[order(-fce[(topVENgeneFI)])]

# MTG dex
ptib     <- 5
wmeb <- apply(meanMTG,1,which.max)
fce <- apply(meanMTG,1,function(x) diff(range(-sort(-x)[1:2])))
VEe <- apply(meanMTG,1,function(x) x[ptib]-mean(x[c(1:(ptib-1),(ptib+1):length(x))]))
p2e <- apply(propMTG,1,function(x) -sort(-x)[2])
p1e <- apply(propMTG,1,max)
pme <- apply(propMTG,1,mean)
pce <- p1e-p2e

topVENgeneMTG <- names(fce)[(fce>1)&(wmeb==ptib)&(pce>0.25)&(p1e>0.5)&(p2e<0.5)]
topVENgeneMTG <- topVENgeneMTG[order(-fce[(topVENgeneMTG)])]

print(paste(length(topVENgeneFI),"FI genes in the ET cluster."))
print(paste(length(topVENgeneMTG),"MTG genes in the ET cluster."))
print(paste(length(intersect(topVENgeneFI,topVENgeneMTG)),"genes in the ET cluster common to both regions."))

We find roughly the same number of genes and the same result here as with the previous two methods.

Identify differentially expressed genes between data sets

In addition to identifying common genes, we also want to know how gene expression patterns of different clusters diverge between data sets. One goal of this is to see whether the ET type diverges more betwen data sets, and if so, whether FI (which should contain VENs) shows the biggest gene gain. To do so, we perform Spearman correlation of cluster medians across each pair of data sets. Cell types with lower correlation would have more differential expression. This method allows us to properly control for technical differences in a way that is not possible by directly identifying specific genes.

First, we calculate the subtype medians. To account for the different number of cells in each subtype, we subsample equally from each subtype 10 times and then find the average and range of correlations at the end.

## Use cell type calls from Seurat as the subclass labels
ctVISp <- brain.integrated@active.ident[colnames(datVISp)]
ctALM  <- brain.integrated@active.ident[colnames(datALM)]
ctFI   <- brain.integrated@active.ident[colnames(datFI)]
ctMTG  <- brain.integrated@active.ident[colnames(datMTG)]
cts    <- list(ctVISp,ctALM,ctFI,ctMTG)

## Calculate median expression and marker genes within a data set
dats <- list(datVISp,datALM,datFI,datMTG) # log2 space
meds <- list() # CLUSTER MEDIANS
iter <- 10
subs <- 10

for (seed in 1:iter){
  meds[[seed]] <- list()
  for (i in 1:4){
    kp <- subsampleCells(cts[[i]],subs,seed=seed)
    meds[[seed]][[i]] <- do.call("cbind", tapply(names(cts[[i]])[kp], cts[[i]][kp], function(x) apply(dats[[i]][,kp][,x],1,median))) 
    meds[[seed]][[i]] <- meds[[seed]][[i]][,names(sort(cellClasses))]
    colnames(meds[[seed]][[i]]) <- as.character(sort(cellClasses))
  }
}

Next, we calculate and output the average correlations (as well as the range).

## Calculate and output the results for correlation (e.g., differential marker genes)
corsL <- list()
for (seed in 1:iter){
  cors <- rn <- NULL
  nm <- c("VISp","ALM","FI","MTG")
  for (i in 2:4) for (j in 1:(i-1)){
    kp <- rowSums(pmax(meds[[seed]][[i]],meds[[seed]][[j]]))>=0
    rn <- c(rn,paste(nm[i],"|",nm[j]))
    corTmp <- NULL
    for (k in 1:dim(meds[[seed]][[i]])[2]){
      # kp <- pmax(meds[[seed]][[i]][,k],meds[[seed]][[j]][,k])>0
      corTmp <- c(corTmp,cor(meds[[seed]][[i]][kp,k],meds[[seed]][[j]][kp,k],method="s"))
    }
    names(corTmp)<-colnames(meds[[seed]][[i]])
    cors <- rbind(cors,corTmp)
  }
  rownames(cors) <- rn
  average <- colMeans(cors)
  cors <- rbind(cors,average)
  corsL[[seed]] <- cors
}

corsAvg <- corsMin <- corsMax <- corsL[[1]]
for(seed in 2:iter){
  corsAvg <- corsAvg + corsL[[seed]]
  corsMin <- pmin(corsMin, corsL[[seed]])
  corsMax <- pmax(corsMax, corsL[[seed]])
}

round(100000*corsAvg/length(iter))/1000000
round(1000000*corsMax-1000000*corsMin)/1000000

We find that the ET subtype has the highest correlation between each pair of data sets, suggesting that this type does not diverge dramatically in human FI. This cannot be explained by the transcriptionally distinctness of the ET subtype as the NP subtype (which also has many common marker genes) has the lowest correlation between almost all pairs of sets. The L4* IT type also has low correlation between data sets, consistent with expectations since FI and ALM don't contain a layer 4, while VISp and MTG do. Together these results suggest that the distinctness of ET types is captured transcriptomically more than any species or regional differences due to morphological specialization.

Correlation based mapping of human FI to mouse

As a complementary method to Seurat we will perform a simple, correlation-based mapping of human FI to mouse VISp and ALM to assess whether human and mouse ET cells align when using a less sophisticated method. The basic idea is to scale the mouse and human data to match, and then compare each human cell to the cluster means of each mouse cell types using the genes most highly enriched or depleted in the mouse ET clusters.

# Calculate means and ordered PT genes in mouse
meanALM  <- findFromGroups(t(datALM),annoALM$subclass_label,mean)
meanVISp <- findFromGroups(t(datVISp),annoVISp$subclass_label,mean)
kpGn     <- (rowSums(meanALM)>0)&(rowSums(meanVISp)>0)&(rowSums(datFI)>0)
inetALM  <- meanALM[kpGn,"L5 PT"] - rowMeans(meanALM[kpGn,])
inetVISp <- meanVISp[kpGn,"L5 PT"] - rowMeans(meanVISp[kpGn,])

# Calculate top correlated types using these genes and check how many map to PT
for (nGenes in c(75,100,150,250,550)){
 gnALM    <- names(c(head(sort(inetALM),nGenes),head(sort(-inetALM),nGenes)))
 corALM   <- cor(datFI[gnALM,]/apply(datFI[gnALM,],1,max),meanALM[gnALM,]/apply(meanALM[gnALM,],1,max))
 mapALM   <- colnames(meanALM)[apply(corALM,1,which.max)] 

 gnVISp   <- names(c(head(sort(inetVISp),nGenes),head(sort(-inetVISp),nGenes)))
 corVISp  <- cor(datFI[gnVISp,]/apply(datFI[gnVISp,],1,max),meanVISp[gnVISp,]/apply(meanVISp[gnVISp,],1,max))
 mapVISp  <- colnames(meanVISp)[apply(corVISp,1,which.max)]

 print(paste("Using",nGenes,"genes:"))
 print(table(mapALM=="L5 PT",mapVISp=="L5 PT",annoFI$cluster_label=="Exc FEZF2 GABRQ"))
}

Using different numbers of DEX genes, we find that all 23 ET cells are most highly correlated with the mouse PT clusters for both VISp and ALM. A few additional cells from other clusters also map to the ET cluster in ALM, but the mapping in VISp is perfectly consistent with our clustering and with the Seurat mapping. It is important to note that this method is only confirmatory; Seurat (or one of the other related alignment strategies) are more robust for whole data set mapping.

Output session information.