library(Seurat)
#library(SeuratDisk)
library(tidyverse)
library(biomaRt)
#library(dbplyr).
library(Matrix)
library(stringr)
GRCh38.p13_GenomeCoords = read.csv("/Users/hannahglover/Dropbox/Columbia/Siletti/GRCh38.p13_GenomeCoords.csv")
PCGenes = subset(GRCh38.p13_GenomeCoords, GRCh38.p13_GenomeCoords$Gene.type %in% "protein_coding")
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
BrainRegions = list("Spinal Cord" = "SpC",
"Cortex" = c("Cx_AG_LEC", "Cx_APH_MEC", "Cx_CgGC", "Cx_CgGrs", "Cx_CgGr", "Cx_V2", "Cx_Pro", "Cx_FI", "Cx_ReG", "Cx_IFG", "Cx_ITG", "Cx_LiG_V1C", "Cx_LIG_Idg", "Cx_MFG", "Cx_MTG", "Cx_FuGt", "Cx_PaO", "Cx_PRG", "Cx_PoCG", "Cx_PoRG", "Cx_PPh", "Cx_PrCG_M1C", "Cx_RoG", "Cx_Ig", "Cx_SCG", "Cx_SOG", "Cx_STC", "Cx_SMG", "Cx_SPL", "Cx_TP", "Cx_TTG", "PalCx_AON", "PalCx_Pir", "A35"),
"Cerebral Nuclei" = c("AMY_BLN_BL", "AMY_BLN_BM", "AMY_BLN_La", "AMY_CEN", "AMY_CMN", "AMY_CMN_CoA", "BF_SEP", "BF_SI", "BN_CaB", "BN_GP_Gpe", "BN_GP_Gpi", "BN_NAC", "BN_Pu", "Cla", "EXA"),
"Hippocampus" = c("HiH_HiT", "HiH_CA1", "HiH_CA1_3", "HiH_CA2_3", "HiH_DG_CA4", "HiB_Ca1_2", "HiB_CaA1_CA3", "HiB_CA3", "HiB_DG_CA4", "HiT_CA1_CA3", "HiT_CA4_DGC"),
"Hypothalamus" = c("HTH_HTHma", "HTH_HTHma_MN", "HTH_HTHma_HTHtub", "HTH_HTHpo", "HTH_HTHpo_HTHso", "HTH_HTHso", "HTH_HTHso_HTHtub", "HTH_HGHtub"),
"Cerebellum" = c("CB_CbDN", "CB_CBV", "CB_CBL"),
"Midbrain" = c("M_SN", "M_IC", "M_PAG_DR", "M_PAG", "M_PTR", "M_SN_RN", "M_SC", "M_RN"),
"Pons" = c("Pn_PnAN", "PN_PnEN", "PN_XPNTg", "PN_PN", "PN_PnRF", "PN_PnRF_PB"),
"Thalamus" = c("THM_ANC", "THM_ILN_PILN_CM_pf", "THM_ILN_PILN_CM", "THM_LNC_VA", "THM_LNC_LP", "THM_LNC_LP_VPL", "THM_LNC_Pul", "THM_LNC_VLN", "THM_LNC_VPL", "THM_MNC_MD", "THM_MLN_MD_Re", "THM_PoN_LG", "THM_PoN_MG", "THM_STH", "ETH"),
"Myelencephalon" = c("Mo_MoAN", "Mo_MoRF_MoEN", "Mo_IO", "Mo_MoSR")
)
#BrainRegions = list("Hypothalamus" = c("HTH_HTHma", "HTH_HTHma_MN", "HTH_HTHma_HTHtub", "HTH_HTHpo", "HTH_HTHpo_HTHso", "HTH_HTHso", "HTH_HTHso_HTHtub", "HTH_HGHtub"))
for(BR in "Myelencephalon"){#names(BrainRegions)
CompileCounts = list()
CompileMeta = list()
for(x in BrainRegions[[BR]]){
Data <- readRDS(paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/", x, ".rds"))
if(BR %in% c("Cortex", "Cerebral Nuclei")){
if(BR == "Cerebral Nuclei"){DS = 0.9}else if(BR == "Cortex"){DS = 0.3}
set.seed(1)
Data = subset(Data, downsample =dim(Data)[2]*DS)
}
Data@meta.data$Sample = x
Data@meta.data$Barcode = row.names(Data@meta.data)
CompileMeta[[x]] = Data@meta.data
countsMatrix = Data@assays$RNA@data
genes <- rownames(countsMatrix)
dataSeulist <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id", "hgnc_symbol"),values=genes,mart= mart)
#Determine if there are missing genes - we see 34 ensembl IDs.
missingGenes <- subset(countsMatrix@Dimnames[[1]], ! countsMatrix@Dimnames[[1]] %in% dataSeulist$ensembl_gene_id)
#cat(paste("Missing ensembl IDs:", paste(missingGenes, collapse = ", ")))
#Are any there any blanks or duplicates?
dataSeulist <- dataSeulist %>% mutate_all(na_if,"")
dataSeulist$dups <- duplicated(dataSeulist$hgnc_symbol) | duplicated(dataSeulist$hgnc_symbol, fromLast =T)
blanks <- subset(dataSeulist, is.na(dataSeulist$hgnc_symbol) == T)
duplicates <- subset(dataSeulist, dups == T & is.na(dataSeulist$hgnc_symbol) == F)
#cat(paste("There are", dim(blanks)[1]), "blank gene symbols and", dim(duplicates)[1], "duplicates")
dataSeulist <- subset(dataSeulist, ! ensembl_gene_id %in% c(blanks$ensembl_gene_id, duplicates$ensembl_gene_id))
#Subset the counts data
countsMatrix <- countsMatrix[dataSeulist$ensembl_gene_id, ]
#Reorder the gene symbols to match the counts matrix
hgncSymbolsOrdered <- dataSeulist$hgnc_symbol[match(rownames(countsMatrix), dataSeulist$ensembl_gene_id)]
#Sanity check - do the first 5 genes match?
checkGenes <- subset(dataSeulist, hgnc_symbol %in% head(hgncSymbolsOrdered, 5) & ensembl_gene_id %in% head(countsMatrix@Dimnames[[1]], 5))
if(dim(checkGenes)[1] == 5){print("Genes match!")}else{print("Genes do not match")}
#Set the gene symbols as the new row.names
row.names(countsMatrix) <- hgncSymbolsOrdered
Duplicates_v2 <- as.data.frame(table(row.names(countsMatrix))) # 40071 888263
Duplicates_v2 <- subset(Duplicates_v2, Duplicates_v2$Freq == 1)
PCGenesInData = subset(PCGenes, Gene.name %in% Duplicates_v2$Var1)
countsMatrix <- countsMatrix[PCGenesInData$Gene.name, ]
CompileCounts[[x]] = countsMatrix
}
#Compile Metadata
MetaCols = colnames(CompileMeta[[1]])
for(z in 1:length(CompileMeta)){
CompileMeta[[z]] = CompileMeta[[z]] %>% dplyr::select(all_of(MetaCols))
}
AllMeta <- do.call(rbind, CompileMeta)
#Rename Metadata
#Class
AllMeta$Class = str_to_title(AllMeta$cell_type)
AllMeta$Class = gsub(" ", "", AllMeta$Class)
#Subclass
AllMeta$Subclass = NA
AllMeta$supercluster_term = as.character(AllMeta$supercluster_term)
AllMeta$subcluster_id = as.numeric(AllMeta$subcluster_id)
AllMeta$roi = as.character(AllMeta$roi)
##For V1 - Subclasses only
AllMeta_V1 = AllMeta
for(z in unique(na.omit(AllMeta_V1$Class))){
if(z == "Neuron"){
SubsetNeur = subset(AllMeta_V1, Class == "Neuron")
SubsetNeurSuperClust = subset(unique(na.omit(SubsetNeur$supercluster_term)), ! unique(na.omit(SubsetNeur$supercluster_term)) %in% c("Splatter", "Miscellaneous"))
for(y in SubsetNeurSuperClust){
SelectClass = subset(SubsetNeur, supercluster_term == y)
SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
SubclassTable = as.data.frame(table(SelectClass$subcluster_id))
SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
SubclassTable$SubclassID = paste(y, SubclassTable$SubclustNo, sep="_")
SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL
AllMeta_V1 = merge(AllMeta_V1, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
AllMeta_V1$Subclass = ifelse(AllMeta_V1$supercluster_term == y, AllMeta_V1$SubclassID, AllMeta_V1$Subclass)
AllMeta_V1$SubclassID = NULL
}
SelectClass = subset(SubsetNeur, supercluster_term %in% c("Splatter", "Miscellaneous"))
SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
SubclassTable = as.data.frame(table(SelectClass$subcluster_id))
SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
SubclassTable$SubclassID = paste("Unannotated", SubclassTable$SubclustNo, sep="_")
SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL
AllMeta_V1 = merge(AllMeta_V1, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
AllMeta_V1$Subclass = ifelse(AllMeta_V1$supercluster_term == c("Splatter", "Miscellaneous"), AllMeta_V1$SubclassID, AllMeta_V1$Subclass)
AllMeta_V1$SubclassID = NULL
}
if(z != "Neuron"){
SelectClass = subset(AllMeta_V1, Class == z)
SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
SubclassTable = as.data.frame(table(SelectClass$subcluster_id))
SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
SubclassTable$SubclassID = paste(z, SubclassTable$SubclustNo, sep="_")
SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL
AllMeta_V1 = merge(AllMeta_V1, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
AllMeta_V1$Subclass = ifelse(AllMeta_V1$Class == z, AllMeta_V1$SubclassID, AllMeta_V1$Subclass)
AllMeta_V1$SubclassID = NULL
}}
##For V2 - Subclass for each dissestion
AllMeta_V2 = AllMeta
for(w in unique(AllMeta_V2$roi)){
SubsetROI = subset(AllMeta_V2, roi == w)
for(z in unique(na.omit(SubsetROI$Class))){
if(z == "Neuron"){
SubsetNeur = subset(SubsetROI, Class == "Neuron")
SubsetNeurSuperClust = subset(unique(na.omit(SubsetNeur$supercluster_term)), ! unique(na.omit(SubsetNeur$supercluster_term)) %in% c("Splatter", "Miscellaneous"))
for(y in SubsetNeurSuperClust){
SelectClass = subset(SubsetNeur, supercluster_term == y)
SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
SubclassTable = as.data.frame(table(SelectClass$subcluster_id))
SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
SubclassTable$SubclassID = paste(y, SubclassTable$SubclustNo, sep="_")
SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL
AllMeta_V2 = merge(AllMeta_V2, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
AllMeta_V2$Subclass = ifelse(AllMeta_V2$supercluster_term == y & AllMeta_V2$roi == w, paste(w, AllMeta_V2$SubclassID, sep=" - "), AllMeta_V2$Subclass)
AllMeta_V2$SubclassID = NULL
}
SelectClass = subset(SubsetNeur, supercluster_term %in% c("Splatter", "Miscellaneous"))
SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
SubclassTable = as.data.frame(table(SelectClass$subcluster_id))
SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
SubclassTable$SubclassID = paste("Unannotated", SubclassTable$SubclustNo, sep="_")
SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL
AllMeta_V2 = merge(AllMeta_V2, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
AllMeta_V2$Subclass = ifelse(AllMeta_V2$supercluster_term == c("Splatter", "Miscellaneous")& AllMeta_V2$roi == w, paste(w, AllMeta_V2$SubclassID, sep=" - "), AllMeta_V2$Subclass)
AllMeta_V2$SubclassID = NULL
}
if(z != "Neuron"){
SelectClass = subset(SubsetROI, Class == z)
SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
SubclassTable = as.data.frame(table(SelectClass$subcluster_id))
SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
SubclassTable$SubclassID = paste(z, SubclassTable$SubclustNo, sep="_")
SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL
AllMeta_V2 = merge(AllMeta_V2, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
AllMeta_V2$Subclass = ifelse(AllMeta_V2$Class == z & AllMeta_V2$roi == w, paste(w, AllMeta_V2$SubclassID, sep=" - "), AllMeta_V2$Subclass)
AllMeta_V2$SubclassID = NULL
}}
}
##
#Compile Data
AllCounts = CompileCounts[[1]]
if(length(CompileCounts) > 1){
for(z in 2:(length(CompileCounts))){
AllCounts = RowMergeSparseMatrices(AllCounts, CompileCounts[[z]])
}
}
#Save original Hypo
if(BR == "Hypothalamus"){
saveRDS(AllCounts, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_OriginalDataMatrix_", BR, ".rds"))
write.csv(AllMeta, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_", BR, "_Original_Metadata.csv"))
}
#Setup summary data
SummaryData = as.data.frame(dim(AllCounts)[2])
colnames(SummaryData) = "Original_N_Cells"
SummaryData$Original_N_Subclasses = length(unique(AllMeta$Subclass))
#Summarize V1
MetaTab = as.data.frame(table(AllMeta_V1$Subclass))
KeepMeta = subset(MetaTab, MetaTab$Freq >=20)
KeepBarcodes = subset(AllMeta_V1, Subclass %in% KeepMeta$Var1)
AllCountsV1 <- AllCounts[, KeepBarcodes$Barcode]
SummaryData$V1_N_Subclasses = length(unique(KeepBarcodes$Subclass))
SummaryData$V1_N_Cells_Data = dim(AllCounts)[2]
SummaryData$V1_N_Cells_Metadata = dim(KeepBarcodes)[1]
SummaryData$Data_V1Metadata_Match = dim(KeepBarcodes)[1] == dim(AllCountsV1)[2]
write.csv(KeepBarcodes, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_", BR, "_CLEAN_Metadata_V1.csv"))
saveRDS(AllCounts, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_DataMatrix_", BR, "_CLEAN_V1.rds"))
#Summarize V2
MetaTab = as.data.frame(table(AllMeta_V2$Subclass))
KeepMeta = subset(MetaTab, MetaTab$Freq >=20)
KeepBarcodes = subset(AllMeta_V2, Subclass %in% KeepMeta$Var1)
AllCountsV2 <- AllCounts[, KeepBarcodes$Barcode]
SummaryData$V2_N_Subclasses = length(unique(KeepBarcodes$Subclass))
SummaryData$V2_N_Cells_Data = dim(AllCounts)[2]
SummaryData$V2_N_Cells_Metadata = dim(KeepBarcodes)[1]
SummaryData$Data_V2Metadata_Match = dim(KeepBarcodes)[1] == dim(AllCountsV2)[2]
write.csv(KeepBarcodes, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_", BR, "_CLEAN_Metadata_V2.csv"))
saveRDS(AllCountsV2, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_DataMatrix_", BR, "_CLEAN_V2.rds"))
SummaryData$MetadataSamples = paste(unique(KeepBarcodes$roi), collapse = ", ")
write.csv(SummaryData, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_", BR, "_Summary.csv"))
}
########################
HypoSciAdv_Meta = read.csv("/Users/hannahglover/Library/CloudStorage/Box-Box/HG2553 Main Folder/Science Advances/FIG2Meta_OCT23.csv")
HypoSciAdv_Meta$SciAdvSubclass = NA
for(z in unique(na.omit(HypoSciAdv_Meta$H369_Nuclei))){
SelectClass = subset(HypoSciAdv_Meta, H369_Nuclei == z)
SelectClass$H369 = as.numeric(SelectClass$H369)
SubclassTable = as.data.frame(table(SelectClass$H369))
SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
SubclassTable$SubclassID = paste(z, SubclassTable$SubclustNo, sep="_")
SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL
HypoSciAdv_Meta = merge(HypoSciAdv_Meta, SubclassTable, by.x = "H369", by.y = "Var1", all = T)
HypoSciAdv_Meta$SciAdvSubclass = ifelse(HypoSciAdv_Meta$H369_Nuclei == z, HypoSciAdv_Meta$SubclassID, HypoSciAdv_Meta$SciAdvSubclass)
HypoSciAdv_Meta$SubclassID = NULL
}
HypoSciAdv_KeyMeta = HypoSciAdv_Meta %>% dplyr::select(X, SciAdvSubclass)
HypoOGMeta = read.csv("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_Hypothalamus_CLEAN_Metadata_V1.csv")
#HypoOGMeta$Barcs2 = gsub("^.*\\.", "", HypoOGMeta$X)
HypoNewMeta = merge(HypoSciAdv_KeyMeta, HypoOGMeta, by.x = "X", by.y = "Barcode", all.y =T)
HypoNewMeta$CombinedSubclass = ifelse(HypoNewMeta$Class %in% "Neuron", HypoNewMeta$SciAdvSubclass, HypoNewMeta$Subclass)
###
MetaTab = as.data.frame(table(HypoNewMeta$CombinedSubclass))
KeepMeta = subset(MetaTab, MetaTab$Freq >=20)
KeepBarcodes = subset(HypoNewMeta, CombinedSubclass %in% KeepMeta$Var1)
KeepBarcodes$Barcs2 = gsub("^.*\\.", "", KeepBarcodes$X)
AllCounts = readRDS(paste0("/Users/hannahglover/Library/CloudStorage/Box-Box/HG2553 Main Folder/Siletti - Whole Brain scRNAseq/Siletti_OriginalDataMatrix_Hypothalamus.rds"))
AllCounts2 <- AllCounts[, KeepBarcodes$Barcs2]
write.csv(KeepBarcodes, paste0("/Users/hannahglover/Library/CloudStorage/Box-Box/HG2553 Main Folder/Siletti - Whole Brain scRNAseq/Siletti_Hypothalamus_HGANNOT_Metadata.csv"))
saveRDS(AllCounts2, "/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_DataMatrix_Hypothalamus_HGANNOT.rds")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.