ReorgWholeBrainAtlas.R

library(Seurat)
#library(SeuratDisk)
library(tidyverse)
library(biomaRt)
#library(dbplyr).    
library(Matrix)
library(stringr) 

GRCh38.p13_GenomeCoords = read.csv("/Users/hannahglover/Dropbox/Columbia/Siletti/GRCh38.p13_GenomeCoords.csv")
PCGenes = subset(GRCh38.p13_GenomeCoords, GRCh38.p13_GenomeCoords$Gene.type %in% "protein_coding")

mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl")) 


BrainRegions = list("Spinal Cord" = "SpC",
                    "Cortex" = c("Cx_AG_LEC", "Cx_APH_MEC", "Cx_CgGC", "Cx_CgGrs", "Cx_CgGr", "Cx_V2", "Cx_Pro", "Cx_FI", "Cx_ReG", "Cx_IFG", "Cx_ITG", "Cx_LiG_V1C", "Cx_LIG_Idg", "Cx_MFG", "Cx_MTG", "Cx_FuGt", "Cx_PaO", "Cx_PRG", "Cx_PoCG", "Cx_PoRG", "Cx_PPh", "Cx_PrCG_M1C", "Cx_RoG", "Cx_Ig", "Cx_SCG", "Cx_SOG", "Cx_STC", "Cx_SMG", "Cx_SPL", "Cx_TP", "Cx_TTG", "PalCx_AON", "PalCx_Pir", "A35"),
                    "Cerebral Nuclei" = c("AMY_BLN_BL", "AMY_BLN_BM", "AMY_BLN_La", "AMY_CEN", "AMY_CMN", "AMY_CMN_CoA", "BF_SEP", "BF_SI", "BN_CaB", "BN_GP_Gpe", "BN_GP_Gpi", "BN_NAC", "BN_Pu", "Cla", "EXA"),
                    "Hippocampus" = c("HiH_HiT", "HiH_CA1", "HiH_CA1_3", "HiH_CA2_3", "HiH_DG_CA4", "HiB_Ca1_2", "HiB_CaA1_CA3", "HiB_CA3", "HiB_DG_CA4", "HiT_CA1_CA3", "HiT_CA4_DGC"),
                    "Hypothalamus" = c("HTH_HTHma", "HTH_HTHma_MN", "HTH_HTHma_HTHtub", "HTH_HTHpo", "HTH_HTHpo_HTHso", "HTH_HTHso", "HTH_HTHso_HTHtub", "HTH_HGHtub"),
                    "Cerebellum" = c("CB_CbDN", "CB_CBV", "CB_CBL"),
                    "Midbrain" = c("M_SN", "M_IC", "M_PAG_DR", "M_PAG", "M_PTR", "M_SN_RN", "M_SC", "M_RN"),
                    "Pons" = c("Pn_PnAN", "PN_PnEN", "PN_XPNTg", "PN_PN", "PN_PnRF", "PN_PnRF_PB"),
                    "Thalamus" = c("THM_ANC", "THM_ILN_PILN_CM_pf", "THM_ILN_PILN_CM", "THM_LNC_VA", "THM_LNC_LP", "THM_LNC_LP_VPL", "THM_LNC_Pul", "THM_LNC_VLN", "THM_LNC_VPL", "THM_MNC_MD", "THM_MLN_MD_Re", "THM_PoN_LG", "THM_PoN_MG", "THM_STH", "ETH"), 
                    "Myelencephalon" = c("Mo_MoAN", "Mo_MoRF_MoEN", "Mo_IO", "Mo_MoSR")
 )


#BrainRegions = list("Hypothalamus" = c("HTH_HTHma", "HTH_HTHma_MN", "HTH_HTHma_HTHtub", "HTH_HTHpo", "HTH_HTHpo_HTHso", "HTH_HTHso", "HTH_HTHso_HTHtub", "HTH_HGHtub"))

for(BR in "Myelencephalon"){#names(BrainRegions)
  CompileCounts = list()
  CompileMeta = list()  
  for(x in BrainRegions[[BR]]){
    Data <- readRDS(paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/", x, ".rds"))
    if(BR %in% c("Cortex", "Cerebral Nuclei")){
    if(BR == "Cerebral Nuclei"){DS = 0.9}else if(BR == "Cortex"){DS = 0.3}
    set.seed(1)
    Data = subset(Data, downsample =dim(Data)[2]*DS)
    }
    Data@meta.data$Sample = x
    Data@meta.data$Barcode = row.names(Data@meta.data)
    CompileMeta[[x]] = Data@meta.data
    
    countsMatrix = Data@assays$RNA@data
    genes <- rownames(countsMatrix)
    dataSeulist <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id", "hgnc_symbol"),values=genes,mart= mart)
    
    #Determine if there are missing genes - we see 34 ensembl IDs.
    missingGenes <- subset(countsMatrix@Dimnames[[1]], ! countsMatrix@Dimnames[[1]] %in% dataSeulist$ensembl_gene_id)
    #cat(paste("Missing ensembl IDs:", paste(missingGenes, collapse = ", ")))
    
    #Are any there any blanks or duplicates?
    dataSeulist <- dataSeulist %>% mutate_all(na_if,"")
    dataSeulist$dups <- duplicated(dataSeulist$hgnc_symbol) | duplicated(dataSeulist$hgnc_symbol, fromLast =T)
    blanks <- subset(dataSeulist, is.na(dataSeulist$hgnc_symbol) == T)
    duplicates <- subset(dataSeulist, dups == T & is.na(dataSeulist$hgnc_symbol) == F)
    #cat(paste("There are", dim(blanks)[1]), "blank gene symbols and",  dim(duplicates)[1], "duplicates")
    dataSeulist <- subset(dataSeulist, ! ensembl_gene_id %in% c(blanks$ensembl_gene_id, duplicates$ensembl_gene_id))
    
    #Subset the counts data
    countsMatrix <- countsMatrix[dataSeulist$ensembl_gene_id, ]
    
    #Reorder the gene symbols to match the counts matrix
    hgncSymbolsOrdered <- dataSeulist$hgnc_symbol[match(rownames(countsMatrix), dataSeulist$ensembl_gene_id)]
    
    #Sanity check - do the first 5 genes match?
    checkGenes <- subset(dataSeulist, hgnc_symbol %in% head(hgncSymbolsOrdered, 5) & ensembl_gene_id %in% head(countsMatrix@Dimnames[[1]], 5))
    if(dim(checkGenes)[1] == 5){print("Genes match!")}else{print("Genes do not match")}
    
    #Set the gene symbols as the new row.names
    row.names(countsMatrix) <- hgncSymbolsOrdered
    
    Duplicates_v2 <- as.data.frame(table(row.names(countsMatrix))) # 40071 888263
    Duplicates_v2 <- subset(Duplicates_v2, Duplicates_v2$Freq == 1)
    
    PCGenesInData = subset(PCGenes, Gene.name %in% Duplicates_v2$Var1)
    countsMatrix <- countsMatrix[PCGenesInData$Gene.name, ]
    CompileCounts[[x]] = countsMatrix
  }

  
  #Compile Metadata
  MetaCols = colnames(CompileMeta[[1]])
  for(z in 1:length(CompileMeta)){
    CompileMeta[[z]] = CompileMeta[[z]] %>% dplyr::select(all_of(MetaCols)) 
  }
  AllMeta <- do.call(rbind, CompileMeta) 
  
  #Rename Metadata
  #Class
  AllMeta$Class = str_to_title(AllMeta$cell_type) 
  AllMeta$Class = gsub(" ", "", AllMeta$Class)
  #Subclass
  AllMeta$Subclass = NA
  AllMeta$supercluster_term = as.character(AllMeta$supercluster_term)
  AllMeta$subcluster_id = as.numeric(AllMeta$subcluster_id)
  AllMeta$roi = as.character(AllMeta$roi)
  
  ##For V1 - Subclasses only
  AllMeta_V1 = AllMeta
  for(z in unique(na.omit(AllMeta_V1$Class))){
    if(z == "Neuron"){
      SubsetNeur = subset(AllMeta_V1, Class == "Neuron")
      SubsetNeurSuperClust = subset(unique(na.omit(SubsetNeur$supercluster_term)), ! unique(na.omit(SubsetNeur$supercluster_term)) %in% c("Splatter", "Miscellaneous"))
      for(y in SubsetNeurSuperClust){
        SelectClass = subset(SubsetNeur, supercluster_term == y)  
        SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
        SubclassTable = as.data.frame(table(SelectClass$subcluster_id))  
        SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
        SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
        SubclassTable$SubclassID = paste(y, SubclassTable$SubclustNo, sep="_")
        SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL 
        AllMeta_V1 = merge(AllMeta_V1, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
        AllMeta_V1$Subclass = ifelse(AllMeta_V1$supercluster_term == y, AllMeta_V1$SubclassID, AllMeta_V1$Subclass) 
        AllMeta_V1$SubclassID = NULL
      }
      
      SelectClass = subset(SubsetNeur, supercluster_term %in% c("Splatter", "Miscellaneous"))  
      SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
      SubclassTable = as.data.frame(table(SelectClass$subcluster_id))  
      SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
      SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
      SubclassTable$SubclassID = paste("Unannotated", SubclassTable$SubclustNo, sep="_")
      SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL 
      AllMeta_V1 = merge(AllMeta_V1, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
      AllMeta_V1$Subclass = ifelse(AllMeta_V1$supercluster_term == c("Splatter", "Miscellaneous"), AllMeta_V1$SubclassID, AllMeta_V1$Subclass)  
      AllMeta_V1$SubclassID = NULL
    }
    
    if(z != "Neuron"){
      SelectClass = subset(AllMeta_V1, Class == z)  
      SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
      SubclassTable = as.data.frame(table(SelectClass$subcluster_id))  
      SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
      SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
      SubclassTable$SubclassID = paste(z, SubclassTable$SubclustNo, sep="_")
      SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL 
      AllMeta_V1 = merge(AllMeta_V1, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
      AllMeta_V1$Subclass = ifelse(AllMeta_V1$Class == z, AllMeta_V1$SubclassID, AllMeta_V1$Subclass)
      AllMeta_V1$SubclassID = NULL
    }}
  
  
  ##For V2 - Subclass for each dissestion
  AllMeta_V2 = AllMeta
  for(w in unique(AllMeta_V2$roi)){
    SubsetROI = subset(AllMeta_V2, roi == w) 

  for(z in unique(na.omit(SubsetROI$Class))){
    if(z == "Neuron"){
      SubsetNeur = subset(SubsetROI, Class == "Neuron")
      SubsetNeurSuperClust = subset(unique(na.omit(SubsetNeur$supercluster_term)), ! unique(na.omit(SubsetNeur$supercluster_term)) %in% c("Splatter", "Miscellaneous"))
      for(y in SubsetNeurSuperClust){
        SelectClass = subset(SubsetNeur, supercluster_term == y)  
        SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
        SubclassTable = as.data.frame(table(SelectClass$subcluster_id))  
        SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
        SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
        SubclassTable$SubclassID = paste(y, SubclassTable$SubclustNo, sep="_")
        SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL 
        AllMeta_V2 = merge(AllMeta_V2, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
        AllMeta_V2$Subclass = ifelse(AllMeta_V2$supercluster_term == y & AllMeta_V2$roi == w, paste(w, AllMeta_V2$SubclassID, sep=" - "), AllMeta_V2$Subclass) 
        AllMeta_V2$SubclassID = NULL
      }
      
      SelectClass = subset(SubsetNeur, supercluster_term %in% c("Splatter", "Miscellaneous"))  
      SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
      SubclassTable = as.data.frame(table(SelectClass$subcluster_id))  
      SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
      SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
      SubclassTable$SubclassID = paste("Unannotated", SubclassTable$SubclustNo, sep="_")
      SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL 
      AllMeta_V2 = merge(AllMeta_V2, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
      AllMeta_V2$Subclass = ifelse(AllMeta_V2$supercluster_term == c("Splatter", "Miscellaneous")& AllMeta_V2$roi == w, paste(w, AllMeta_V2$SubclassID, sep=" - "), AllMeta_V2$Subclass)  
      AllMeta_V2$SubclassID = NULL
    }
    
    if(z != "Neuron"){
      SelectClass = subset(SubsetROI, Class == z)  
      SelectClass$subcluster_id = as.numeric(SelectClass$subcluster_id)
      SubclassTable = as.data.frame(table(SelectClass$subcluster_id))  
      SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
      SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
      SubclassTable$SubclassID = paste(z, SubclassTable$SubclustNo, sep="_")
      SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL 
      AllMeta_V2 = merge(AllMeta_V2, SubclassTable, by.x = "subcluster_id", by.y = "Var1", all = T)
      AllMeta_V2$Subclass = ifelse(AllMeta_V2$Class == z & AllMeta_V2$roi == w, paste(w, AllMeta_V2$SubclassID, sep=" - "), AllMeta_V2$Subclass)
      AllMeta_V2$SubclassID = NULL
    }}
  }
  
  ##
  #Compile Data  
  AllCounts = CompileCounts[[1]]
  if(length(CompileCounts) > 1){
    for(z in 2:(length(CompileCounts))){
      AllCounts  = RowMergeSparseMatrices(AllCounts, CompileCounts[[z]])
    }
  }
  
  #Save original Hypo
  if(BR == "Hypothalamus"){
  saveRDS(AllCounts, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_OriginalDataMatrix_", BR, ".rds"))
  write.csv(AllMeta, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_", BR, "_Original_Metadata.csv"))
  }
  
  #Setup summary data
  SummaryData = as.data.frame(dim(AllCounts)[2])
  colnames(SummaryData) = "Original_N_Cells"
  SummaryData$Original_N_Subclasses = length(unique(AllMeta$Subclass))
  
  #Summarize V1
  MetaTab = as.data.frame(table(AllMeta_V1$Subclass))
  KeepMeta = subset(MetaTab, MetaTab$Freq >=20)
  KeepBarcodes = subset(AllMeta_V1, Subclass %in% KeepMeta$Var1)
  AllCountsV1  <- AllCounts[, KeepBarcodes$Barcode]
  SummaryData$V1_N_Subclasses = length(unique(KeepBarcodes$Subclass))
  SummaryData$V1_N_Cells_Data = dim(AllCounts)[2]
  SummaryData$V1_N_Cells_Metadata = dim(KeepBarcodes)[1]
  SummaryData$Data_V1Metadata_Match = dim(KeepBarcodes)[1] == dim(AllCountsV1)[2]
  write.csv(KeepBarcodes, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_", BR, "_CLEAN_Metadata_V1.csv"))
  saveRDS(AllCounts, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_DataMatrix_", BR, "_CLEAN_V1.rds"))
  
  #Summarize V2
  MetaTab = as.data.frame(table(AllMeta_V2$Subclass))
  KeepMeta = subset(MetaTab, MetaTab$Freq >=20)
  KeepBarcodes = subset(AllMeta_V2, Subclass %in% KeepMeta$Var1)
  AllCountsV2  <- AllCounts[, KeepBarcodes$Barcode]
  SummaryData$V2_N_Subclasses = length(unique(KeepBarcodes$Subclass))
  SummaryData$V2_N_Cells_Data = dim(AllCounts)[2]
  SummaryData$V2_N_Cells_Metadata = dim(KeepBarcodes)[1]
  SummaryData$Data_V2Metadata_Match = dim(KeepBarcodes)[1] == dim(AllCountsV2)[2]
  write.csv(KeepBarcodes, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_", BR, "_CLEAN_Metadata_V2.csv"))
  saveRDS(AllCountsV2, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_DataMatrix_", BR, "_CLEAN_V2.rds"))
  
  
  SummaryData$MetadataSamples = paste(unique(KeepBarcodes$roi), collapse = ", ")
  write.csv(SummaryData, paste0("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_", BR, "_Summary.csv"))
}



########################

HypoSciAdv_Meta = read.csv("/Users/hannahglover/Library/CloudStorage/Box-Box/HG2553 Main Folder/Science Advances/FIG2Meta_OCT23.csv")
HypoSciAdv_Meta$SciAdvSubclass = NA

for(z in unique(na.omit(HypoSciAdv_Meta$H369_Nuclei))){
  SelectClass = subset(HypoSciAdv_Meta, H369_Nuclei == z)  
  SelectClass$H369 = as.numeric(SelectClass$H369)
  SubclassTable = as.data.frame(table(SelectClass$H369))  
  SubclassTable = SubclassTable[order(-SubclassTable$Freq), ]
  SubclassTable$SubclustNo = seq(1, dim(SubclassTable)[1], 1)
  SubclassTable$SubclassID = paste(z, SubclassTable$SubclustNo, sep="_")
  SubclassTable$Freq = NULL; SubclassTable$SubclustNo = NULL 
  HypoSciAdv_Meta = merge(HypoSciAdv_Meta, SubclassTable, by.x = "H369", by.y = "Var1", all = T)
  HypoSciAdv_Meta$SciAdvSubclass = ifelse(HypoSciAdv_Meta$H369_Nuclei == z, HypoSciAdv_Meta$SubclassID, HypoSciAdv_Meta$SciAdvSubclass)
  HypoSciAdv_Meta$SubclassID = NULL
}

HypoSciAdv_KeyMeta = HypoSciAdv_Meta %>% dplyr::select(X, SciAdvSubclass)

HypoOGMeta = read.csv("/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_Hypothalamus_CLEAN_Metadata_V1.csv")
#HypoOGMeta$Barcs2 = gsub("^.*\\.", "", HypoOGMeta$X)

HypoNewMeta = merge(HypoSciAdv_KeyMeta, HypoOGMeta, by.x = "X", by.y = "Barcode", all.y =T)
HypoNewMeta$CombinedSubclass = ifelse(HypoNewMeta$Class %in% "Neuron", HypoNewMeta$SciAdvSubclass, HypoNewMeta$Subclass)


###
MetaTab = as.data.frame(table(HypoNewMeta$CombinedSubclass))
KeepMeta = subset(MetaTab, MetaTab$Freq >=20)
KeepBarcodes = subset(HypoNewMeta, CombinedSubclass %in% KeepMeta$Var1)
KeepBarcodes$Barcs2 = gsub("^.*\\.", "", KeepBarcodes$X)
AllCounts = readRDS(paste0("/Users/hannahglover/Library/CloudStorage/Box-Box/HG2553 Main Folder/Siletti - Whole Brain scRNAseq/Siletti_OriginalDataMatrix_Hypothalamus.rds"))
AllCounts2  <- AllCounts[, KeepBarcodes$Barcs2]


write.csv(KeepBarcodes, paste0("/Users/hannahglover/Library/CloudStorage/Box-Box/HG2553 Main Folder/Siletti - Whole Brain scRNAseq/Siletti_Hypothalamus_HGANNOT_Metadata.csv"))
saveRDS(AllCounts2, "/Users/hannahglover/Dropbox/Columbia/Siletti/Siletti_DataMatrix_Hypothalamus_HGANNOT.rds")
Hannahglover/Glowworm documentation built on Jan. 16, 2024, 11:47 p.m.