This vignette replicates our results for Figure 4 (\textit{Ptgds} analysis)
library(ggplot2) library(tidyr) library(GenomicRanges) library(rtracklayer) library(reshape2) library(motifmatchr) library(GenomicFeatures) library(BSgenome.Mmusculus.UCSC.mm10) library(JASPAR2020) library(TFBSTools) library(data.table) library(dplyr) library(latex2exp)
# convert REL1505 snp data into bed files # also find peak ranges that overlap SNPs, so that can search for motifs in them # scATAC data from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6499306/ # this file is not included in data since it's so big, but it should be # straightforward to obtain from the GSE accession. peaks <- read.table('GSE118987_Combined.txt.Called_Peaks.bed', col.names=c('chrom','start','end','peakname')) peaksr <- makeGRangesFromDataFrame(peaks) peaksOverlappingSNPs <- c() rel1505files <- list.files(pattern='rel-1505') for (f in rel1505files) { df <- read.csv(f) df <- df %>% filter(X129S1_SvImJ != CAST_EiJ) df <- df %>% mutate(chrom=paste0('chr',Chr), start=Position-1, end=Position, name=paste(Ref,X129S1_SvImJ,CAST_EiJ,sep='_')) %>% dplyr::select(chrom, start, end, name) dfgr <- makeGRangesFromDataFrame(df) fo <- findOverlaps(peaksr,dfgr) if (length(fo)>0) { peaksOverlappingSNPs <- c(peaksOverlappingSNPs, unique(fo@from)) } newname <- unlist(strsplit(f,'_'))[3] newname <- paste0('rel1505_snps_',newname, '.bed') write.table(df, file=newname, quote=F, sep='\t', col.names = F, row.names=F) } peaksOverlappingSNPs <- peaksr[sort(peaksOverlappingSNPs)]
# find motifs in peaks of interest # downloaded all of the JASPAR motifs into 3 separate csv files (there may be # an easier way to get all of them) jasp <- read.csv('../inst/extdata/jaspar-mouse-1.csv') jasp <- bind_rows(jasp, read.csv('../inst/extdata/jaspar-mouse-2.csv')) jasp <- bind_rows(jasp, read.csv('../inst/extdata/jaspar-mouse-3.csv')) pfm <- getMatrixByID(JASPAR2020, ID = jasp$ID) motif_pos <- matchMotifs(pfm, peaksOverlappingSNPs, genome='mm10', out='positions') reduced <- reduce(unlist(motif_pos)) unlisted <- unlist(motif_pos) rn <- names(unlisted) names(unlisted) <- NULL unlisted <- as.data.frame(unlisted) unlisted$motif <- rn unlisted$name <- paste0(unlisted$motif, '_', round(unlisted$score,2)) # the lines below were for visualizing in IGV #write.table(unlisted %>% dplyr::select(seqnames, start, end, name), file='allMotifsOverlappingPeaks.bed', sep='\t', quote=F, row.names=F, col.names=F) # write one out just for PB0044.1 #write.table(data.frame(motif_pos[['PB0044.1']])[,1:3], file='PB0044.1.bed',sep='\t',quote=F,row.names=F,col.names=F)
# load counts matrix # again this needs to be downloaded from the GSE accession. atac <- fread('GSE118987_Combined.txt.Counts.matrix') # get cell annotations rn <- atac$V1 atac$V1 <- NULL atac <- as.matrix(atac) atac[atac>0] <- 1 # binarize the matrix rownames(atac) <- rn colnames(atac) <- read.table('GSE118987_Combined.txt.Cluster_Identity.annot') %>% pull(V2)
# Investigate peak near Ptgds peak near.ptgds.idx <- which(rownames(atac)=='chr2_25478224_25478776') data.frame(celltype = colnames(atac), counts = atac[near.ptgds.idx,]) %>% group_by(celltype) %>% summarise(totalcount = sum(counts), avgcount = mean(counts), sdcount = sd(counts)) %>% mutate(avgcount.low = avgcount-2*sdcount, avgcount.high = avgcount+2*sdcount) %>% ggplot(aes(x = celltype)) + geom_point(aes(y = avgcount)) + theme_bw() + ylab('average of peak counts') + xlab('cell type') + ggtitle('scatac') ggsave('ptgds-scatac-avg.png', width=3, height=3, units='in')
# Investigate Ptgds promoter peak ptgds.idx <- which(rownames(atac)=='chr2_25469293_25470278') data.frame(celltype = colnames(atac), counts = atac[ptgds.idx,]) %>% group_by(celltype) %>% summarise(totalcount = sum(counts), avgcount = mean(counts), sdcount = sd(counts)) %>% mutate(avgcount.low = avgcount-2*sdcount, avgcount.high = avgcount+2*sdcount) %>% ggplot(aes(x = celltype)) + geom_point(aes(y = avgcount)) + theme_bw() + ylab('average of peak counts') + xlab('cell type') + ggtitle('scatac') ggsave('ptgds-promoter-scatac-avg.png', width=3, height=3, units='in')
# look for correlation between the peaks # using all the cell types, not just the one in my reference mypal <- data.frame(celltype = c('Astrocyte', 'CA1', 'CA3', 'Cajal Retzius', 'Dentate', 'Endothelial', 'Micro/Macro', 'Oligodendrocyte', 'Interneuron'), fill = c('#4DAF4A', '#FF7F00', '#377EB8', '#8DA0CB', '#984EA3', '#B3B3B3', '#E5C494', '#E41A1C', '#A65628')) cordf <- data.frame(celltype = colnames(atac), ptgds = atac[ptgds.idx,], gm35287 = atac[near.ptgds.idx,]) %>% group_by(celltype) %>% summarise(ptgds = mean(ptgds), gm35287 = mean(gm35287)) %>% mutate(celltype = case_when( celltype=='AST'~'Astrocyte', celltype=='GRN'~'Dentate', celltype=='INT'~'Interneuron', celltype=='NR1'~'CA1', celltype=='NR3'~'CA3', celltype=='OLI'~'Oligodendrocyte', TRUE~'other' )) %>% filter(!celltype=='other') %>% left_join(mypal, by='celltype') pearsoncor <- as.character(round(cor(cordf$ptgds, cordf$gm35287),2)) cordf %>% ggplot(aes(x = ptgds, y = gm35287, color=fill)) + geom_point(size=3) + scale_color_identity() + annotate('text', x=0.15, y=0.1, label=TeX(sprintf(r'($r^2 = %s$)', pearsoncor))) + theme_classic() + xlab('Ptgds peak accessibility') + ylab('Gm35287 peak') ggsave('ptgds-gm35287-cor-scatac-avg.png', width=2.2, height=2.2, units='in')
# look at gene expression for cell types in my reference # this reference was downloaded from # https://portal.brain-map.org/atlases-and-data/rnaseq/mouse-whole-cortex-and-hippocampus-10x # the gene expression by cluster, median csv scrna <- fread('scbrain-rnaseq-medians.csv') ref <- c('CA1','CA3','DG','Endo','Oligo', 'Astro','PVM','CR') rn<- scrna$feature scrna$feature <- NULL cn <- colnames(scrna) scrna <- as.matrix(scrna) colnames(scrna) <- cn rownames(scrna) <- rn scrna <- scrna[,which(grepl(paste(ref,collapse='|'), colnames(scrna)))] dsc <- scrna[c('Ptgds', 'Mtf1'),] %>% melt() %>% mutate(celltype = case_when( grepl('CA1',Var2)~'CA1', grepl('CA3',Var2)~'CA3', grepl('Oligo',Var2)~'Oligodendrocyte', grepl('Astro',Var2)~'Astrocyte', grepl('DG',Var2)~'Dentate', grepl('Endo',Var2)~'Endothelial', grepl('PVM',Var2)~'Micro/Macro', grepl('CR',Var2)~'Cajal Retzius', )) mypal <- data.frame(celltype = c('Astrocyte', 'CA1', 'CA3', 'Cajal Retzius', 'Dentate', 'Endothelial', 'Micro/Macro', 'Oligodendrocyte'), fill = c('#4DAF4A', '#FF7F00', '#377EB8', '#8DA0CB', '#984EA3', '#B3B3B3', '#E5C494', '#E41A1C')) dsc <- dsc %>% left_join(mypal) dsc %>% filter(Var1=='Mtf1') %>% ggplot(aes(x = Var1, y=value, color=fill)) + geom_point(position=position_jitterdodge(jitter.width=0.1)) + scale_color_identity() + theme_classic() + xlab('') + ylab('median cluster expression') + ggtitle('') ggsave('mtf1-scrnaseq-med.png', width=2, height=2, units='in')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.