In lulizou/spASE: Detecting Allele-Specific Expression in Spatial Transcriptomics

This vignette replicates our results for Figure 4 (\textit{Ptgds} analysis)

library(ggplot2)
library(tidyr)
library(GenomicRanges)
library(rtracklayer)
library(reshape2)
library(motifmatchr)
library(GenomicFeatures)
library(BSgenome.Mmusculus.UCSC.mm10)
library(JASPAR2020)
library(TFBSTools)
library(data.table)
library(dplyr)
library(latex2exp)

# convert REL1505 snp data into bed files
# also find peak ranges that overlap SNPs, so that can search for motifs in them
# scATAC data from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6499306/ 
# this file  is not included  in data since it's so big, but it should be
# straightforward to obtain from the GSE accession.
peaks <- read.table('GSE118987_Combined.txt.Called_Peaks.bed',
                    col.names=c('chrom','start','end','peakname'))
peaksr <- makeGRangesFromDataFrame(peaks)
peaksOverlappingSNPs <- c()
rel1505files <- list.files(pattern='rel-1505')
for (f in rel1505files) {
  df <- read.csv(f)
  df <- df %>% filter(X129S1_SvImJ != CAST_EiJ)
  df <- df %>%
    mutate(chrom=paste0('chr',Chr),
           start=Position-1,
           end=Position,
           name=paste(Ref,X129S1_SvImJ,CAST_EiJ,sep='_')) %>%
    dplyr::select(chrom, start, end, name)
  dfgr <- makeGRangesFromDataFrame(df)
  fo <- findOverlaps(peaksr,dfgr)
  if (length(fo)>0) {
    peaksOverlappingSNPs <- c(peaksOverlappingSNPs, unique(fo@from))
  }
  newname <- unlist(strsplit(f,'_'))[3]
  newname <- paste0('rel1505_snps_',newname, '.bed')
  write.table(df, file=newname, quote=F, sep='\t', col.names = F, row.names=F)
}
peaksOverlappingSNPs <- peaksr[sort(peaksOverlappingSNPs)]

# find motifs in peaks of interest
# downloaded all of the JASPAR motifs into 3 separate csv files (there may be
# an easier way to get all of them)
jasp <- read.csv('../inst/extdata/jaspar-mouse-1.csv')
jasp <- bind_rows(jasp, read.csv('../inst/extdata/jaspar-mouse-2.csv'))
jasp <- bind_rows(jasp, read.csv('../inst/extdata/jaspar-mouse-3.csv'))
pfm <- getMatrixByID(JASPAR2020, ID = jasp$ID)
motif_pos <- matchMotifs(pfm, peaksOverlappingSNPs,
                         genome='mm10', out='positions')
reduced <- reduce(unlist(motif_pos))
unlisted <- unlist(motif_pos)
rn <- names(unlisted)
names(unlisted) <- NULL
unlisted <- as.data.frame(unlisted)
unlisted$motif <- rn
unlisted$name <- paste0(unlisted$motif, '_', round(unlisted$score,2))
# the lines below were  for visualizing in IGV
#write.table(unlisted %>% dplyr::select(seqnames, start, end, name), file='allMotifsOverlappingPeaks.bed', sep='\t', quote=F, row.names=F, col.names=F)

# write one out just for PB0044.1
#write.table(data.frame(motif_pos[['PB0044.1']])[,1:3], file='PB0044.1.bed',sep='\t',quote=F,row.names=F,col.names=F)

# load counts matrix
# again this needs to be downloaded from the GSE accession.
atac <- fread('GSE118987_Combined.txt.Counts.matrix')
# get cell annotations
rn <- atac$V1
atac$V1 <- NULL
atac <- as.matrix(atac)
atac[atac>0] <- 1 # binarize the matrix
rownames(atac) <- rn
colnames(atac) <- read.table('GSE118987_Combined.txt.Cluster_Identity.annot') %>% pull(V2)

# Investigate peak near Ptgds peak
near.ptgds.idx <- which(rownames(atac)=='chr2_25478224_25478776')
data.frame(celltype = colnames(atac), counts = atac[near.ptgds.idx,]) %>%
  group_by(celltype) %>%
  summarise(totalcount = sum(counts), avgcount = mean(counts),
            sdcount = sd(counts)) %>%
  mutate(avgcount.low = avgcount-2*sdcount, avgcount.high = avgcount+2*sdcount)  %>%
  ggplot(aes(x = celltype)) +
  geom_point(aes(y = avgcount)) +
  theme_bw() +
  ylab('average of peak counts') +
  xlab('cell type')  +
  ggtitle('scatac')
ggsave('ptgds-scatac-avg.png', width=3, height=3, units='in')

# Investigate Ptgds promoter peak
ptgds.idx <- which(rownames(atac)=='chr2_25469293_25470278')
data.frame(celltype = colnames(atac), counts = atac[ptgds.idx,]) %>%
  group_by(celltype) %>%
  summarise(totalcount = sum(counts), avgcount = mean(counts),
            sdcount = sd(counts)) %>%
  mutate(avgcount.low = avgcount-2*sdcount, avgcount.high = avgcount+2*sdcount)  %>%
  ggplot(aes(x = celltype)) +
  geom_point(aes(y = avgcount)) +
  theme_bw() +
  ylab('average of peak counts') +
  xlab('cell type')  +
  ggtitle('scatac')
ggsave('ptgds-promoter-scatac-avg.png', width=3, height=3, units='in')

# look for correlation between the peaks
# using all the cell types, not just the one in my reference
mypal <- data.frame(celltype = c('Astrocyte', 'CA1', 'CA3', 'Cajal Retzius',
                                 'Dentate', 'Endothelial', 'Micro/Macro',
                                 'Oligodendrocyte', 'Interneuron'),
                    fill = c('#4DAF4A', '#FF7F00', '#377EB8', '#8DA0CB', 
                             '#984EA3', '#B3B3B3', '#E5C494', '#E41A1C',
                             '#A65628'))

cordf <- data.frame(celltype = colnames(atac), ptgds = atac[ptgds.idx,],
           gm35287 = atac[near.ptgds.idx,]) %>%
  group_by(celltype) %>%
  summarise(ptgds = mean(ptgds), gm35287 = mean(gm35287)) %>%
  mutate(celltype = case_when(
    celltype=='AST'~'Astrocyte',
    celltype=='GRN'~'Dentate',
    celltype=='INT'~'Interneuron',
    celltype=='NR1'~'CA1',
    celltype=='NR3'~'CA3',
    celltype=='OLI'~'Oligodendrocyte',
    TRUE~'other'
  )) %>%
  filter(!celltype=='other') %>%
  left_join(mypal, by='celltype')

pearsoncor <- as.character(round(cor(cordf$ptgds, cordf$gm35287),2))

cordf %>%
  ggplot(aes(x = ptgds, y = gm35287, color=fill)) +
  geom_point(size=3) +
  scale_color_identity() +
  annotate('text', x=0.15, y=0.1, label=TeX(sprintf(r'($r^2 = %s$)', pearsoncor))) +
  theme_classic() +
  xlab('Ptgds peak accessibility') +
  ylab('Gm35287 peak')
ggsave('ptgds-gm35287-cor-scatac-avg.png', width=2.2, height=2.2, units='in')

# look at gene expression for cell  types in my reference
#  this reference was downloaded from 
# https://portal.brain-map.org/atlases-and-data/rnaseq/mouse-whole-cortex-and-hippocampus-10x
# the gene expression by cluster, median csv
scrna <- fread('scbrain-rnaseq-medians.csv')
ref <- c('CA1','CA3','DG','Endo','Oligo', 'Astro','PVM','CR')
rn<- scrna$feature
scrna$feature <- NULL
cn <- colnames(scrna)
scrna <- as.matrix(scrna)
colnames(scrna) <- cn
rownames(scrna) <- rn
scrna <- scrna[,which(grepl(paste(ref,collapse='|'), colnames(scrna)))]
dsc <- scrna[c('Ptgds', 'Mtf1'),] %>%
  melt() %>%
  mutate(celltype = case_when(
    grepl('CA1',Var2)~'CA1',
    grepl('CA3',Var2)~'CA3',
    grepl('Oligo',Var2)~'Oligodendrocyte',
    grepl('Astro',Var2)~'Astrocyte',
    grepl('DG',Var2)~'Dentate',
    grepl('Endo',Var2)~'Endothelial',
    grepl('PVM',Var2)~'Micro/Macro',
    grepl('CR',Var2)~'Cajal Retzius',
  )) 
mypal <- data.frame(celltype = c('Astrocyte', 'CA1', 'CA3', 'Cajal Retzius',
                                 'Dentate', 'Endothelial', 'Micro/Macro',
                                 'Oligodendrocyte'),
                    fill = c('#4DAF4A', '#FF7F00', '#377EB8', '#8DA0CB', 
                             '#984EA3', '#B3B3B3', '#E5C494', '#E41A1C'))
dsc <- dsc %>% left_join(mypal)
dsc %>%
  filter(Var1=='Mtf1') %>%
  ggplot(aes(x = Var1, y=value, color=fill)) +
  geom_point(position=position_jitterdodge(jitter.width=0.1)) +
  scale_color_identity() +
  theme_classic() +
  xlab('') +
  ylab('median cluster expression') +
  ggtitle('')
ggsave('mtf1-scrnaseq-med.png', width=2, height=2, units='in')