run_umap <- function(x,seed = 2020){
return(umap::umap(Matrix::t(x),random_state = seed)$layout)
}
plot_umap <- function(x,labels,title='UMAP',legend.title = 'Cell Type',seed = 2020,shape = NULL,shape.title = NULL){
umap_1 = umap_2 = celltype = NULL
set.seed(seed)
df_umap = data.frame(cbind(x,labels),stringsAsFactors = FALSE)
colnames(df_umap) = c('umap_1','umap_2','celltype')
df_umap$umap_1 = as.numeric(df_umap$umap_1)
df_umap$umap_2 = as.numeric(df_umap$umap_2)
df_umap$celltype <- factor(df_umap$celltype,levels = sort(unique(df_umap$celltype)))
options(repr.plot.width=4, repr.plot.height=4)
p <- ggplot2::ggplot(df_umap, ggplot2::aes(x = umap_1, y = umap_2, color = celltype)) +
ggplot2::ggtitle(title) +
ggplot2::theme_classic() +
ggplot2::xlab('UMAP 1') + ggplot2::ylab('UMAP 2') +
ggplot2::guides(color = ggplot2::guide_legend(title = legend.title)) +
ggplot2::scale_colour_brewer(palette = "Set1")
if(is.null(shape)){
p <- p + ggplot2::geom_point(size = 1,alpha = 0.5)
} else{
p <- p + ggplot2::geom_point(size = 1,alpha = 0.5,ggplot2::aes(shape = shape)) +
ggplot2::guides(shape = ggplot2::guide_legend(title = shape.title)) +
ggplot2::scale_shape_manual(values = c(19,4))
}
return(p)
}
runClustering <- function(sc,feature,clustering,method){
clust <- list()
target <- colData(sc)$Cluster
nClusters = length(unique(target))
if(clustering == 'all'){
clust[['hierarchical']] <- cutree(tree = fastcluster::hclust.vector(t(feature), method="ward", metric="euclidean"),k = nClusters)
clust[['louvain']] <- igraph::cluster_louvain(scran::buildSNNGraph(feature,type="number"))$membership
clust[['kmeans']] <- kmeans(t(feature), centers=nClusters)$cluster
clust[['metrics']] <- rbindlist(list(data.table::data.table(Method = method,Clustering = 'Hierarchical',ARI = mclust::adjustedRandIndex(clust[['hierarchical']],target),AMI = aricode::AMI(clust[['hierarchical']],target)),
data.table::data.table(Method = method,Clustering = 'Kmeans',ARI = mclust::adjustedRandIndex(clust[['kmeans']] ,target),AMI = aricode::AMI(clust[['kmeans']] ,target)),
data.table::data.table(Method = method,Clustering = 'Louvain',ARI = mclust::adjustedRandIndex(clust[['louvain']],target),AMI = aricode::AMI(clust[['louvain']],target))))
} else{
if(clustering == 'hierarchical'){
clust[['hierarchical']] <- cutree(tree = fastcluster::hclust.vector(t(feature), method="ward", metric="euclidean"),k = nClusters)
clust[['louvain']] <- NA
clust[['kmeans']] <- NA
clust[['metrics']] <- rbindlist(list(data.table::data.table(Method = method,Clustering = 'Hierarchical',ARI = mclust::adjustedRandIndex(clust[['hierarchical']],target),AMI = aricode::AMI(clust[['hierarchical']],target))))
}
if(clustering == 'louvain'){
clust[['hierarchical']] <- NA
clust[['louvain']] <- igraph::cluster_louvain(scran::buildSNNGraph(feature,type="number"))$membership
clust[['kmeans']] <- NA
clust[['metrics']] <- rbindlist(list(data.table::data.table(Method = method,Clustering = 'Louvain',ARI = mclust::adjustedRandIndex(clust[['louvain']],target),AMI = aricode::AMI(clust[['louvain']],target))))
}
if(clustering == 'kmeans'){
clust[['hierarchical']] <- NA
clust[['louvain']] <- NA
clust[['kmeans']] <- kmeans(t(feature), centers=nClusters)$cluster
clust[['metrics']] <- rbindlist(list(data.table::data.table(Method = method,Clustering = 'Kmeans',ARI = mclust::adjustedRandIndex(clust[['kmeans']] ,target),AMI = aricode::AMI(clust[['kmeans']] ,target))))
}
}
return(clust)
}
callPeaks <- function(sc,peaks = TRUE){
# This function calls peaks on the aggregated data from sc
# The called peaks are then used in downstream analyses for all methods, including cisTopic, SnapATAC, Cusanovich2018, and Scasat
if(is.logical(peaks)){
if(length(peaks)==1){
if(peaks){
# If peaks == TRUE, then call peaks
object <- epigraHMM::epigraHMMDataSetFromMatrix(countData = as.matrix(Matrix::rowSums(SummarizedExperiment::assay(sc))),
colData = data.frame(condition = 'A',replicate = 1),
rowRanges = SummarizedExperiment::rowRanges(sc))
object <- epigraHMM::epigraHMM(object,control = epigraHMM::controlEM(epsilon.em = c(1e-2,1e-2,1e-6,1e-2)),type = 'consensus',dist = 'nb')
index <- (S4Vectors::metadata(object)$viterbi=='E')
# Aggregate counts into peak regions
return(list('sc' = peakCount(sc,peaks = index),'peaks' = index))
} else{
# If peaks == FALSE, do nothing
return(list('sc' = sc,'peaks' = rep(TRUE,nrow(sc))))
}
} else{
# If peaks == a vector of logicals indicating the bins pertaining to peak regions, then subset
# Aggregate counts into peak regions
return(list('sc' = peakCount(sc,peaks = peaks),'peaks' = peaks))
}
} else{
if(peaks == 'mikado'){
return(diffPeaks(object = sc))
} else{
stop('peaks must be a logical or "mikado"')
}
}
}
peakCount <- function(sc,peaks){
# This function sums reads across bins of each peak for each cell
# The resulting sc is used by all methods for clustering
grSc <- SummarizedExperiment::rowRanges(sc)[peaks]
reducedGrSc <- GenomicRanges::reduce(grSc)
# Prunning extra windows
idx <- IRanges::overlapsAny(grSc,reducedGrSc)
idxBinned <- IRanges::overlapsAny(reducedGrSc,grSc)
grSc <- grSc[idx]
reducedGrSc <- reducedGrSc[idxBinned]
# Checking overlaps
overlaps <- GenomicRanges::findOverlaps(reducedGrSc,grSc)
# Using model matrix to sum reads from cells across peak windows
return(SingleCellExperiment::SingleCellExperiment(assays = list(counts = t(t(SummarizedExperiment::assay(sc[peaks,][idx,])) %*% Matrix::sparse.model.matrix(~0+Peak,data.table::data.table(Peak = as.factor(queryHits(overlaps)))))),
rowRanges = reducedGrSc,
colData = SummarizedExperiment::colData(sc)))
}
filterZeros <- function(mat){
# This functiion is used by all methods that require rows with 0 counts to be excluded, namely Cusanovich2018, and Scasat
# It was written based on the one presented in https://nbviewer.jupyter.org/github/pinellolab/scATAC-benchmarking/blob/master/Real_Data/Buenrostro_2018/run_methods/Scasat/Scasat_buenrostro2018.ipynb?flush_cache=true
# The original cutoff for Scasat was 0.01 (remove the lowest 1 percentile of peaks). However, I will just remove features with 0 counts across all cells.
# This is because we are not working with peak-based counts, but instead window-based counts.
# In our scenario, using a cutoff=0.01 would exclude most of the windows since the data is highly sparse in this way.
index <- which(Matrix::rowSums(mat)==0)
if(length(index)==0){
return(mat)
} else{
return(mat[-index,])
}
}
Scasat_getJaccardDist <- function(cdBinary){
# This functiion is used by Scasat
# It was written based on the one presented in https://nbviewer.jupyter.org/github/pinellolab/scATAC-benchmarking/blob/master/Real_Data/Buenrostro_2018/run_methods/Scasat/Scasat_buenrostro2018.ipynb?flush_cache=true
# The default function computes the Jaccard matrix based on the desne matrix
# I am using locStra::jaccardMatrix because it works with sparse matrices and it is much faster
# To get the same result as prabclus::jaccard, we need to subtract it from 1
# Check: x <- rnorm(100); mat <- cbind(x,x^3,abs(x),x/max(x),exp(x),1/x); mat <- 1*(mat<0.5); (1-locStra::jaccardMatrix(mat))==prabclus::jaccard(mat)
return(1-locStra::jaccardMatrix(1*(cdBinary>0)))
}
errorCatch <- function(cond,true,peaks,method){
return(list('true' = true,
'peaks' = peaks,
'feature' = NA,
'clustering' = list('hierarchical' = NA,'louvain' = NA,'kmeans' = NA,'metrics' = data.table(Method = method, Clustering = NA, ARI = NA, AMI = NA)),
'time' = NA,
'umap' = NA))
}
pseudoBulk <- function(sc,group,type){
# This function calculate pseudobulk on rows or columns
if(type == 'rows'){
if(!length(group)==nrow(sc)){stop('dimensions not compatible')}
mat <- t(t(SummarizedExperiment::assay(sc)) %*% Matrix::sparse.model.matrix(~0+group))
} else{
if(type == 'cols'){
if(!length(group)==ncol(sc)){stop('dimensions not compatible')}
mat <- SummarizedExperiment::assay(sc) %*% Matrix::sparse.model.matrix(~0+group)
} else{
stop('type not valid')
}
}
return(mat)
}
pseudoPlot <- function(sc,mat,ranges,normalize = FALSE,bulkOnly = FALSE,ptssize = 1,shuffle = F){
# THis function plots counts from bulk data (sc) and compares it to the pseudobulk (mat) for a given region (ranges)
Data = Start = Counts = Cell = Cluster = NewCell = V1 = NULL
index <- overlapsAny(sc,ranges)
str <- ranges(SummarizedExperiment::rowRanges(sc[index,]))@start
if(normalize){
newsc <- epigraHMM::createOffset(SummarizedExperiment::SummarizedExperiment(assays = list(counts = cbind(rowSums(SummarizedExperiment::assay(sc)[index,]),
mat[index,])),
rowRanges = SummarizedExperiment::rowRanges(sc[index,]),
colData = data.frame(Data = c('Bulk',paste0('Pseudobulk',1:ncol(mat))))),type = 'loess',span = 1)
SummarizedExperiment::assay(newsc,'counts') <- SummarizedExperiment::assay(newsc,'counts')/exp(SummarizedExperiment::assay(newsc,'offset'))
} else{
newsc <- SummarizedExperiment::SummarizedExperiment(assays = list(counts = cbind(rowSums(SummarizedExperiment::assay(sc)[index,]),
mat[index,])),
rowRanges = SummarizedExperiment::rowRanges(sc[index,]),
colData = data.frame(Data = c('Bulk',paste0('Pseudobulk',1:ncol(mat)))))
}
dt <- rbindlist(lapply(seq_len(ncol(newsc)),function(x){
data.table(Data = colData(newsc)$Data[x],Counts = SummarizedExperiment::assay(newsc,'counts')[,x], Start = str)
}))
if(bulkOnly){
fig <- ggplot2::ggplot(data = dt[Data == 'Bulk',],ggplot2::aes(x = Start,y = Counts))+
ggplot2::facet_grid(rows = ggplot2::vars(Data))+
ggplot2::geom_line()+
ggplot2::theme_bw()+
ggplot2::ylab(ifelse(normalize,'Normalized Counts','Counts'))+
ggplot2::xlab(paste0('Genomic Window (',seqnames(ranges),')'))
} else{
fig1 <- ggplot2::ggplot(data = dt[Data == 'Bulk',],ggplot2::aes(x = Start,y = Counts))+
ggplot2::facet_grid(rows = ggplot2::vars(Data))+
ggplot2::geom_line()+
ggplot2::theme_bw()+
ggplot2::theme(axis.text = ggplot2::element_text(size = 6),axis.title = ggplot2::element_text(size = 6),
panel.grid.major = ggplot2::element_blank(),panel.grid.minor = ggplot2::element_blank(),strip.text = ggplot2::element_text(size = 6))+
ggplot2::scale_x_continuous(labels = scales::scientific)+
ggplot2::ylab(ifelse(normalize,'Normalized Counts','Counts'))+
ggplot2::xlab(paste0('Genomic Window (',seqnames(ranges),')'))
fig2 <- ggplot2::ggplot(data = dt[!(Data == 'Bulk'),],ggplot2::aes(x = Start,y = Counts))+
ggplot2::facet_grid(rows = ggplot2::vars(Data))+
ggplot2::geom_line()+
ggplot2::theme_bw()+
ggplot2::theme(axis.text = ggplot2::element_text(size = 6),axis.title = ggplot2::element_text(size = 6),
panel.grid.major = ggplot2::element_blank(),panel.grid.minor = ggplot2::element_blank(),strip.text = ggplot2::element_text(size = 6))+
ggplot2::scale_x_continuous(labels = scales::scientific)+
ggplot2::ylab(ifelse(normalize,'Normalized Counts','Counts'))+
ggplot2::xlab(paste0('Genomic Window (',seqnames(ranges),')'))
subsc <- sc[overlapsAny(sc,ranges),]
subdt <- rbindlist(lapply(unique(sc$Cluster),function(x){
ctd <- data.table(Cluster = paste0('Cluster',x),Cell = seq_len(sum(colData(subsc)$Cluster==x)),Counts = c(as.matrix(SummarizedExperiment::assay(subsc[,colData(subsc)$Cluster==x],'counts'))), Start = str)
if(shuffle){
newctd <- ctd[,sum(Counts),by='Cell'][order(V1),][,NewCell := 1:.N][,-'V1']
ctd <- merge(ctd,newctd,by = 'Cell',all.x = T)[,Cell := NewCell][,NewCell := NULL]
}
ctd <- ctd[Counts>0,][,Counts := ifelse(Counts==1,'1',ifelse(Counts==2,'2','>2'))]
ctd$Counts <- factor(ctd$Counts,levels = c('1','2','>2'))
return(ctd)
}))
fig3 <- ggplot2::ggplot(data = subdt,ggplot2::aes(y = Cell,x = Start, color= Counts))+
ggplot2::facet_grid(rows = ggplot2::vars(Cluster),scales = 'free_y')+
ggplot2::geom_point(alpha = 0.3,shape=20,size = ptssize)+
ggplot2::theme_bw()+
ggplot2::theme(panel.grid.major = ggplot2::element_blank(),
panel.grid.minor = ggplot2::element_blank(),
legend.position = 'top',legend.direction = 'horizontal',
#legend.box.background = ggplot2::element_rect(colour = "black"),
legend.title = ggplot2::element_text(size = 8),
legend.text = ggplot2::element_text(size = 8),
axis.text = ggplot2::element_text(size = 6),axis.title = ggplot2::element_text(size = 6),strip.text = ggplot2::element_text(size = 6),
legend.key.size = ggplot2::unit(0.1, "lines"))+
ggplot2::scale_color_manual(values = c('black','red','purple'))+
ggplot2::scale_x_continuous(labels = scales::scientific)+
ggplot2::xlab(paste0('Genomic Window (',seqnames(ranges),')'))+
ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 2,alpha = 1)))
fig <- ggpubr::ggarrange(fig1,fig2,fig3,nrow = 1,ncol = 3,widths = c(1.875,1.875,2.25)/6,common.legend = T)
}
return(fig)
}
range01 <- function(x){
x <- ifelse(x<0,0,x)
x <- ifelse(x>1,1,x)
return(x)
}
generateScenarios <- function(type){
if(type == 'scATACseq'){
scenarios <- expand.grid(Data=c('scATACseq'),Groups=c(1,2),
Cells=c(1,2,3),Rarity=c(1,2),Noise=c(1,2),
Depth=c(1,2,3),Sim = 100)
scenarios$Label <- paste(scenarios[['Data']],paste0('Groups',scenarios[['Groups']]),
paste0('Cells',scenarios[['Cells']]),paste0('Rarity',scenarios[['Rarity']]),
paste0('Noise',scenarios[['Noise']]),paste0('Depth',scenarios[['Depth']]),sep = '_')
} else{
if(type == 'scChIPseq'){
scenarios <- expand.grid(Data=c('scChIPseq'),Groups=c(1,2),
Cells=c(1,2,3),Rarity=c(1,2),Noise=c(1), # scChIPseq is already pretty noise, no need to try other levels
Depth=c(1,2,3),Dissimilarity=c(1,2,3),Sim = 100)
scenarios$Label <- paste(scenarios[['Data']],paste0('Groups',scenarios[['Groups']]),paste0('Diss',scenarios[['Dissimilarity']]),
paste0('Cells',scenarios[['Cells']]),paste0('Rarity',scenarios[['Rarity']]),
paste0('Noise',scenarios[['Noise']]),paste0('Depth',scenarios[['Depth']]),sep = '_')
} else{
stop('Type is not recognized')
}
}
return(scenarios)
}
annotation_compass <- function(label,
position = c('N','NE','E','SE','S','SW','W','NW'),
padding = grid::unit(c(0.5,0.5),"line"),fontsize = 12,...){
position <- match.arg(position)
x <- switch(EXPR = position,N = 0.5,NE = 1,E = 1,SE = 1,S = 0.5,SW = 0,W = 0,NW = 0)
y <- switch(EXPR = position,N = 1,NE = 1,E = 0.5,SE = 0,S = 0,SW = 0,W = 0.5,NW = 1)
hjust <- switch(EXPR = position,N = 0.5,NE = 1,E = 1,SE = 1,S = 0.5,SW = 0,W = 0,NW = 0)
vjust <- switch(EXPR = position,N = 1,NE = 1,E = 0.5,SE = 0,S = 0,SW = 0,W = 0.5,NW = 1)
f1 <- switch(EXPR = position,N = 0,NE = -1,E = -1,SE = -1,S = 0,SW = 1,W = 1,NW = 1)
f2 <- switch(EXPR = position,N = -1,NE = -1,E = 0,SE = 1,S = 1,SW = 1,W = 0,NW = -1)
ggplot2::annotation_custom(grid::textGrob(label,gp = grid::gpar(col = "black", fontsize = fontsize),
x=grid::unit(x,"npc") + f1*padding[1] ,
y=grid::unit(y,"npc") + f2*padding[2],
hjust=hjust,vjust=vjust, ...))
}
mergeSC <- function(tmpdir,name1,name2){
V1 = V2 = V3 = rl = Counts = N = SumN = NULL
dt1 <- scater::readSparseCounts(file.path(tmpdir,name1))
dt1 <- dt1[unique(rownames(dt1)),]
dt2 <- scater::readSparseCounts(file.path(tmpdir,name2))
dt2 <- dt2[unique(rownames(dt2)),]
sc <- Seurat::RowMergeSparseMatrices(dt1,dt2)
rn <- rownames(sc)
grDatamatrix <- with(as.data.table(do.call(rbind,strsplit(rn,":|-"))),GenomicRanges::GRanges(V1,IRanges::IRanges(as.numeric(V2),as.numeric(V3))))
orderRanges <- order(grDatamatrix)
grDatamatrix <- grDatamatrix[orderRanges]
rn <- rn[orderRanges]
dt1 <- dt1[orderRanges,]
dt2 <- dt2[orderRanges,]
sc <- sc[orderRanges,]
total_cell <- c(ncol(dt1),ncol(dt2))
sample_name <- c('HBCx-22','HBCx-22-TamR')
annot_raw <- rbindlist(list(data.table(barcode=colnames(dt1), cell_id=paste0(sample_name[1], "_c", 1:total_cell[1]), sample_id=rep(sample_name[1], total_cell[1]), batch_id=1),
data.table(barcode=colnames(dt2), cell_id=paste0(sample_name[2], "_c", 1:total_cell[2]), sample_id=rep(sample_name[2], total_cell[2]), batch_id=1)))
colnames(sc) <- c(paste0(sample_name[1], "_c", 1:total_cell[1]),
paste0(sample_name[2], "_c", 1:total_cell[2]))
rm(dt1,dt2)
sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = sc),
rowRanges = grDatamatrix,
colData = data.frame(Cluster = annot_raw$sample_id))
sc <- sc[seqnames(sc)%in%c(paste0('chr',1:22),'chrX','chrY'),]
Rleagg <- data.table(Counts = Matrix::rowSums(SummarizedExperiment::assay(sc)))[,rl := rleid(Counts)][,N := 1:.N]
Rleagg <- merge(Rleagg,Rleagg[,list(SumN = .N),by = 'rl'],by = 'rl',all.x = TRUE)
sc <- sc[-Rleagg[,which(Counts==0 & SumN>2)],]
return(sc)
}
addSmallLegend <- function(myPlot, pointSize = 0.5, textSize = 3, spaceLegend = 0.1,legend.title = F) {
myPlot <- myPlot +
ggplot2::guides(shape = ggplot2::guide_legend(override.aes = list(size = pointSize)),
color = ggplot2::guide_legend(override.aes = list(size = pointSize))) +
ggplot2::theme(legend.text = ggplot2::element_text(size = textSize),
legend.background = ggplot2::element_rect(fill=scales::alpha('white',0)),
legend.key.size = ggplot2::unit(spaceLegend, "lines"))
if(legend.title == F){
myPlot <- myPlot + ggplot2::theme(legend.title = ggplot2::element_blank())
} else{
myPlot <- myPlot + ggplot2::guides(color = ggplot2::guide_legend(title = legend.title))
}
}
rd <- function(x,digits = 2){
format(round(x, digits=digits), nsmall = digits)
}
diffPeaks <- function(object){
fdr = NULL
# Creating control
control <- mikado::controlEM(epsilon.em=1e-4,maxit.em=50)
# Transforming data into binary
oldcounts <- SummarizedExperiment::assay(object,'counts')
SummarizedExperiment::assay(object,'counts') <- 1*(oldcounts>0)
# Run the peak caller
object <- mikado::MHMM_v4(object = object,control = control)
# Transforming the data back
SummarizedExperiment::assay(object,'counts') <- oldcounts
# Creating control elements
for(i in seq_along(control)){assign(names(control)[i],control[[i]])}
# Calling differential peaks
index <- S4Vectors::metadata(object)$viterbi == 1
peaks <- GenomicRanges::reduce(SummarizedExperiment::rowRanges(object[index,]))
peaks$name <- paste0('peak',1:length(peaks))
# Checking overlaps
overlaps <- GenomicRanges::findOverlaps(SummarizedExperiment::rowRanges(object),peaks)
vec <- rep('peakNA',nrow(object))
vec[S4Vectors::queryHits(overlaps)] <- peaks$name[S4Vectors::subjectHits(overlaps)]
# Aggregating windows into regions
aggpeaks <- pseudoBulk(sc = object,group = vec,type = 'rows')
aggpeaks <- aggpeaks[-which(rownames(aggpeaks)=="grouppeakNA"),]
# Creating reduced SC experiment
sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = unname(aggpeaks)),
rowRanges = peaks,
colData = SummarizedExperiment::colData(object))
return(list('sc' = sc,'peaks' = index))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.