R/base.R

Defines functions diffPeaks rd addSmallLegend mergeSC annotation_compass generateScenarios range01 pseudoPlot pseudoBulk errorCatch Scasat_getJaccardDist filterZeros peakCount callPeaks runClustering plot_umap run_umap

run_umap <- function(x,seed = 2020){
    return(umap::umap(Matrix::t(x),random_state = seed)$layout)
}

plot_umap <- function(x,labels,title='UMAP',legend.title = 'Cell Type',seed = 2020,shape = NULL,shape.title = NULL){
    umap_1 = umap_2 = celltype = NULL
    set.seed(seed)
    df_umap = data.frame(cbind(x,labels),stringsAsFactors = FALSE)
    colnames(df_umap) = c('umap_1','umap_2','celltype')
    df_umap$umap_1 = as.numeric(df_umap$umap_1)
    df_umap$umap_2 = as.numeric(df_umap$umap_2)
    df_umap$celltype <- factor(df_umap$celltype,levels = sort(unique(df_umap$celltype)))
    options(repr.plot.width=4, repr.plot.height=4)
    p <- ggplot2::ggplot(df_umap, ggplot2::aes(x = umap_1, y = umap_2, color = celltype)) +
        ggplot2::ggtitle(title) +
        ggplot2::theme_classic() +
        ggplot2::xlab('UMAP 1') + ggplot2::ylab('UMAP 2') +
        ggplot2::guides(color = ggplot2::guide_legend(title = legend.title)) +
        ggplot2::scale_colour_brewer(palette = "Set1")
    if(is.null(shape)){
        p <- p + ggplot2::geom_point(size = 1,alpha = 0.5)
    } else{
        p <- p + ggplot2::geom_point(size = 1,alpha = 0.5,ggplot2::aes(shape = shape)) +
            ggplot2::guides(shape = ggplot2::guide_legend(title = shape.title)) +
            ggplot2::scale_shape_manual(values = c(19,4))
    }

    return(p)
}

runClustering <- function(sc,feature,clustering,method){

    clust <- list()

    target <- colData(sc)$Cluster

    nClusters = length(unique(target))

    if(clustering == 'all'){
        clust[['hierarchical']] <- cutree(tree = fastcluster::hclust.vector(t(feature), method="ward", metric="euclidean"),k = nClusters)
        clust[['louvain']] <- igraph::cluster_louvain(scran::buildSNNGraph(feature,type="number"))$membership
        clust[['kmeans']] <- kmeans(t(feature), centers=nClusters)$cluster

        clust[['metrics']] <- rbindlist(list(data.table::data.table(Method = method,Clustering = 'Hierarchical',ARI = mclust::adjustedRandIndex(clust[['hierarchical']],target),AMI = aricode::AMI(clust[['hierarchical']],target)),
                                             data.table::data.table(Method = method,Clustering = 'Kmeans',ARI = mclust::adjustedRandIndex(clust[['kmeans']] ,target),AMI = aricode::AMI(clust[['kmeans']] ,target)),
                                             data.table::data.table(Method = method,Clustering = 'Louvain',ARI = mclust::adjustedRandIndex(clust[['louvain']],target),AMI = aricode::AMI(clust[['louvain']],target))))
    } else{
        if(clustering == 'hierarchical'){
            clust[['hierarchical']] <- cutree(tree = fastcluster::hclust.vector(t(feature), method="ward", metric="euclidean"),k = nClusters)
            clust[['louvain']] <- NA
            clust[['kmeans']] <- NA

            clust[['metrics']] <- rbindlist(list(data.table::data.table(Method = method,Clustering = 'Hierarchical',ARI = mclust::adjustedRandIndex(clust[['hierarchical']],target),AMI = aricode::AMI(clust[['hierarchical']],target))))
        }

        if(clustering == 'louvain'){
            clust[['hierarchical']] <- NA
            clust[['louvain']] <- igraph::cluster_louvain(scran::buildSNNGraph(feature,type="number"))$membership
            clust[['kmeans']] <- NA

            clust[['metrics']] <- rbindlist(list(data.table::data.table(Method = method,Clustering = 'Louvain',ARI = mclust::adjustedRandIndex(clust[['louvain']],target),AMI = aricode::AMI(clust[['louvain']],target))))
        }

        if(clustering == 'kmeans'){
            clust[['hierarchical']] <- NA
            clust[['louvain']] <- NA
            clust[['kmeans']] <- kmeans(t(feature), centers=nClusters)$cluster

            clust[['metrics']] <- rbindlist(list(data.table::data.table(Method = method,Clustering = 'Kmeans',ARI = mclust::adjustedRandIndex(clust[['kmeans']] ,target),AMI = aricode::AMI(clust[['kmeans']] ,target))))
        }
    }

    return(clust)
}

callPeaks <- function(sc,peaks = TRUE){
    # This function calls peaks on the aggregated data from sc
    # The called peaks are then used in downstream analyses for all methods, including cisTopic, SnapATAC, Cusanovich2018, and Scasat

    if(is.logical(peaks)){
        if(length(peaks)==1){
            if(peaks){
                # If peaks == TRUE, then call peaks
                object <- epigraHMM::epigraHMMDataSetFromMatrix(countData = as.matrix(Matrix::rowSums(SummarizedExperiment::assay(sc))),
                                                                colData = data.frame(condition = 'A',replicate = 1),
                                                                rowRanges = SummarizedExperiment::rowRanges(sc))

                object <- epigraHMM::epigraHMM(object,control = epigraHMM::controlEM(epsilon.em = c(1e-2,1e-2,1e-6,1e-2)),type = 'consensus',dist = 'nb')
                index <- (S4Vectors::metadata(object)$viterbi=='E')

                # Aggregate counts into peak regions

                return(list('sc' = peakCount(sc,peaks = index),'peaks' = index))

            } else{
                # If peaks == FALSE, do nothing

                return(list('sc' = sc,'peaks' = rep(TRUE,nrow(sc))))

            }
        } else{
            # If peaks == a vector of logicals indicating the bins pertaining to peak regions, then subset
            # Aggregate counts into peak regions

            return(list('sc' = peakCount(sc,peaks = peaks),'peaks' = peaks))

        }
    } else{
        if(peaks == 'mikado'){
            return(diffPeaks(object = sc))
        } else{
            stop('peaks must be a logical or "mikado"')
        }
    }
}

peakCount <- function(sc,peaks){

    # This function sums reads across bins of each peak for each cell
    # The resulting sc is used by all methods for clustering

    grSc <- SummarizedExperiment::rowRanges(sc)[peaks]
    reducedGrSc <- GenomicRanges::reduce(grSc)

    # Prunning extra windows

    idx <- IRanges::overlapsAny(grSc,reducedGrSc)
    idxBinned <- IRanges::overlapsAny(reducedGrSc,grSc)

    grSc <- grSc[idx]
    reducedGrSc <- reducedGrSc[idxBinned]

    # Checking overlaps

    overlaps <- GenomicRanges::findOverlaps(reducedGrSc,grSc)

    # Using model matrix to sum reads from cells across peak windows

    return(SingleCellExperiment::SingleCellExperiment(assays = list(counts = t(t(SummarizedExperiment::assay(sc[peaks,][idx,])) %*% Matrix::sparse.model.matrix(~0+Peak,data.table::data.table(Peak = as.factor(queryHits(overlaps)))))),
                                                      rowRanges = reducedGrSc,
                                                      colData = SummarizedExperiment::colData(sc)))
}

filterZeros <- function(mat){
    # This functiion is used by all methods that require rows with 0 counts to be excluded, namely Cusanovich2018, and Scasat
    # It was written based on the one presented in https://nbviewer.jupyter.org/github/pinellolab/scATAC-benchmarking/blob/master/Real_Data/Buenrostro_2018/run_methods/Scasat/Scasat_buenrostro2018.ipynb?flush_cache=true
    # The original cutoff for Scasat was 0.01 (remove the lowest 1 percentile of peaks). However, I will just remove features with 0 counts across all cells.
    # This is because we are not working with peak-based counts, but instead window-based counts.
    # In our scenario, using a cutoff=0.01 would exclude most of the windows since the data is highly sparse in this way.

    index <- which(Matrix::rowSums(mat)==0)

    if(length(index)==0){
        return(mat)
    } else{
        return(mat[-index,])
    }
}

Scasat_getJaccardDist <- function(cdBinary){
    # This functiion is used by Scasat
    # It was written based on the one presented in https://nbviewer.jupyter.org/github/pinellolab/scATAC-benchmarking/blob/master/Real_Data/Buenrostro_2018/run_methods/Scasat/Scasat_buenrostro2018.ipynb?flush_cache=true
    # The default function computes the Jaccard matrix based on the desne matrix
    # I am using locStra::jaccardMatrix because it works with sparse matrices and it is much faster
    # To get the same result as prabclus::jaccard, we need to subtract it from 1
    # Check: x <- rnorm(100); mat <- cbind(x,x^3,abs(x),x/max(x),exp(x),1/x); mat <- 1*(mat<0.5); (1-locStra::jaccardMatrix(mat))==prabclus::jaccard(mat)
    return(1-locStra::jaccardMatrix(1*(cdBinary>0)))
}

errorCatch <- function(cond,true,peaks,method){
    return(list('true' = true,
                'peaks' = peaks,
                'feature' = NA,
                'clustering' = list('hierarchical' = NA,'louvain' = NA,'kmeans' = NA,'metrics' = data.table(Method = method, Clustering = NA, ARI = NA, AMI = NA)),
                'time' = NA,
                'umap' = NA))
}

pseudoBulk <- function(sc,group,type){
    # This function calculate pseudobulk on rows or columns

    if(type == 'rows'){
        if(!length(group)==nrow(sc)){stop('dimensions not compatible')}

        mat <- t(t(SummarizedExperiment::assay(sc)) %*% Matrix::sparse.model.matrix(~0+group))

    } else{
        if(type == 'cols'){
            if(!length(group)==ncol(sc)){stop('dimensions not compatible')}

            mat <- SummarizedExperiment::assay(sc) %*% Matrix::sparse.model.matrix(~0+group)

        } else{
            stop('type not valid')
        }
    }
    return(mat)
}

pseudoPlot <- function(sc,mat,ranges,normalize = FALSE,bulkOnly = FALSE,ptssize = 1,shuffle = F){
    # THis function plots counts from bulk data (sc) and compares it to the pseudobulk (mat) for a given region (ranges)

    Data = Start = Counts = Cell = Cluster =  NewCell = V1 = NULL

    index <- overlapsAny(sc,ranges)
    str <- ranges(SummarizedExperiment::rowRanges(sc[index,]))@start

    if(normalize){
        newsc <-  epigraHMM::createOffset(SummarizedExperiment::SummarizedExperiment(assays = list(counts = cbind(rowSums(SummarizedExperiment::assay(sc)[index,]),
                                                                                                                  mat[index,])),
                                                                                     rowRanges = SummarizedExperiment::rowRanges(sc[index,]),
                                                                                     colData = data.frame(Data = c('Bulk',paste0('Pseudobulk',1:ncol(mat))))),type = 'loess',span = 1)
        SummarizedExperiment::assay(newsc,'counts') <- SummarizedExperiment::assay(newsc,'counts')/exp(SummarizedExperiment::assay(newsc,'offset'))

    } else{
        newsc <- SummarizedExperiment::SummarizedExperiment(assays = list(counts = cbind(rowSums(SummarizedExperiment::assay(sc)[index,]),
                                                                                         mat[index,])),
                                                            rowRanges = SummarizedExperiment::rowRanges(sc[index,]),
                                                            colData = data.frame(Data = c('Bulk',paste0('Pseudobulk',1:ncol(mat)))))
    }

    dt <- rbindlist(lapply(seq_len(ncol(newsc)),function(x){
        data.table(Data = colData(newsc)$Data[x],Counts = SummarizedExperiment::assay(newsc,'counts')[,x], Start = str)
    }))

    if(bulkOnly){
        fig <- ggplot2::ggplot(data = dt[Data == 'Bulk',],ggplot2::aes(x = Start,y = Counts))+
            ggplot2::facet_grid(rows = ggplot2::vars(Data))+
            ggplot2::geom_line()+
            ggplot2::theme_bw()+
            ggplot2::ylab(ifelse(normalize,'Normalized Counts','Counts'))+
            ggplot2::xlab(paste0('Genomic Window (',seqnames(ranges),')'))
    } else{
        fig1 <- ggplot2::ggplot(data = dt[Data == 'Bulk',],ggplot2::aes(x = Start,y = Counts))+
            ggplot2::facet_grid(rows = ggplot2::vars(Data))+
            ggplot2::geom_line()+
            ggplot2::theme_bw()+
            ggplot2::theme(axis.text = ggplot2::element_text(size = 6),axis.title = ggplot2::element_text(size = 6),
                           panel.grid.major = ggplot2::element_blank(),panel.grid.minor = ggplot2::element_blank(),strip.text = ggplot2::element_text(size = 6))+
            ggplot2::scale_x_continuous(labels = scales::scientific)+
            ggplot2::ylab(ifelse(normalize,'Normalized Counts','Counts'))+
            ggplot2::xlab(paste0('Genomic Window (',seqnames(ranges),')'))
        fig2 <- ggplot2::ggplot(data = dt[!(Data == 'Bulk'),],ggplot2::aes(x = Start,y = Counts))+
            ggplot2::facet_grid(rows = ggplot2::vars(Data))+
            ggplot2::geom_line()+
            ggplot2::theme_bw()+
            ggplot2::theme(axis.text = ggplot2::element_text(size = 6),axis.title = ggplot2::element_text(size = 6),
                           panel.grid.major = ggplot2::element_blank(),panel.grid.minor = ggplot2::element_blank(),strip.text = ggplot2::element_text(size = 6))+
            ggplot2::scale_x_continuous(labels = scales::scientific)+
            ggplot2::ylab(ifelse(normalize,'Normalized Counts','Counts'))+
            ggplot2::xlab(paste0('Genomic Window (',seqnames(ranges),')'))

        subsc <- sc[overlapsAny(sc,ranges),]
        subdt <- rbindlist(lapply(unique(sc$Cluster),function(x){
            ctd <- data.table(Cluster = paste0('Cluster',x),Cell =  seq_len(sum(colData(subsc)$Cluster==x)),Counts = c(as.matrix(SummarizedExperiment::assay(subsc[,colData(subsc)$Cluster==x],'counts'))), Start = str)

            if(shuffle){
                newctd <- ctd[,sum(Counts),by='Cell'][order(V1),][,NewCell := 1:.N][,-'V1']
                ctd <- merge(ctd,newctd,by = 'Cell',all.x = T)[,Cell := NewCell][,NewCell := NULL]
            }

            ctd <- ctd[Counts>0,][,Counts := ifelse(Counts==1,'1',ifelse(Counts==2,'2','>2'))]
            ctd$Counts <- factor(ctd$Counts,levels = c('1','2','>2'))
            return(ctd)
        }))

        fig3 <- ggplot2::ggplot(data = subdt,ggplot2::aes(y = Cell,x = Start, color= Counts))+
            ggplot2::facet_grid(rows = ggplot2::vars(Cluster),scales = 'free_y')+
            ggplot2::geom_point(alpha = 0.3,shape=20,size = ptssize)+
            ggplot2::theme_bw()+
            ggplot2::theme(panel.grid.major = ggplot2::element_blank(),
                  panel.grid.minor = ggplot2::element_blank(),
                  legend.position = 'top',legend.direction = 'horizontal',
                  #legend.box.background = ggplot2::element_rect(colour = "black"),
                  legend.title = ggplot2::element_text(size = 8),
                  legend.text  = ggplot2::element_text(size = 8),
                  axis.text = ggplot2::element_text(size = 6),axis.title = ggplot2::element_text(size = 6),strip.text = ggplot2::element_text(size = 6),
                  legend.key.size = ggplot2::unit(0.1, "lines"))+
            ggplot2::scale_color_manual(values = c('black','red','purple'))+
            ggplot2::scale_x_continuous(labels = scales::scientific)+
            ggplot2::xlab(paste0('Genomic Window (',seqnames(ranges),')'))+
            ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 2,alpha = 1)))

        fig <- ggpubr::ggarrange(fig1,fig2,fig3,nrow = 1,ncol = 3,widths = c(1.875,1.875,2.25)/6,common.legend = T)
    }
    return(fig)
}

range01 <- function(x){
    x <- ifelse(x<0,0,x)
    x <- ifelse(x>1,1,x)
    return(x)
}

generateScenarios <- function(type){
    if(type == 'scATACseq'){
        scenarios <- expand.grid(Data=c('scATACseq'),Groups=c(1,2),
                                 Cells=c(1,2,3),Rarity=c(1,2),Noise=c(1,2),
                                 Depth=c(1,2,3),Sim = 100)

        scenarios$Label <- paste(scenarios[['Data']],paste0('Groups',scenarios[['Groups']]),
                                 paste0('Cells',scenarios[['Cells']]),paste0('Rarity',scenarios[['Rarity']]),
                                 paste0('Noise',scenarios[['Noise']]),paste0('Depth',scenarios[['Depth']]),sep = '_')
    } else{
        if(type == 'scChIPseq'){
            scenarios <- expand.grid(Data=c('scChIPseq'),Groups=c(1,2),
                                     Cells=c(1,2,3),Rarity=c(1,2),Noise=c(1), # scChIPseq is already pretty noise, no need to try other levels
                                     Depth=c(1,2,3),Dissimilarity=c(1,2,3),Sim = 100)

            scenarios$Label <- paste(scenarios[['Data']],paste0('Groups',scenarios[['Groups']]),paste0('Diss',scenarios[['Dissimilarity']]),
                                     paste0('Cells',scenarios[['Cells']]),paste0('Rarity',scenarios[['Rarity']]),
                                     paste0('Noise',scenarios[['Noise']]),paste0('Depth',scenarios[['Depth']]),sep = '_')
        } else{
            stop('Type is not recognized')
        }
    }
    return(scenarios)
}

annotation_compass <- function(label,
                               position = c('N','NE','E','SE','S','SW','W','NW'),
                               padding = grid::unit(c(0.5,0.5),"line"),fontsize = 12,...){
    position <- match.arg(position)
    x <-        switch(EXPR = position,N = 0.5,NE = 1,E = 1,SE = 1,S = 0.5,SW = 0,W = 0,NW = 0)
    y <-        switch(EXPR = position,N = 1,NE = 1,E = 0.5,SE = 0,S = 0,SW = 0,W = 0.5,NW = 1)
    hjust <-    switch(EXPR = position,N = 0.5,NE = 1,E = 1,SE = 1,S = 0.5,SW = 0,W = 0,NW = 0)
    vjust <-    switch(EXPR = position,N = 1,NE = 1,E = 0.5,SE = 0,S = 0,SW = 0,W = 0.5,NW = 1)
    f1 <-       switch(EXPR = position,N = 0,NE = -1,E = -1,SE = -1,S = 0,SW = 1,W = 1,NW = 1)
    f2 <-       switch(EXPR = position,N = -1,NE = -1,E = 0,SE = 1,S = 1,SW = 1,W = 0,NW = -1)
    ggplot2::annotation_custom(grid::textGrob(label,gp = grid::gpar(col = "black", fontsize = fontsize),
                                     x=grid::unit(x,"npc") + f1*padding[1] ,
                                     y=grid::unit(y,"npc") + f2*padding[2],
                                     hjust=hjust,vjust=vjust, ...))
}

mergeSC <- function(tmpdir,name1,name2){
    V1 = V2 = V3 = rl = Counts = N = SumN = NULL

    dt1 <- scater::readSparseCounts(file.path(tmpdir,name1))
    dt1 <- dt1[unique(rownames(dt1)),]

    dt2 <- scater::readSparseCounts(file.path(tmpdir,name2))
    dt2 <- dt2[unique(rownames(dt2)),]

    sc <- Seurat::RowMergeSparseMatrices(dt1,dt2)

    rn <- rownames(sc)

    grDatamatrix <- with(as.data.table(do.call(rbind,strsplit(rn,":|-"))),GenomicRanges::GRanges(V1,IRanges::IRanges(as.numeric(V2),as.numeric(V3))))

    orderRanges <- order(grDatamatrix)

    grDatamatrix <- grDatamatrix[orderRanges]
    rn <- rn[orderRanges]
    dt1 <- dt1[orderRanges,]
    dt2 <- dt2[orderRanges,]
    sc <- sc[orderRanges,]

    total_cell <- c(ncol(dt1),ncol(dt2))
    sample_name <- c('HBCx-22','HBCx-22-TamR')
    annot_raw <- rbindlist(list(data.table(barcode=colnames(dt1), cell_id=paste0(sample_name[1], "_c", 1:total_cell[1]), sample_id=rep(sample_name[1], total_cell[1]), batch_id=1),
                                data.table(barcode=colnames(dt2), cell_id=paste0(sample_name[2], "_c", 1:total_cell[2]), sample_id=rep(sample_name[2], total_cell[2]), batch_id=1)))

    colnames(sc) <- c(paste0(sample_name[1], "_c", 1:total_cell[1]),
                      paste0(sample_name[2], "_c", 1:total_cell[2]))

    rm(dt1,dt2)

    sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = sc),
                                                     rowRanges = grDatamatrix,
                                                     colData = data.frame(Cluster = annot_raw$sample_id))

    sc <- sc[seqnames(sc)%in%c(paste0('chr',1:22),'chrX','chrY'),]

    Rleagg <- data.table(Counts = Matrix::rowSums(SummarizedExperiment::assay(sc)))[,rl := rleid(Counts)][,N := 1:.N]
    Rleagg <- merge(Rleagg,Rleagg[,list(SumN = .N),by = 'rl'],by = 'rl',all.x = TRUE)
    sc <- sc[-Rleagg[,which(Counts==0 & SumN>2)],]
    return(sc)
}

addSmallLegend <- function(myPlot, pointSize = 0.5, textSize = 3, spaceLegend = 0.1,legend.title = F) {
    myPlot <- myPlot +
        ggplot2::guides(shape = ggplot2::guide_legend(override.aes = list(size = pointSize)),
               color = ggplot2::guide_legend(override.aes = list(size = pointSize))) +
        ggplot2::theme(legend.text  = ggplot2::element_text(size = textSize),
              legend.background = ggplot2::element_rect(fill=scales::alpha('white',0)),
              legend.key.size = ggplot2::unit(spaceLegend, "lines"))

    if(legend.title == F){
        myPlot <- myPlot + ggplot2::theme(legend.title = ggplot2::element_blank())
    } else{
        myPlot <- myPlot + ggplot2::guides(color = ggplot2::guide_legend(title = legend.title))
    }
}

rd <- function(x,digits = 2){
    format(round(x, digits=digits), nsmall = digits)
}

diffPeaks <- function(object){

    fdr = NULL

    # Creating control

    control <- mikado::controlEM(epsilon.em=1e-4,maxit.em=50)

    # Transforming data into binary

    oldcounts <- SummarizedExperiment::assay(object,'counts')

    SummarizedExperiment::assay(object,'counts') <- 1*(oldcounts>0)

    # Run the peak caller

    object <- mikado::MHMM_v4(object = object,control = control)

    # Transforming the data back

    SummarizedExperiment::assay(object,'counts') <- oldcounts

    # Creating control elements

    for(i in seq_along(control)){assign(names(control)[i],control[[i]])}

    # Calling differential peaks

    index <- S4Vectors::metadata(object)$viterbi == 1
    peaks <- GenomicRanges::reduce(SummarizedExperiment::rowRanges(object[index,]))
    peaks$name <- paste0('peak',1:length(peaks))

    # Checking overlaps

    overlaps <- GenomicRanges::findOverlaps(SummarizedExperiment::rowRanges(object),peaks)

    vec <- rep('peakNA',nrow(object))
    vec[S4Vectors::queryHits(overlaps)] <- peaks$name[S4Vectors::subjectHits(overlaps)]

    # Aggregating windows into regions

    aggpeaks <- pseudoBulk(sc = object,group = vec,type = 'rows')

    aggpeaks <- aggpeaks[-which(rownames(aggpeaks)=="grouppeakNA"),]

    # Creating reduced SC experiment

    sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = unname(aggpeaks)),
                                                     rowRanges = peaks,
                                                     colData = SummarizedExperiment::colData(object))

    return(list('sc' = sc,'peaks' = index))
}
plbaldoni/scChIPseqsim documentation built on June 11, 2020, 7:41 p.m.