R/runGrosselin_multiCluster.R

Defines functions runGrosselin_multiCluster

Documented in runGrosselin_multiCluster

#' Multi-scenario analysis of single-cell epigenomic data
#'
#' Add description here
#'
#' @param path directory
#' @param pathdata path to the data
#' @param resvec resolution of the data to analyze
#' @param maxCluster maximum number of clusters
#' @param algCluster clustering algorithm
#'
#' @return Add return here
#'
#' @details Add details here
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @importFrom magrittr %<>%
#'
#' @export
#'
runGrosselin_multiCluster <- function(path,pathdata,resvec = c(50000,25000,5000,1000),maxCluster = 8,algCluster = 'kmeans'){

    V1 = V2 = V3 = rl = Counts = N = SumN = NULL

    system2("mkdir",file.path(path))

    for(resolution in resvec){
        message('Resolution: ',resolution)
        # General parameters
        res <- resolution/1000
        label <- paste0(res,'kb')
        assignments <- list()
        gzdir <- file.path(pathdata,'Data')

        # Loading the data
        load(file.path(pathdata,paste0('Analysis_',label,'.RData')))
        sc <- get(paste0('Cusanovich2018.',label))$sc

        # Runing each method for different number of clusters
        message('Running each method for different number of clusters...')
        for(ncluster in 2:maxCluster){
            assignments[[paste0(ncluster)]] <- list()
            tmpsc <- sc
            SummarizedExperiment::colData(tmpsc$sc)$Cluster <- sample(1:ncluster,ncol(tmpsc$sc),replace = TRUE)

            assignments[[paste0(ncluster)]][[paste0('cisTopic.',label)]] <- scChIPseqsim::runCisTopic(sc = tmpsc,legend.title = 'Random Assignment')
            assignments[[paste0(ncluster)]][[paste0('Cusanovich2018.',label)]] <- scChIPseqsim::runCusanovich2018(sc = tmpsc,legend.title = 'Random Assignment')
            assignments[[paste0(ncluster)]][[paste0('Scasat.',label)]] <- scChIPseqsim::runScasat(sc = tmpsc,legend.title = 'Random Assignment')
            assignments[[paste0(ncluster)]][[paste0('SnapATAC.',label)]] <- scChIPseqsim::runSnapATAC(sc = tmpsc,legend.title = 'Random Assignment')
        }

        # Running differential peak calling for different resolutions
        ## Loading the data
        dt1 <- scater::readSparseCounts(file.path(gzdir,paste0('dt1_',label,'.txt.gz')))
        dt1 <- dt1[unique(rownames(dt1)),]

        dt2 <- scater::readSparseCounts(file.path(gzdir,paste0('dt2_',label,'.txt.gz')))
        dt2 <- dt2[unique(rownames(dt2)),]

        sc <- Seurat::RowMergeSparseMatrices(dt1,dt2)

        rn <- rownames(sc)

        grDatamatrix <- with(data.table::as.data.table(do.call(rbind,strsplit(rn,":|-"))), GenomicRanges::GRanges(V1,IRanges::IRanges(as.numeric(V2),as.numeric(V3))))

        orderRanges <- order(grDatamatrix)

        grDatamatrix <- grDatamatrix[orderRanges]
        rn <- rn[orderRanges]
        dt1 <- dt1[orderRanges,]
        dt2 <- dt2[orderRanges,]
        sc <- sc[orderRanges,]

        total_cell <- c(ncol(dt1),ncol(dt2))
        sample_name <- c('HBCx-22','HBCx-22-TamR')
        annot_raw <- data.table::rbindlist(list(data.table::data.table(barcode=colnames(dt1), cell_id=paste0(sample_name[1], "_c", 1:total_cell[1]), sample_id=rep(sample_name[1], total_cell[1]), batch_id=1),
                                                data.table::data.table(barcode=colnames(dt2), cell_id=paste0(sample_name[2], "_c", 1:total_cell[2]), sample_id=rep(sample_name[2], total_cell[2]), batch_id=1)))

        colnames(sc) <- c(paste0(sample_name[1], "_c", 1:total_cell[1]),
                          paste0(sample_name[2], "_c", 1:total_cell[2]))

        rm(dt1,dt2)

        ## Setting up the sc object

        sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = sc),
                                                         rowRanges = grDatamatrix,
                                                         colData = data.frame(Cluster = rep(NA,ncol(sc))))

        sc <- subset(sc,as.character(seqnames)%in%c(paste0('chr',1:22),'chrX','chrY'))

        Rleagg <- data.table::data.table(Counts = Matrix::rowSums(SummarizedExperiment::assay(sc)))[,rl := data.table::rleid(Counts)][,N := 1:.N]
        Rleagg <- merge(Rleagg,Rleagg[,list(SumN = .N),by = 'rl'],by = 'rl',all.x = TRUE)
        sc <- sc[-Rleagg[,which(Counts==0 & SumN>2)],]

        ## Looping through different number of clusters
        for(ncluster in 2:maxCluster){
            for(methods in c('cisTopic','Cusanovich2018','Scasat','SnapATAC')){
                # Organizing the data
                SummarizedExperiment::colData(sc)$Cluster <- assignments[[paste0(ncluster)]][[paste0(methods,'.',label)]][['clustering']][[algCluster]]
                SummarizedExperiment::colData(sc)[[paste0(methods,'.',ncluster)]] <- SummarizedExperiment::colData(sc)$Cluster

                # Aggregating data and calling peaks
                mat <- scChIPseqsim:::pseudoBulk(sc = sc,group = as.factor(SummarizedExperiment::colData(sc)$Cluster),type = 'cols')
                for(cluster in 1:ncol(mat)){
                    message('Calling peaks from cluster ',cluster,' (out of ',ncluster,')',' for ',methods,'...')

                    object <- epigraHMM::epigraHMMDataSetFromMatrix(countData = as.matrix(mat[,cluster]),
                                                                    rowRanges = SummarizedExperiment::rowRanges(sc),
                                                                    colData = data.frame(condition = 'A',replicate = 1))
                    object <- epigraHMM::epigraHMM(object,control = epigraHMM::controlEM(),type = 'consensus',dist = 'nb')

                    S4Vectors::values(SummarizedExperiment::rowRanges(sc)) <- cbind(S4Vectors::values(SummarizedExperiment::rowRanges(sc)),S4Vectors::DataFrame('reference' = S4Vectors::metadata(object)$viterbi=='E'))
                    colnames(S4Vectors::values(SummarizedExperiment::rowRanges(sc)))[which(colnames(S4Vectors::values(SummarizedExperiment::rowRanges(sc)))=='reference')] <- paste0(methods,'.',ncluster,'.',cluster)
                    SummarizedExperiment::rowRanges(sc)$reference <- NULL
                }
            }
        }
        SummarizedExperiment::colData(sc)$Cluster <- NULL

        assign(paste0('sc_',label),sc)
        rm(sc)
        message('Saving the results...')
        save(list=paste0('sc_',label),file = file.path(path,paste0('Peaks_',label,'.RData')),compress = 'xz')
        message('Done!')
    }
}
plbaldoni/scChIPseqsim documentation built on June 11, 2020, 7:41 p.m.