R/runGrosselin_mikado.R

Defines functions runGrosselin_mikado

Documented in runGrosselin_mikado

#' Multi-scenario analysis of single-cell epigenomic data
#'
#' Add description here
#'
#' @param path directory
#' @param cores number of cores
#' @param minReads minimum number of reads per cell to consider
#' @param maxPct upper percentile of reads per cell to consider
#'
#' @return Add return here
#'
#' @details Add details here
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @importFrom magrittr %<>%
#'
#' @export
#'
runGrosselin_mikado <- function(path,cores = 4,minReads = 5000,maxPct = 1){

    mati2 = V1 = V2 = V3 = rl = Counts = N = SumN = NULL

    tmpdir <- file.path(path,'tmp')
    system2("mkdir",file.path(path))
    system2("mkdir",tmpdir)
    tmpdir <- normalizePath(tmpdir)

    control = mikado::controlEM(cores = cores,dir = tmpdir,chr = c(paste0('chr',1:22),'chrX'),epsilon.em = 1e-2,maxit.em = 20,init = 'fast')

    dtnames <- list('50kb' = c('/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536862/output_hg38_50000bp/counts/SRR7536862_hg38_50000bp_flagged_rmPCR_RT_rmDup_counts_50000_filt_1000.tsv',
                               '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536863/output_hg38_50000bp/counts/SRR7536863_hg38_50000bp_flagged_rmPCR_RT_rmDup_counts_50000_filt_1000.tsv'),
                    '25kb' = c('/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536862/output_hg38_25000bp/counts/SRR7536862_hg38_25000bp_flagged_rmPCR_RT_rmDup_counts_25000_filt_1000.tsv',
                               '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536863/output_hg38_25000bp/counts/SRR7536863_hg38_25000bp_flagged_rmPCR_RT_rmDup_counts_25000_filt_1000.tsv'),
                    '5kb' = c('/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536862/output_hg38_5000bp/counts/SRR7536862_hg38_5000bp_flagged_rmPCR_RT_rmDup_counts_5000_filt_1000.tsv',
                               '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536863/output_hg38_5000bp/counts/SRR7536863_hg38_5000bp_flagged_rmPCR_RT_rmDup_counts_5000_filt_1000.tsv'),
                    '1kb' = c('/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536862/output_hg38_1000bp/counts/SRR7536862_hg38_1000bp_flagged_rmPCR_RT_rmDup_counts_1000_filt_1000.tsv',
                               '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536863/output_hg38_1000bp/counts/SRR7536863_hg38_1000bp_flagged_rmPCR_RT_rmDup_counts_1000_filt_1000.tsv'))

    resolution <- c(50000,25000,5000,1000)

    # Running methods on original data

    if(!file.exists(file.path(path,'Analysis.RData'))){

        load('/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/scChIPseq/datasets/HBCx-22-human-paper/scChIPseq.RData')
        load('/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/scChIPseq/datasets/HBCx-22-human-paper/cor_filtered_data/HBCx-22-human-paper_99_1.RData')

        sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = Matrix::Matrix(dt,sparse = TRUE)),
                                                         rowRanges = GenomicRanges::makeGRangesFromDataFrame(data.table::rbindlist(lapply(strsplit(rownames(dt),"_"),FUN = function(x){
                                                             data.table::data.table(chr=paste(x[seq_len(length(x)-2)],collapse = '_'),start=x[length(x)-1],end=x[length(x)])
                                                         }))),
                                                         colData = data.frame(Cluster = as.numeric(as.factor(metadata$ChromatinGroup))))

        shape <- metadata$sample_id
        shape %<>% plyr::mapvalues(from = c("GSM3290891_CountTable_HBCx-22_scChIP_H3K27me3_hg38","GSM3290892_CountTable_HBCx-22-TamR_scChIP_H3K27me3_hg38"),
                                   to = c('HBCx-22','HBCx-22-TamR')) %<>% factor(levels = c('HBCx-22','HBCx-22-TamR'))

        sc <- sort(sc)

        ### Begin Filtering
        sc <- sc[as.character(GenomicRanges::seqnames(sc))%in%control$chr,]
        sc <- sc[,Matrix::colSums(SummarizedExperiment::assay(sc))>minReads]
        sc <- sc[,Matrix::colSums(SummarizedExperiment::assay(sc))<quantile(Matrix::colSums(SummarizedExperiment::assay(sc)),probs = (100-maxPct)/100)]
        ### End Filtering

        SummarizedExperiment::assay(sc) <- 1*(SummarizedExperiment::assay(sc)>0)

        # Running mikado
        initializer <- mikado::initializer(object = sc,control = control)

        save(initializer,shape,sc,file = file.path(path,'Analysis.RData'),compress = 'xz')
    }

    # Analysis for other window sizes

    for(res in resolution){
        label <- paste0(res/1000,'kb')

        if(!file.exists(file.path(path,paste0('Analysis_',label,'.RData')))){
            message('Resolution: ',label)

            dtname1 <- dtnames[[label]][[1]]
            dtname2 <- dtnames[[label]][[2]]

            system(paste('cp',dtname1,file.path(tmpdir,paste0('dt1_',label,'.txt'))))
            system(paste('gzip -c',file.path(tmpdir,paste0('dt1_',label,'.txt')),'>',file.path(tmpdir,paste0('dt1_',label,'.txt.gz'))))
            system(paste('rm',file.path(tmpdir,paste0('dt1_',label,'.txt'))))

            system(paste('cp',dtname2,file.path(tmpdir,paste0('dt2_',label,'.txt'))))
            system(paste('gzip -c',file.path(tmpdir,paste0('dt2_',label,'.txt')),'>',file.path(tmpdir,paste0('dt2_',label,'.txt.gz'))))
            system(paste('rm',file.path(tmpdir,paste0('dt2_',label,'.txt'))))

            dt1 <- scater::readSparseCounts(file.path(tmpdir,paste0('dt1_',label,'.txt.gz')))
            dt1 <- dt1[unique(rownames(dt1)),]

            dt2 <- scater::readSparseCounts(file.path(tmpdir,paste0('dt2_',label,'.txt.gz')))
            dt2 <- dt2[unique(rownames(dt2)),]

            sc <- Seurat::RowMergeSparseMatrices(dt1,dt2)

            rn <- rownames(sc)

            grDatamatrix <- with(data.table::as.data.table(do.call(rbind,strsplit(rn,":|-"))),GenomicRanges::GRanges(V1,IRanges::IRanges(as.numeric(V2),as.numeric(V3))))

            orderRanges <- order(grDatamatrix)

            grDatamatrix <- grDatamatrix[orderRanges]
            rn <- rn[orderRanges]
            dt1 <- dt1[orderRanges,]
            dt2 <- dt2[orderRanges,]
            sc <- sc[orderRanges,]

            total_cell <- c(ncol(dt1),ncol(dt2))
            sample_name <- c('HBCx-22','HBCx-22-TamR')
            annot_raw <- data.table::rbindlist(list(data.table::data.table(barcode=colnames(dt1), cell_id=paste0(sample_name[1], "_c", 1:total_cell[1]), sample_id=rep(sample_name[1], total_cell[1]), batch_id=1),
                                                    data.table::data.table(barcode=colnames(dt2), cell_id=paste0(sample_name[2], "_c", 1:total_cell[2]), sample_id=rep(sample_name[2], total_cell[2]), batch_id=1)))

            colnames(sc) <- c(paste0(sample_name[1], "_c", 1:total_cell[1]),
                              paste0(sample_name[2], "_c", 1:total_cell[2]))

            rm(dt1,dt2)

            sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = sc),
                                                             rowRanges = grDatamatrix,
                                                             colData = data.frame(Cluster = annot_raw$sample_id))
            sc <- sort(sc)

            ### Begin Filtering
            sc <- sc[as.character(GenomicRanges::seqnames(sc))%in%control$chr,]
            sc <- sc[,Matrix::colSums(SummarizedExperiment::assay(sc))>minReads]
            sc <- sc[,Matrix::colSums(SummarizedExperiment::assay(sc))<quantile(Matrix::colSums(SummarizedExperiment::assay(sc)),probs = (100-maxPct)/100)]
            ### End Filtering

            SummarizedExperiment::assay(sc) <- 1*(SummarizedExperiment::assay(sc)>0)

            # Running mikado
            assign(paste0('initializer.',label),mikado::initializer(object = sc,control = control))
            assign(paste0('sc.',label),sc)

            save(list = c(paste0('initializer.',label),paste0('sc.',label)),file = file.path(path,paste0('Analysis_',label,'.RData')),compress = 'xz')
        }
    }
    system(paste('rm',tmpdir))
}
plbaldoni/scChIPseqsim documentation built on June 11, 2020, 7:41 p.m.