R/runGrosselin.R

Defines functions runGrosselin

Documented in runGrosselin

#' Multi-scenario analysis of single-cell epigenomic data
#'
#' Add description here
#'
#' @param path directory
#'
#' @return Add return here
#'
#' @details Add details here
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @importFrom magrittr %<>%
#'
#' @export
#'
runGrosselin <- function(path){

    mati2 = V1 = V2 = V3 = rl = Counts = N = SumN = NULL

    system2("mkdir",file.path(path))

    # Running methods on original data

    if(!file.exists(file.path(path,'Analysis.RData'))){
        load('/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/scChIPseq/datasets/HBCx-22-human-paper/scChIPseq.RData')
        load('/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/scChIPseq/datasets/HBCx-22-human-paper/cor_filtered_data/HBCx-22-human-paper_99_1.RData')

        sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = Matrix::Matrix(dt,sparse = TRUE)),
                                                         rowRanges = GenomicRanges::makeGRangesFromDataFrame(data.table::rbindlist(lapply(strsplit(rownames(dt),"_"),FUN = function(x){
                                                             data.table::data.table(chr=paste(x[seq_len(length(x)-2)],collapse = '_'),start=x[length(x)-1],end=x[length(x)])
                                                         }))),
                                                         colData = data.frame(Cluster = as.numeric(as.factor(metadata$ChromatinGroup))))

        shape <- metadata$sample_id
        shape %<>% plyr::mapvalues(from = c("GSM3290891_CountTable_HBCx-22_scChIP_H3K27me3_hg38","GSM3290892_CountTable_HBCx-22-TamR_scChIP_H3K27me3_hg38"),
                                 to = c('HBCx-22','HBCx-22-TamR')) %<>% factor(levels = c('HBCx-22','HBCx-22-TamR'))

        Cusanovich2018 <- runCusanovich2018(sc,peaks = FALSE,returnSc = FALSE,legend.title = 'Grosselin Clustering',shape = shape,shape.title = 'Sample Origin')
        SnapATAC <- runSnapATAC(sc,peaks = FALSE,returnSc = FALSE,legend.title = 'Grosselin Clustering',shape = shape,shape.title = 'Sample Origin')
        Scasat <- runScasat(sc,peaks = FALSE,returnSc = FALSE,legend.title = 'Grosselin Clustering',shape = shape,shape.title = 'Sample Origin')
        CisTopic <- runCisTopic(sc,peaks = FALSE,returnSc = FALSE,legend.title = 'Grosselin Clustering',shape = shape,shape.title = 'Sample Origin')

        # Reproducing Grosseling figure

        Grosselin2019 <- plot_umap(x = run_umap(mati2), labels = as.numeric(as.factor(metadata$ChromatinGroup)), title = 'Grosselin2019',legend.title = 'Grosselin Clustering',shape = shape,shape.title = 'Sample Origin')

        save(Cusanovich2018,
             SnapATAC,
             Scasat,
             CisTopic,
             Grosselin2019,shape,file = file.path(path,'Analysis.RData'),compress = 'xz')

        rm(list=setdiff(ls(),'path'))
    }

    # Analysis for 50kb

    if(!file.exists(file.path(path,'Analysis_50kb.RData'))){
        tmpdir <- file.path(path,'Data')
        system(paste('mkdir',tmpdir))

        dtname1 <- '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536862/output_hg38_50000bp/counts/SRR7536862_hg38_50000bp_flagged_rmPCR_RT_rmDup_counts_50000_filt_1000.tsv'
        dtname2 <- '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536863/output_hg38_50000bp/counts/SRR7536863_hg38_50000bp_flagged_rmPCR_RT_rmDup_counts_50000_filt_1000.tsv'

        system(paste('cp',dtname1,file.path(tmpdir,'dt1_50kb.txt')))
        system(paste('gzip -c',file.path(tmpdir,'dt1_50kb.txt'),'>',file.path(tmpdir,'dt1_50kb.txt.gz')))
        system(paste('rm',file.path(tmpdir,'dt1_50kb.txt')))

        system(paste('cp',dtname2,file.path(tmpdir,'dt2_50kb.txt')))
        system(paste('gzip -c',file.path(tmpdir,'dt2_50kb.txt'),'>',file.path(tmpdir,'dt2_50kb.txt.gz')))
        system(paste('rm',file.path(tmpdir,'dt2_50kb.txt')))

        dt1 <- scater::readSparseCounts(file.path(tmpdir,'dt1_50kb.txt.gz'))
        dt1 <- dt1[unique(rownames(dt1)),]

        dt2 <- scater::readSparseCounts(file.path(tmpdir,'dt2_50kb.txt.gz'))
        dt2 <- dt2[unique(rownames(dt2)),]

        sc <- Seurat::RowMergeSparseMatrices(dt1,dt2)

        rn <- rownames(sc)

        grDatamatrix <- with(as.data.table(do.call(rbind,strsplit(rn,":|-"))), GRanges(V1,IRanges::IRanges(as.numeric(V2),as.numeric(V3))))

        orderRanges <- order(grDatamatrix)

        grDatamatrix <- grDatamatrix[orderRanges]
        rn <- rn[orderRanges]
        dt1 <- dt1[orderRanges,]
        dt2 <- dt2[orderRanges,]
        sc <- sc[orderRanges,]

        total_cell <- c(ncol(dt1),ncol(dt2))
        sample_name <- c('HBCx-22','HBCx-22-TamR')
        annot_raw <- rbindlist(list(data.table(barcode=colnames(dt1), cell_id=paste0(sample_name[1], "_c", 1:total_cell[1]), sample_id=rep(sample_name[1], total_cell[1]), batch_id=1),
                                    data.table(barcode=colnames(dt2), cell_id=paste0(sample_name[2], "_c", 1:total_cell[2]), sample_id=rep(sample_name[2], total_cell[2]), batch_id=1)))

        colnames(sc) <- c(paste0(sample_name[1], "_c", 1:total_cell[1]),
                          paste0(sample_name[2], "_c", 1:total_cell[2]))

        rm(dt1,dt2)

        sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = sc),
                                                         rowRanges = grDatamatrix,
                                                         colData = data.frame(Cluster = annot_raw$sample_id))

        sc <- sc[seqnames(sc)%in%c(paste0('chr',1:22),'chrX','chrY'),]

        Rleagg <- data.table(Counts = Matrix::rowSums(SummarizedExperiment::assay(sc)))[,rl := rleid(Counts)][,N := 1:.N]
        Rleagg <- merge(Rleagg,Rleagg[,list(SumN = .N),by = 'rl'],by = 'rl',all.x = TRUE)
        sc <- sc[-Rleagg[,which(Counts==0 & SumN>2)],]

        Cusanovich2018.50kb <- runCusanovich2018(sc, peaks = TRUE, returnSc = TRUE,legend.title = 'Sample Origin')
        cisTopic.50kb <- runCisTopic(sc = Cusanovich2018.50kb$sc,legend.title = 'Sample Origin')
        SnapATAC.50kb <- runSnapATAC(sc = Cusanovich2018.50kb$sc,legend.title = 'Sample Origin')
        Scasat.50kb <- runScasat(sc = Cusanovich2018.50kb$sc,legend.title = 'Sample Origin')

        save(Cusanovich2018.50kb,
             cisTopic.50kb,
             SnapATAC.50kb,
             Scasat.50kb,file = file.path(path,'Analysis_50kb.RData'),compress = 'xz')

        rm(list=setdiff(ls(),'path'))
    }

    if(!file.exists(file.path(path,'Analysis_25kb.RData'))){
        # Analysis for 25kb

        tmpdir <- file.path(path,'Data')
        system(paste('mkdir',tmpdir))

        dtname1 <- '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536862/output_hg38_25000bp/counts/SRR7536862_hg38_25000bp_flagged_rmPCR_RT_rmDup_counts_25000_filt_1000.tsv'
        dtname2 <- '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536863/output_hg38_25000bp/counts/SRR7536863_hg38_25000bp_flagged_rmPCR_RT_rmDup_counts_25000_filt_1000.tsv'

        system(paste('cp',dtname1,file.path(tmpdir,'dt1_25kb.txt')))
        system(paste('gzip -c',file.path(tmpdir,'dt1_25kb.txt'),'>',file.path(tmpdir,'dt1_25kb.txt.gz')))
        system(paste('rm',file.path(tmpdir,'dt1_25kb.txt')))

        system(paste('cp',dtname2,file.path(tmpdir,'dt2_25kb.txt')))
        system(paste('gzip -c',file.path(tmpdir,'dt2_25kb.txt'),'>',file.path(tmpdir,'dt2_25kb.txt.gz')))
        system(paste('rm',file.path(tmpdir,'dt2_25kb.txt')))

        dt1 <- scater::readSparseCounts(file.path(tmpdir,'dt1_25kb.txt.gz'))
        dt1 <- dt1[unique(rownames(dt1)),]

        dt2 <- scater::readSparseCounts(file.path(tmpdir,'dt2_25kb.txt.gz'))
        dt2 <- dt2[unique(rownames(dt2)),]

        sc <- Seurat::RowMergeSparseMatrices(dt1,dt2)

        rn <- rownames(sc)

        grDatamatrix <- with(as.data.table(do.call(rbind,strsplit(rn,":|-"))), GRanges(V1,IRanges::IRanges(as.numeric(V2),as.numeric(V3))))

        orderRanges <- order(grDatamatrix)

        grDatamatrix <- grDatamatrix[orderRanges]
        rn <- rn[orderRanges]
        dt1 <- dt1[orderRanges,]
        dt2 <- dt2[orderRanges,]
        sc <- sc[orderRanges,]

        total_cell <- c(ncol(dt1),ncol(dt2))
        sample_name <- c('HBCx-22','HBCx-22-TamR')
        annot_raw <- rbindlist(list(data.table(barcode=colnames(dt1), cell_id=paste0(sample_name[1], "_c", 1:total_cell[1]), sample_id=rep(sample_name[1], total_cell[1]), batch_id=1),
                                    data.table(barcode=colnames(dt2), cell_id=paste0(sample_name[2], "_c", 1:total_cell[2]), sample_id=rep(sample_name[2], total_cell[2]), batch_id=1)))

        colnames(sc) <- c(paste0(sample_name[1], "_c", 1:total_cell[1]),
                          paste0(sample_name[2], "_c", 1:total_cell[2]))

        rm(dt1,dt2)

        sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = sc),
                                                         rowRanges = grDatamatrix,
                                                         colData = data.frame(Cluster = annot_raw$sample_id))

        sc <- sc[seqnames(sc)%in%c(paste0('chr',1:22),'chrX','chrY'),]

        Rleagg <- data.table(Counts = Matrix::rowSums(SummarizedExperiment::assay(sc)))[,rl := rleid(Counts)][,N := 1:.N]
        Rleagg <- merge(Rleagg,Rleagg[,list(SumN = .N),by = 'rl'],by = 'rl',all.x = TRUE)
        sc <- sc[-Rleagg[,which(Counts==0 & SumN>2)],]

        Cusanovich2018.25kb <- runCusanovich2018(sc, peaks = TRUE, returnSc = TRUE,legend.title = 'Sample Origin')
        cisTopic.25kb <- runCisTopic(sc = Cusanovich2018.25kb$sc,legend.title = 'Sample Origin')
        SnapATAC.25kb <- runSnapATAC(sc = Cusanovich2018.25kb$sc,legend.title = 'Sample Origin')
        Scasat.25kb <- runScasat(sc = Cusanovich2018.25kb$sc,legend.title = 'Sample Origin')

        save(Cusanovich2018.25kb,
             cisTopic.25kb,
             SnapATAC.25kb,
             Scasat.25kb,file = file.path(path,'Analysis_25kb.RData'),compress = 'xz')

        rm(list=setdiff(ls(),'path'))
    }

    # Analysis for 5kb

    if(!file.exists(file.path(path,'Analysis_5kb.RData'))){
        tmpdir <- file.path(path,'Data')
        system(paste('mkdir',tmpdir))

        dtname1 <- '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536862/output_hg38_5000bp/counts/SRR7536862_hg38_5000bp_flagged_rmPCR_RT_rmDup_counts_5000_filt_1000.tsv'
        dtname2 <- '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536863/output_hg38_5000bp/counts/SRR7536863_hg38_5000bp_flagged_rmPCR_RT_rmDup_counts_5000_filt_1000.tsv'

        system(paste('cp',dtname1,file.path(tmpdir,'dt1_5kb.txt')))
        system(paste('gzip -c',file.path(tmpdir,'dt1_5kb.txt'),'>',file.path(tmpdir,'dt1_5kb.txt.gz')))
        system(paste('rm',file.path(tmpdir,'dt1_5kb.txt')))

        system(paste('cp',dtname2,file.path(tmpdir,'dt2_5kb.txt')))
        system(paste('gzip -c',file.path(tmpdir,'dt2_5kb.txt'),'>',file.path(tmpdir,'dt2_5kb.txt.gz')))
        system(paste('rm',file.path(tmpdir,'dt2_5kb.txt')))

        dt1 <- scater::readSparseCounts(file.path(tmpdir,'dt1_5kb.txt.gz'))
        dt1 <- dt1[unique(rownames(dt1)),]

        dt2 <- scater::readSparseCounts(file.path(tmpdir,'dt2_5kb.txt.gz'))
        dt2 <- dt2[unique(rownames(dt2)),]

        sc <- Seurat::RowMergeSparseMatrices(dt1,dt2)

        rn <- rownames(sc)

        grDatamatrix <- with(as.data.table(do.call(rbind,strsplit(rn,":|-"))), GRanges(V1,IRanges::IRanges(as.numeric(V2),as.numeric(V3))))

        orderRanges <- order(grDatamatrix)

        grDatamatrix <- grDatamatrix[orderRanges]
        rn <- rn[orderRanges]
        dt1 <- dt1[orderRanges,]
        dt2 <- dt2[orderRanges,]
        sc <- sc[orderRanges,]

        total_cell <- c(ncol(dt1),ncol(dt2))
        sample_name <- c('HBCx-22','HBCx-22-TamR')
        annot_raw <- rbindlist(list(data.table(barcode=colnames(dt1), cell_id=paste0(sample_name[1], "_c", 1:total_cell[1]), sample_id=rep(sample_name[1], total_cell[1]), batch_id=1),
                                    data.table(barcode=colnames(dt2), cell_id=paste0(sample_name[2], "_c", 1:total_cell[2]), sample_id=rep(sample_name[2], total_cell[2]), batch_id=1)))

        colnames(sc) <- c(paste0(sample_name[1], "_c", 1:total_cell[1]),
                          paste0(sample_name[2], "_c", 1:total_cell[2]))

        rm(dt1,dt2)

        sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = sc),
                                                         rowRanges = grDatamatrix,
                                                         colData = data.frame(Cluster = annot_raw$sample_id))

        sc <- sc[seqnames(sc)%in%c(paste0('chr',1:22),'chrX','chrY'),]

        Rleagg <- data.table(Counts = Matrix::rowSums(SummarizedExperiment::assay(sc)))[,rl := rleid(Counts)][,N := 1:.N]
        Rleagg <- merge(Rleagg,Rleagg[,list(SumN = .N),by = 'rl'],by = 'rl',all.x = TRUE)
        sc <- sc[-Rleagg[,which(Counts==0 & SumN>2)],]

        Cusanovich2018.5kb <- runCusanovich2018(sc, peaks = TRUE, returnSc = TRUE,legend.title = 'Sample Origin')
        cisTopic.5kb <- runCisTopic(sc = Cusanovich2018.5kb$sc,legend.title = 'Sample Origin')
        SnapATAC.5kb <- runSnapATAC(sc = Cusanovich2018.5kb$sc,legend.title = 'Sample Origin')
        Scasat.5kb <- runScasat(sc = Cusanovich2018.5kb$sc,legend.title = 'Sample Origin')

        save(Cusanovich2018.5kb,
             cisTopic.5kb,
             SnapATAC.5kb,
             Scasat.5kb,file = file.path(path,'Analysis_5kb.RData'),compress = 'xz')

        rm(list=setdiff(ls(),'path'))
    }

    if(!file.exists(file.path(path,'Analysis_1kb.RData'))){
        # Analysis for 1kb

        tmpdir <- file.path(path,'Data')
        system(paste('mkdir',tmpdir))

        dtname1 <- '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536862/output_hg38_1000bp/counts/SRR7536862_hg38_1000bp_flagged_rmPCR_RT_rmDup_counts_1000_filt_1000.tsv'
        dtname2 <- '/nas/longleaf/home/baldoni/scratch/Rashid/Project3/Data/Grosselin2019/processedFiles/SRR7536863/output_hg38_1000bp/counts/SRR7536863_hg38_1000bp_flagged_rmPCR_RT_rmDup_counts_1000_filt_1000.tsv'

        system(paste('cp',dtname1,file.path(tmpdir,'dt1_1kb.txt')))
        system(paste('gzip -c',file.path(tmpdir,'dt1_1kb.txt'),'>',file.path(tmpdir,'dt1_1kb.txt.gz')))
        system(paste('rm',file.path(tmpdir,'dt1_1kb.txt')))

        system(paste('cp',dtname2,file.path(tmpdir,'dt2_1kb.txt')))
        system(paste('gzip -c',file.path(tmpdir,'dt2_1kb.txt'),'>',file.path(tmpdir,'dt2_1kb.txt.gz')))
        system(paste('rm',file.path(tmpdir,'dt2_1kb.txt')))

        dt1 <- scater::readSparseCounts(file.path(tmpdir,'dt1_1kb.txt.gz'))
        dt1 <- dt1[unique(rownames(dt1)),]

        dt2 <- scater::readSparseCounts(file.path(tmpdir,'dt2_1kb.txt.gz'))
        dt2 <- dt2[unique(rownames(dt2)),]

        sc <- Seurat::RowMergeSparseMatrices(dt1,dt2)

        rn <- rownames(sc)

        grDatamatrix <- with(as.data.table(do.call(rbind,strsplit(rn,":|-"))), GRanges(V1,IRanges::IRanges(as.numeric(V2),as.numeric(V3))))

        orderRanges <- order(grDatamatrix)

        grDatamatrix <- grDatamatrix[orderRanges]
        rn <- rn[orderRanges]
        dt1 <- dt1[orderRanges,]
        dt2 <- dt2[orderRanges,]
        sc <- sc[orderRanges,]

        total_cell <- c(ncol(dt1),ncol(dt2))
        sample_name <- c('HBCx-22','HBCx-22-TamR')
        annot_raw <- rbindlist(list(data.table(barcode=colnames(dt1), cell_id=paste0(sample_name[1], "_c", 1:total_cell[1]), sample_id=rep(sample_name[1], total_cell[1]), batch_id=1),
                                    data.table(barcode=colnames(dt2), cell_id=paste0(sample_name[2], "_c", 1:total_cell[2]), sample_id=rep(sample_name[2], total_cell[2]), batch_id=1)))

        colnames(sc) <- c(paste0(sample_name[1], "_c", 1:total_cell[1]),
                          paste0(sample_name[2], "_c", 1:total_cell[2]))

        rm(dt1,dt2)

        sc <- SingleCellExperiment::SingleCellExperiment(assays = list(counts = sc),
                                                         rowRanges = grDatamatrix,
                                                         colData = data.frame(Cluster = annot_raw$sample_id))

        sc <- sc[seqnames(sc)%in%c(paste0('chr',1:22),'chrX','chrY'),]

        Rleagg <- data.table(Counts = Matrix::rowSums(SummarizedExperiment::assay(sc)))[,rl := rleid(Counts)][,N := 1:.N]
        Rleagg <- merge(Rleagg,Rleagg[,list(SumN = .N),by = 'rl'],by = 'rl',all.x = TRUE)
        sc <- sc[-Rleagg[,which(Counts==0 & SumN>2)],]

        Cusanovich2018.1kb <- runCusanovich2018(sc, peaks = TRUE, returnSc = TRUE,legend.title = 'Sample Origin')
        cisTopic.1kb <- runCisTopic(sc = Cusanovich2018.1kb$sc,legend.title = 'Sample Origin')
        SnapATAC.1kb <- runSnapATAC(sc = Cusanovich2018.1kb$sc,legend.title = 'Sample Origin')
        Scasat.1kb <- runScasat(sc = Cusanovich2018.1kb$sc,legend.title = 'Sample Origin')

        save(Cusanovich2018.1kb,
             cisTopic.1kb,
             SnapATAC.1kb,
             Scasat.1kb,file = file.path(path,'Analysis_1kb.RData'),compress = 'xz')

        rm(list=setdiff(ls(),'path'))
    }
}
plbaldoni/scChIPseqsim documentation built on June 11, 2020, 7:41 p.m.