R/summaryMikado.R

Defines functions summaryMikado

Documented in summaryMikado

#' Multi-scenario analysis of single-cell epigenomic data
#'
#' Add description here
#'
#' @param path directory
#'
#' @return Add return here
#'
#' @details Add details here
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @export
#'
summaryMikado <- function(path){

    Sum = sc = Clustering = ARI = AMI = Method = Time = dt.epigrahmm = completed.dt.dirs = NULL

    path <- file.path(normalizePath("."),path)

    system(paste('mkdir',path))

    files <- list.files(path = 'Analysis',pattern = '*.RData',full.names = T)

    # Loading the data

    clustering <- list()
    timing <- list()
    tb.timing <- list()
    tb.clustering <- list()
    fig.umap <- list()

    for(x in seq_len(length(files))){
        if(!file.exists(file.path(path,paste0(gsub('.RData','',gsub('Analysis/','',files[x])),'_Table_Means.tsv')))){
            message(files[x])

            # Loading ARI, AMI, and time
            tryCatch({
                load(files[x])

                tb.clustering[[x]] <- data.table::rbindlist(list(dt.epigrahmm$clustering,
                                                                 dt.mikado$clustering))
                tb.timing[[x]] <- data.table::rbindlist(list(dt.epigrahmm$time,
                                                             dt.mikado$time))
                if(x==1){
                    fig.umap[['epigrahmm']] <- dt.epigrahmm$umap
                    fig.umap[['mikado']] <- dt.mikado$umap

                    load(gsub('Analysis','Data',files[x]))
                    mat = scChIPseqsim:::pseudoBulk(sc,as.factor(SummarizedExperiment::colData(sc)$Cluster),type = 'cols')
                    range = GenomicRanges::reduce(SummarizedExperiment::rowRanges(sc)[1600:2530])
                    fig.pseudoPlot <- mikado:::pseudoPlot(sc = sc,mat = mat,ranges = range,normalize = F)

                    # Creating peak plot
                    subsc <- sc[IRanges::overlapsAny(sc,range),]

                    dt.plot <- data.table::rbindlist(list(data.table::data.table(Counts = Matrix::rowSums(SummarizedExperiment::assay(subsc)),
                                                                                 Start = subsc@rowRanges@ranges@start,
                                                                                 Stop = subsc@rowRanges@ranges@start + subsc@rowRanges@ranges@width,
                                                                                 Peak = IRanges::overlapsAny(subsc,sc.epigrahmm$sc),
                                                                                 Type = 'Pooled'),
                                                          data.table::data.table(Counts = Matrix::rowSums(SummarizedExperiment::assay(subsc)),
                                                                                 Start = subsc@rowRanges@ranges@start,
                                                                                 Stop = subsc@rowRanges@ranges@start + subsc@rowRanges@ranges@width,
                                                                                 Peak = IRanges::overlapsAny(subsc,sc.mikado$sc),
                                                                                 Type = 'Differential')))
                    dt.plot$Type %<>% factor(levels = c('Pooled','Differential'))

                    fig.peaks <- ggplot2::ggplot(data = dt.plot,ggplot2::aes(x = Start,y = Counts))+
                        ggplot2::facet_grid(cols = ggplot2::vars(Type)) +
                        ggplot2::theme_bw()+
                        ggplot2::geom_line()+
                        ggplot2::geom_rect(data = dt.plot[Peak==T,],ggplot2::aes(xmin=Start,xmax=Stop,ymin=1.05*max(Counts),ymax=1.10*max(Counts)))+
                        ggplot2::annotate(size = 2.5,"text",fontface = 'italic',x = c(900000),y = c(27.5),label = c('Candidate peaks'))+
                        ggplot2::xlab(paste0('Genomic Window (',as.character(unique(subsc@rowRanges@seqnames)),')'))+
                        ggplot2::scale_x_continuous(labels = scales::scientific)


                }
            },error = function(cond){message(cond)})
        }
    }

    tb.clustering <- data.table::rbindlist(tb.clustering)
    tb.timing <- data.table::rbindlist(tb.timing)

    tb.clustering$Peak %<>% factor(levels = c('Pooled','Differential'))

    # tb.timing[[x]] <- timing[[x]][,list(Mean_Time = mean(Time,na.rm=T),SD_Time = sd(Time,na.rm = T)),by = eval(names(timing[[x]])[-which(names(timing[[x]])%in%c('Time','Simulation'))])]
    # tb.clustering[[x]] <- clustering[[x]][,list(Mean_ARI = mean(ARI,na.rm=T),SD_ARI = sd(ARI,na.rm = T),Mean_AMI = mean(AMI,na.rm=T),SD_AMI = sd(AMI,na.rm = T)),by = eval(names(clustering[[x]])[-which(names(clustering[[x]])%in%c('ARI','AMI','Simulation'))])]
    #
    # data.table::fwrite(tb.timing[[x]],file = file.path(path,paste0(gsub('Analysis/','',completed.dt.dirs$Dir[x]),'_Table_Time.tsv')))
    # data.table::fwrite(tb.clustering[[x]],file = file.path(path,paste0(gsub('Analysis/','',completed.dt.dirs$Dir[x]),'_Table_Means.tsv')))

    fig.clustering <- ggplot2::ggplot(data = tb.clustering,ggplot2::aes(x = Clustering,y = ARI,fill = Method)) +
        ggplot2::facet_grid(cols = ggplot2::vars(Peak)) +
        ggplot2::geom_boxplot() +
        ggplot2::theme_bw() +
        ggplot2::scale_fill_brewer(palette = 'Set1')+
        ggplot2::theme(legend.position = 'top',legend.direction = 'horizontal',axis.title.x = ggplot2::element_blank(),
                       axis.text = ggplot2::element_text(size = 8),axis.title = ggplot2::element_text(size = 8))

    fig.umap.epigrahmm <- ggpubr::ggarrange(plotlist = fig.umap[['epigrahmm']],legend = 'none',nrow = 2,ncol = 2,labels = list('D'))
    fig.umap.mikado <- ggpubr::ggarrange(plotlist = fig.umap[['mikado']],legend = 'none',nrow = 2,ncol = 2,labels = list('E'))
    fig.umap.combined <- ggpubr::ggarrange(fig.umap.epigrahmm,fig.umap.mikado,nrow = 1,ncol = 2)

    fig1 <- ggpubr::ggarrange(fig.pseudoPlot,nrow = 1,ncol=1,common.legend = T,legend = 'top',labels = list('A'))
    fig2 <- ggpubr::ggarrange(fig.peaks,nrow = 1,ncol=1,common.legend = T,legend = 'top',labels = list('B'))
    fig3 <- ggpubr::ggarrange(fig.clustering,nrow = 1,ncol=1,common.legend = T,legend = 'top',labels = list('C'))
    fig4 <- ggpubr::ggarrange(ggpubr::get_legend(scChIPseqsim:::addSmallLegend(fig.umap[['epigrahmm']]$cisTopic + ggplot2::theme(legend.position = 'bottom',legend.direction = 'horizontal'),pointSize = 1.5, textSize = 7, spaceLegend = 0.5,legend.title = "Cell Type")),
                              fig.umap.combined,
                              nrow = 2,ncol = 1,heights = c(0.05,0.95))

    fig <- ggpubr::ggarrange(fig1,fig2,fig3,fig4,nrow = 4,ncol = 1,heights = c(0.9,0.7,1.2,1.2)/4)

    ggplot2::ggsave(fig,filename = file.path(path,'Summary_mikado.pdf'),width = 8.5,height = 11)

    ### Creating table

    message('Combining tables...')

    tb.clustering.summary <- tb.clustering[,list(Mean_ARI = mean(ARI),SD_ARI = sd(ARI),Mean_AMI = mean(AMI),SD_AMI = sd(AMI)),by = c('Peak','Method','Clustering')]
    tb.clustering.summary[,ARI := paste0(scChIPseqsim:::rd(Mean_ARI),' (',scChIPseqsim:::rd(SD_ARI),')')][,Mean_ARI:= NULL][,SD_ARI:= NULL]
    tb.clustering.summary[,AMI := paste0(scChIPseqsim:::rd(Mean_AMI),' (',scChIPseqsim:::rd(SD_AMI),')')][,Mean_AMI:=NULL][,SD_AMI:=NULL]

    tb.clustering.summary <- merge(tb.clustering.summary[Peak == 'Pooled',][,Peak := NULL][,Pooled_ARI := ARI][,Pooled_AMI := AMI][,ARI:=NULL][,AMI:=NULL],
                                   tb.clustering.summary[Peak == 'Differential',][,Peak := NULL][,Differential_ARI := ARI][,Differential_AMI := AMI][,ARI:=NULL][,AMI:=NULL],
                                   by = c('Clustering','Method'),all.x = T)

    # Setting up scChIP-seq table

    kable1 <- kableExtra::kable(tb.clustering.summary,format = 'latex',booktabs = T, align = c('l','l','r','r','r','r'),escape = F,
                                col.names = kableExtra::linebreak(c('Clustering\nAlgorithm','Method','ARI','AMI','ARI','AMI'),align = 'c'),
                                caption = 'Performance of scATAC-seq methods on simulated scChIP-seq data with candidate peaks called either on bulk data (Pooled) or on single-cell data with 3-state HMM (Differential).',label = 'project3_simmikado') %>%
        kableExtra::collapse_rows(columns = 1,latex_hline = 'major',valign = 'top') %>%
        kableExtra::add_header_above(c(" " = 2, "Pooled" = 2, "Differential" = 2))

    sink(file.path(path,'Simulation_mikado.tex'))
    cat(kable1)
    sink()
}
plbaldoni/scChIPseqsim documentation built on June 11, 2020, 7:41 p.m.