R/FluteMLE.R

Defines functions FluteMLE

Documented in FluteMLE

#' Downstream analysis based on MAGeCK-MLE result
#'
#' Integrative analysis pipeline using the gene summary table in MAGeCK MLE results
#'
#' @docType methods
#' @name FluteMLE
#' @rdname FluteMLE
#' @aliases flutemle
#'
#' @param gene_summary A data frame or a file path to gene summary file generated by MAGeCK-MLE.
#' @param treatname A character vector, specifying the names of treatment samples.
#' @param ctrlname A character vector, specifying the names of control samples.
#' If there is no controls in your CRISPR screen, you can specify "Depmap" as ctrlname and set `incorporateDepmap=TRUE`.
#' @param keytype "Entrez" or "Symbol".
#' @param organism "hsa" or "mmu".
#' @param incorporateDepmap Boolean, indicating whether incorporate Depmap data into analysis.
#' @param cell_lines A character vector, specifying the cell lines in Depmap to be considered.
#' @param lineages A character vector, specifying the lineages in Depmap to be considered.
#' @param norm_method One of "none", "cell_cycle" (default) or "loess".
#' @param posControl A character vector, specifying a list of positive control gene symbols.
#' @param omitEssential Boolean, indicating whether omit common essential genes from the downstream analysis.
#' @param top An integer, specifying number of top selected genes to be labeled in rank figure.
#' @param toplabels A character vector, specifying interested genes to be labeled in rank figure.
#' @param scale_cutoff Boolean or numeric, specifying how many standard deviation will be used as cutoff.
#' @param limit A two-length vector, specifying the minimal and maximal size of gene sets for enrichent analysis.
#' @param pvalueCutoff A numeric, specifying pvalue cutoff of enrichment analysis, default 1.
#' @param enrich_method One of "ORT"(Over-Representing Test) and "HGT"(HyperGemetric test).
#' @param proj A character, indicating the prefix of output file name, which can't contain special characters.
#' @param width The width of summary pdf in inches.
#' @param height The height of summary pdf in inches.
#' @param outdir Output directory on disk.
#' @param pathview.top Integer, specifying the number of pathways for pathview visualization.
#' @param verbose Boolean
#'
#' @author Wubing Zhang
#'
#' @return All of the pipeline results is output into the \code{out.dir}/MAGeCKFlute_\code{proj},
#' which includes a pdf file and many folders. The pdf file 'FluteMLE_\code{proj}_\code{norm_method}.pdf' is the
#' summary of pipeline results. For each section in this pipeline, figures and useful data are
#' outputed to corresponding subfolders.
#' \itemize{
#'   \item {QC}: {Quality control}
#'   \item {Selection}: {Positive selection and negative selection.}
#'   \item {Enrichment}: {Enrichment analysis for positive and negative selection genes.}
#'   \item {PathwayView}: {Pathway view for top enriched pathways.}
#' }
#'
#' @details MAGeCK-MLE can be used to analyze screen data from multi-conditioned experiments. MAGeCK-MLE
#' also normalizes the data across multiple samples, making them comparable to each other. The most important
#' ouput of MAGeCK MLE is `gene_summary` file, which includes the beta scores of multiple conditions and
#' the associated statistics. The `beta score`  for each gene describes how the gene is selected: a positive
#' beta score indicates a positive selection, and a negative beta score indicates a negative selection.
#'
#' The downstream analysis includes identifying essential, non-essential, and target-associated genes,
#' and performing biological functional category analysis and pathway enrichment analysis of these genes.
#' The function also visualizes genes in the context of pathways to benefit users exploring screening data.
#'
#'
#' @seealso \code{\link{FluteRRA}}
#'
#' @examples
#' file3 = file.path(system.file("extdata", package = "MAGeCKFlute"),
#' "testdata/mle.gene_summary.txt")
#' \dontrun{
#'   # functional analysis for MAGeCK MLE results
#'   FluteMLE(file3, treatname = "plx", ctrlname = "dmso", proj = "PLX")
#' }
#'
#' @import ggplot2 stats grDevices utils gridExtra grid
#' @export

FluteMLE <- function(gene_summary, treatname, ctrlname = "Depmap",
                     keytype = "Symbol", organism = "hsa", # Input dataset
                     incorporateDepmap = FALSE,
                     cell_lines = NA, lineages = "All",
                     norm_method = "cell_cycle", posControl = NULL,
                     omitEssential = FALSE,
                     top = 10, toplabels = NA,
                     scale_cutoff = 2, limit = c(0,200),
                     pvalueCutoff=0.25, enrich_method = "ORT", proj = NA,
                     width = 10, height = 7, outdir = ".",
                     pathview.top = 4, verbose = TRUE){
	## Prepare the running environment ##
  {
    message(Sys.time(), " # Create output dir and pdf file...")
    outdir = file.path(outdir, paste0("MAGeCKFlute_", proj))
    dir.create(file.path(outdir), showWarnings = FALSE)
    output_pdf = paste0("FluteMLE_", proj, "_", norm_method, ".pdf")
    pdf(file.path(outdir, output_pdf), width = width, height = height)
  }

  ## Beta Score Preparation ##
  {
    beta = ReadBeta(gene_summary)
    if(tolower(keytype)!="entrez"){
      beta$EntrezID = TransGeneID(beta$Gene, keytype, "Entrez", organism = organism)
    }else{
      beta$EntrezID = beta$Gene
    }
    if(tolower(keytype)!="symbol"){
      beta$Symbol = TransGeneID(beta$Gene, keytype, "Symbol", organism = organism)
    }else{
      beta$Symbol = beta$Gene
    }
    if(organism != "hsa"){
      beta$HumanGene = TransGeneID(beta$Symbol, "Symbol", "Symbol",
                                   fromOrg = organism, toOrg = "hsa")
    }else{
      beta$HumanGene = beta$Symbol
    }

    message(Sys.time(), " # Transform id to official human gene name ...")
    idx1 = is.na(beta$EntrezID)
    idx2 = !is.na(beta$EntrezID) & duplicated(beta$EntrezID)
    idx = idx1|idx2
    if(sum(idx1)>0) message(sum(idx1), " genes fail to convert into Entrez IDs: ",
                           paste0(beta$Gene[idx1], collapse = ", "))
    if(sum(idx2)>0) message(sum(idx2), " genes have duplicate Entrez IDs: ",
                            paste0(beta$Gene[idx2], collapse = ", "))

    dd = beta[!idx, ]
    if(incorporateDepmap | "Depmap"%in%ctrlname)
      dd = IncorporateDepmap(dd, symbol = "HumanGene", cell_lines = cell_lines,
                             lineages = lineages)
    if(!all(c(ctrlname, treatname) %in% colnames(dd)))
      stop("Sample name doesn't match !!!")
    ## Normalization
    if(tolower(norm_method)=="cell_cycle")
      dd = NormalizeBeta(dd, id = "HumanGene", samples = c(ctrlname, treatname),
                         method = "cell_cycle", posControl = posControl)
    if(tolower(norm_method)=="loess")
      dd = NormalizeBeta(dd, id = "HumanGene", samples = c(ctrlname, treatname), method = "loess")
  }
	## Distribution of beta scores ##
	{
	  ## All genes ##
	  outputDir1 = file.path(outdir, "QC/")
	  dir.create(outputDir1, showWarnings = FALSE)
	  idx_distr = c(ctrlname, treatname)
	  P1 = ViolinView(dd[, idx_distr], ylab = "Beta score", main = "All genes",
	                  filename = paste0(outputDir1, "ViolinView_all_", norm_method, ".png"))
	  P2 = DensityView(dd[, idx_distr], xlab = "Beta score", main = "All genes",
	                   filename = paste0(outputDir1, "DensityView_all_", norm_method, ".png"))
	  P3 = ConsistencyView(dd, ctrlname, treatname, main = "All genes",
	                       filename = paste0(outputDir1, "Consistency_all_", norm_method, ".png"))
	  P4 = MAView(dd, ctrlname, treatname, main = "All genes",
	              filename = paste0(outputDir1, "MAView_all_", norm_method, ".png"))
	  grid.arrange(P1, P2, P3, P4, ncol = 2)

	  ## Essential genes ##
	  Zuber_Essential = readRDS(file.path(system.file("extdata", package = "MAGeCKFlute"),
	                            "Zuber_Essential.rds"))
	  if(is.null(posControl))
	    idx = toupper(dd$HumanGene) %in% toupper(Zuber_Essential$GeneSymbol)
	  else
	    idx = toupper(dd$Gene) %in% toupper(posControl)

	  if(sum(idx) > 10){
	    P1 = ViolinView(dd[idx, idx_distr], ylab = "Essential.B.S.", main = "Essential genes",
	                    filename = paste0(outputDir1, "ViolinView_posctrl_", norm_method, ".png"))
	    P2 = DensityView(dd[idx, idx_distr], xlab = "Essential.B.S.", main = "Essential genes",
	                     filename = paste0(outputDir1, "DensityView_posctrl_", norm_method, ".png"))
	    P3 = ConsistencyView(dd[idx, ], ctrlname, treatname, main = "Essential genes",
	                         filename = paste0(outputDir1, "Consistency_posctrl_", norm_method, ".png"))
	    P4 = MAView(dd[idx, ], ctrlname, treatname, main = "Essential genes",
	                filename = paste0(outputDir1, "MAView_posctrl_", norm_method, ".png"))
	    grid.arrange(P1, P2, P3, P4, ncol = 2)
	  }
	}

  ## Combine replicates ##
  {
    dd$Control = Biobase::rowMedians(as.matrix(dd[, ctrlname, drop = FALSE]))
    dd$Treatment = Biobase::rowMedians(as.matrix(dd[, treatname, drop = FALSE]))
    dd$Diff = dd$Treatment - dd$Control
    write.table(dd, paste0(outputDir1, proj, "_processed_data.txt"),
                sep = "\t", row.names = FALSE, quote = FALSE)
    x_cut = c(-CutoffCalling(dd$Control, scale_cutoff),
              CutoffCalling(dd$Control, scale_cutoff))
    y_cut = c(-CutoffCalling(dd$Treatment, scale_cutoff),
              CutoffCalling(dd$Treatment, scale_cutoff))
    intercept = c(-CutoffCalling(dd$Diff, scale_cutoff),
                  CutoffCalling(dd$Diff, scale_cutoff))
    if(omitEssential){
      dd = OmitCommonEssential(dd, symbol = "HumanGene", dependency = -0.4)
      write.table(dd, paste0(outputDir1, proj, "_omitessential_data.txt"),
                  sep = "\t", row.names = FALSE, quote = FALSE)
    }
    dd$Rank = rank(dd$Diff)
    dd$RandomIndex = sample(1:nrow(dd), nrow(dd))
    dd$Gene = dd$Symbol
  }

	## Drug-targeted genes ##
	{
	  outputDir2 = file.path(outdir, "Selection/")
	  dir.create(outputDir2, showWarnings = FALSE)

	  p1 = ScatterView(dd, "Control", "Treatment", groups = c("top", "bottom"),
	                   groupnames = c("GroupA", "GroupB"), intercept = intercept)
	  ggsave(paste0(outputDir2, "ScatterView_TreatvsCtrl_", norm_method, ".png"),
	         p1, width = 4, height = 3)
	  write.table(p1$data, paste0(outputDir2, "Data_ScatterView_TreatvsCtrl.txt"),
	              sep = "\t", row.names = FALSE, quote = FALSE)
	  p2 = ScatterView(dd, x = "Rank", y = "Diff", label = "Symbol",
	                   groups = c("top", "bottom"), groupnames = c("GroupA", "GroupB"),
	                   top = top, y_cut = y_cut)
	  ggsave(paste0(outputDir2, "RankView_Treat-Ctrl_", norm_method, ".png"),
	         p2, width = 3, height = 5)
	  p3 = ScatterView(dd[dd$Diff>0, ], x = "RandomIndex", y = "Diff", label = "Symbol",
	                   y_cut = y_cut, groups = "top", groupnames = c("GroupA"), top = top)
	  ggsave(paste0(outputDir2, "ScatterView_Treat-Ctrl_Positive_", norm_method, ".png"),
	         p3, width = 4, height = 3)
	  p4 = ScatterView(dd[dd$Diff<0, ], x = "RandomIndex", y = "Diff", label = "Symbol",
	                   y_cut = y_cut, groups = "bottom", groupnames = c("GroupB"), top = top)
	  ggsave(paste0(outputDir2, "ScatterView_Treat-Ctrl_Negative_", norm_method, ".png"),
	         p4, width = 4, height = 3)

	  grid.arrange(p1, p2, p3, p4, ncol = 2)
	}

	## Enrichment analysis of negative and positive selected genes ##
	{
	  outputDir3 = file.path(outdir, "Enrichment/")
	  outputDir4 = file.path(outdir, "PathwayView/")
	  dir.create(outputDir3, showWarnings=FALSE)
	  dir.create(outputDir4, showWarnings=FALSE)

	  E1 = EnrichAB(p1$data, pvalue = pvalueCutoff, enrich_method = enrich_method,
	                organism = organism, limit = limit,
	                filename = norm_method, out.dir = outputDir3)
	  # EnrichedView
	  grid.arrange(E1$keggA$gridPlot, E1$reactomeA$gridPlot, E1$gobpA$gridPlot, E1$complexA$gridPlot, ncol = 2)
	  grid.arrange(E1$keggB$gridPlot, E1$reactomeB$gridPlot, E1$gobpB$gridPlot, E1$complexB$gridPlot, ncol = 2)

	  # Pathway view for top 4 pathway
	  if(!is.null(E1$keggA$enrichRes) && nrow(E1$keggA$enrichRes)>0)
	    arrangePathview(dd, gsub("KEGG_", "", E1$keggA$enrichRes$ID),
	                    top = pathview.top, ncol = 2, title="Group A",
	                    organism=organism, output=outputDir4)
	  if(!is.null(E1$keggB$enrichRes) && nrow(E1$keggB$enrichRes)>0)
	    arrangePathview(dd, gsub("KEGG_", "", E1$keggB$enrichRes$ID),
	                    top = pathview.top, ncol = 2,
	                    title="Group B", organism=organism,
	                    output=outputDir4)
  }

	## Nine-squares ##
	{
	  p1 = ScatterView(dd, x = "Control", y = "Treatment", label = "Symbol",
	                   groups = c("midleft", "topcenter", "midright", "bottomcenter"),
	                   groupnames = c("Group1", "Group2", "Group3", "Group4"),
	                   top = top, display_cut = TRUE,
	                   x_cut = x_cut, y_cut = y_cut, intercept = intercept)
	  ggsave(paste0(outputDir2, "SquareView_", norm_method, ".png"), p1, width = 5, height = 4)
	  write.table(p1$data, paste0(outputDir2, proj, "squareview_data.txt"),
	              sep = "\t", row.names = FALSE, quote = FALSE)
	  grid.arrange(p1, ncol = 1)
	}

	## Nine-Square grouped gene enrichment ##
	{
	  E1 = EnrichSquare(p1$data, id = "EntrezID", keytype = "entrez",
	                    x = "Control", y = "Treatment", organism=organism,
	                    pvalue = pvalueCutoff, enrich_method = enrich_method,
	                    filename=norm_method, limit = limit, out.dir=outputDir3)
    # EnrichView
	  grid.arrange(E1$kegg1$gridPlot, E1$reactome1$gridPlot, E1$gobp1$gridPlot, E1$complex1$gridPlot, ncol = 2)
	  grid.arrange(E1$kegg2$gridPlot, E1$reactome2$gridPlot, E1$gobp2$gridPlot, E1$complex2$gridPlot, ncol = 2)
	  grid.arrange(E1$kegg3$gridPlot, E1$reactome3$gridPlot, E1$gobp3$gridPlot, E1$complex3$gridPlot, ncol = 2)
	  grid.arrange(E1$kegg4$gridPlot, E1$reactome4$gridPlot, E1$gobp4$gridPlot, E1$complex4$gridPlot, ncol = 2)
	  grid.arrange(E1$kegg12$gridPlot, E1$reactome12$gridPlot, E1$gobp12$gridPlot, E1$complex12$gridPlot, ncol = 2)
	  grid.arrange(E1$kegg13$gridPlot, E1$reactome13$gridPlot, E1$gobp13$gridPlot, E1$complex13$gridPlot, ncol = 2)
	  grid.arrange(E1$kegg24$gridPlot, E1$reactome24$gridPlot, E1$gobp24$gridPlot, E1$complex24$gridPlot, ncol = 2)
	  grid.arrange(E1$kegg34$gridPlot, E1$reactome34$gridPlot, E1$gobp34$gridPlot, E1$complex34$gridPlot, ncol = 2)

	  # PathwayView
	  if(!is.null(E1$kegg1$enrichRes) && nrow(E1$kegg1$enrichRes)>0)
	    arrangePathview(dd, gsub("KEGG_", "", E1$kegg1$enrichRes$ID), ncol = 2,
	                    top = pathview.top, title = "Group 1", organism=organism,
	                    output=outputDir4)
	  if(!is.null(E1$kegg2$enrichRes) && nrow(E1$kegg2$enrichRes)>0)
	    arrangePathview(dd, gsub("KEGG_", "", E1$kegg2$enrichRes$ID), ncol = 2,
	                    top = pathview.top, title = "Group 2",
	                    organism=organism,output=outputDir4)
	  if(!is.null(E1$kegg3$enrichRes) && nrow(E1$kegg3$enrichRes)>0)
	    arrangePathview(dd, gsub("KEGG_", "", E1$kegg3$enrichRes$ID), ncol = 2,
	                    top = pathview.top, title = "Group 3",
	                    organism=organism, output=outputDir4)
	  if(!is.null(E1$kegg4$enrichRes) && nrow(E1$kegg4$enrichRes)>0)
	    arrangePathview(dd, gsub("KEGG_", "", E1$kegg4$enrichRes$ID), ncol = 2,
	                    title = "Group 4", organism = organism,
	                    top = pathview.top, output=outputDir4)
	  if(!is.null(E1$kegg12$enrichRes) && nrow(E1$kegg12$enrichRes)>0)
	    arrangePathview(dd, gsub("KEGG_", "", E1$kegg12$enrichRes$ID), ncol = 2,
	                    title = "Group 1 & Group 2", organism=organism,
	                    top = pathview.top, output=outputDir4)
	  if(!is.null(E1$kegg13$enrichRes) && nrow(E1$kegg13$enrichRes)>0)
	    arrangePathview(dd, gsub("KEGG_", "", E1$kegg13$enrichRes$ID), ncol = 2,
	                    title = "Group 1 & Group 3", organism=organism,
	                    top = pathview.top, output=outputDir4)
	  if(!is.null(E1$kegg24$enrichRes) && nrow(E1$kegg24$enrichRes)>0)
	    arrangePathview(dd, gsub("KEGG_", "", E1$kegg24$enrichRes$ID), ncol = 2,
	                    title = "Group 2 & Group 4", organism=organism,
	                    top = pathview.top, output=outputDir4)
	  if(!is.null(E1$kegg34$enrichRes) && nrow(E1$kegg34$enrichRes)>0)
	    arrangePathview(dd, gsub("KEGG_", "", E1$kegg34$enrichRes$ID), ncol = 2,
	                    title = "Group 3 & Group 4", organism=organism,
	                    top = pathview.top, output=outputDir4)
  }
	dev.off()
}

Try the MAGeCKFlute package in your browser

Any scripts or data that you put into this service are public.

MAGeCKFlute documentation built on Nov. 8, 2020, 5:40 p.m.