R/mageck_rra.R

Defines functions plot_mageck_rra_sgrna_barplot plot_mageck_rra_gene_volcano gene_summary_top_n_genes read_mageck_rra_sgrna_summary read_mageck_rra_gene_summary

Documented in gene_summary_top_n_genes plot_mageck_rra_gene_volcano plot_mageck_rra_sgrna_barplot read_mageck_rra_gene_summary read_mageck_rra_sgrna_summary

# Copyright (c) 2021 Genome Research Ltd
#
# Author: CASM/Cancer IT <cgphelp@sanger.ac.uk>
#
# This file is part of RCRISPR.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# 1. The usage of a range of years within a copyright statement contained within
# this distribution should be interpreted as being equivalent to a list of years
# including the first and last year specified and all consecutive years between
# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007-
# 2009, 2011-2012’ should be interpreted as being identical to a statement that
# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright
# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being
# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008,
# 2009, 2010, 2011, 2012’.
#
#' Reads MAGeCK RRA gene summary
#'
#' @description
#' Reads in a MAGeCK RRA gene summary file to a `MageckGeneRRA` object.
#'
#' @param filepath character string specifying a file path.
#' @param ... additional read.delim parameters.
#'
#' @return a data frame containing MAGeCK RRA gene summary.
#' @importFrom methods new
#' @importFrom methods validObject
#' @export read_mageck_rra_gene_summary
read_mageck_rra_gene_summary <-
  function(filepath = NULL,
           ...) {
    # Check if filepath is null
    if (is.null(filepath))
      stop("Cannot read MAGeCK RRA gene summary, filepath is null.")
    # Validate input file
    check_file(filepath)
    # Read MAGeCK RRA gene summary file
    gene_summary <- read_file_to_dataframe(filepath = filepath, ...)
    # Validate data frame
    check_dataframe(gene_summary)
    # Try reading gene summary into sobject
    gene_summary_object <- tryCatch({
      # Create new MAGeCK RRA gene summary object
      new(
        "MageckGeneRRA",
        filepath = filepath,
        gene_summary = gene_summary
      )
    }, error = function(e) {
      # Stop if there is an error
      stop(e)
    })
    # Validate MageckGeneRRA object
    validObject(gene_summary_object)
    # Return MageckGeneRRA object
    return(gene_summary_object)
  }

#' Reads MAGeCK RRA sgRNA summary
#'
#' @description
#' Reads in a MAGeCK RRA sgRNA summary file to a `MageckSgrnaRRA` object.
#'
#' @param filepath character string specifying a file path.
#' @param ... additional read.delim parameters.
#'
#' @return a data frame containing MAGeCK RRA sgRNA summary.
#' @importFrom methods new
#' @importFrom methods validObject
#' @export read_mageck_rra_sgrna_summary
read_mageck_rra_sgrna_summary <-
  function(filepath = NULL,
           ...) {
    # Check if filepath is null
    if (is.null(filepath))
      stop("Cannot read MAGeCK RRA sgRNA summary, filepath is null.")
    # Validate input file
    check_file(filepath)
    # Read MAGeCK RRA sgRNA summary file
    sgrna_summary <- read_file_to_dataframe(filepath = filepath, ...)
    # Validate data frame
    check_dataframe(sgrna_summary)
    # Try reading sgRNA summary into object
    sgrna_summary_object <- tryCatch({
      # Create new MAGeCK RRA sgRNA summary object
      new(
        "MageckSgrnaRRA",
        filepath = filepath,
        sgrna_summary = sgrna_summary
      )
    }, error = function(e) {
      # Stop if there is an error
      stop(e)
    })
    # Validate MageckSgrnaRRA object
    validObject(sgrna_summary_object)
    # Return MageckSgrnaRRA object
    return(sgrna_summary_object)
  }

#' Retrieve top n genes from MAGeCK gene summary
#'
#' @description
#' Collects top n genes from a `MageckGeneRRA` object.
#'
#' @param gene_summary_object a `MAGeCKGeneRRA` object.
#' @param n maximum number of genes to retrieve
#' @param fdr maximum FDR threshold
#' @param direction direction of results to retrieve (pos or neg)
#'
#' @return a data frame containing MAGeCK RRA gene results.
#' @import dplyr
#' @export gene_summary_top_n_genes
gene_summary_top_n_genes <-
  function(gene_summary_object = NULL,
           n = 10,
           fdr = 0.05,
           direction = 'neg') {
    # Validate inputs
    if (is.null(gene_summary_object))
      stop("Cannot get top n genes, gene_summary is null.")
    if (!is.numeric(fdr))
      stop(paste("Cannot get top n genes, fdr is not numeric:", fdr))
    if (!direction %in% c('pos', 'neg'))
      stop(paste("Cannot get top n genes, direction is not 'pos' or 'neg':", direction))
    check_is_numeric_and_is_integer(n)
    # Get top n genes by rank and filter by fdr
    filter_string <- paste0(direction, '.rank <= ', n, ' & ', direction, '.fdr < ', fdr)
    top_genes <- get_mageck_gene_summary(gene_summary_object, filter_string)
    # Warn if number of rows returned is less than n
    if (nrow(top_genes) < n)
      warning("Fewer than n top genes returned:", nrow(top_genes))
    return(top_genes)
  }

#' Plot MAGeCK RRA gene volcano
#'
#' @description
#' Volcano plot of MAGeCK RRA gene summary results.
#'
#' @param gene_summary_object a `MAGeCKGeneRRA` object.
#' @param n maximum number of genes to label (in each direction)
#' @param color_enriched color for enriched genes
#' @param color_depleted color for depleted genes
#' @param fdr maximum FDR threshold
#'
#' @return a ggplot2 object.
#' @import dplyr
#' @import ggplot2
#' @importFrom ggrepel geom_label_repel
#' @importFrom grDevices col2rgb
#' @export plot_mageck_rra_gene_volcano
plot_mageck_rra_gene_volcano <-
  function( gene_summary_object = NULL,
            n = 10,
            color_enriched = 'royalblue4',
            color_depleted = 'orangered4',
            fdr = 0.05 ) {
    # Validate inputs
    if (is.null(gene_summary_object))
      stop("Cannot plot MAGeCK gene volcano, gene_summary_object is null.")
    if (!is.numeric(fdr))
      stop(paste("Cannot plot MAGeCK gene volcano, fdr is not numeric:", fdr))
    check_is_numeric_and_is_integer(n)
    # Check color_enriched is valid
    tryCatch({
      col2rgb(color_enriched)
    }, error = function(e) {
      # Stop if there is an error
      stop(paste("Cannot plot MAGeCK gene volcano, color_enriched is invalid:", color_enriched))
    })
    # Check color_depleted is valid
    tryCatch({
      col2rgb(color_depleted)
    }, error = function(e) {
      # Stop if there is an error
      stop(paste("Cannot plot MAGeCK gene volcano, color_depleted is invalid:", color_depleted))
    })
    # Get gene_summary data frame
    gene_summary <- get_mageck_gene_summary(gene_summary_object)
    # Set log fold change (LFC)
    # Note: neg.lfc and pos.lfc are identical
    # Set FDR (combined from each direction)
    gene_summary <- gene_summary %>%
      mutate('LFC' = neg.lfc) %>%
      mutate('FDR'= ifelse((max(c(neg.goodsgrna, pos.goodsgrna))>1) &
                             (neg.goodsgrna > pos.goodsgrna), neg.fdr, pos.fdr) )
    # Label enriched and depleted genes
    gene_summary <- gene_summary %>%
      mutate('is_depleted' = ifelse(neg.fdr < fdr, 1, 0)) %>%
      mutate('is_enriched' = ifelse(pos.fdr < fdr, 1, 0))
    # Get top n enriched genes
    filter_enriched_string <- paste0('pos.rank <= ', n, ' & pos.fdr < ', fdr)
    top_n_enriched_genes <- get_mageck_gene_summary(gene_summary_object, filter_enriched_string)
    # Get top n depleted genes
    filter_depleted_string <- paste0('neg.rank <= ', n, ' & neg.fdr < ', fdr)
    top_n_depleted_genes <- get_mageck_gene_summary(gene_summary_object, filter_depleted_string)
    # Validate gene summary
    check_dataframe(gene_summary)
    # Build basic volcano
    mageck_rra_gene_volcano <-
      ggplot(gene_summary, aes(x=LFC, y=-log10(FDR))) +
      geom_point(data = subset(gene_summary, is_enriched == 0 & is_depleted == 0 ),
                 color="grey50", alpha=0.2, size = 0.7) +
      geom_point(data = subset(gene_summary, is_depleted == 1 ),
                 color=color_depleted, alpha=0.8, size = 0.7) +
      geom_point(data = subset(gene_summary, is_enriched == 1 ),
                 color=color_enriched, alpha=0.8, size = 0.7) +
      geom_hline(yintercept = -log10(fdr), linetype = "dotted") +
      geom_vline(xintercept = c(-0.5, 0.5), linetype = "dotted") +
      ylab('-log10 FDR') +
      xlab('Log fold change (LFC)') +
      theme_classic()
    # Add labels for enriched genes
    if (nrow(top_n_enriched_genes) > 0) {
      mageck_rra_gene_volcano <-
        mageck_rra_gene_volcano +
        geom_label_repel(data = subset(gene_summary, gene_summary$id %in% top_n_enriched_genes$id),
                         aes(label = id),
                         fontface = 'bold', fill=color_enriched, label.size = 0.2,
                         alpha = 0.75, force = 3 )
    }
    # Add labels for depleted genes
    if (nrow(top_n_depleted_genes) > 0) {
      mageck_rra_gene_volcano <-
        mageck_rra_gene_volcano +
        geom_label_repel(data = subset(gene_summary, gene_summary$id %in% top_n_depleted_genes$id),
                         aes(label = id),
                         fontface = 'bold', fill=color_depleted, label.size = 0.2,
                         alpha = 0.75, force = 3 )
    }
    return(mageck_rra_gene_volcano)
}

#' Plot MAGeCK RRA sgRNA LFC barplot
#'
#' @description
#' Barplot of MAGeCK RRA sgRNA log fold changes (LFC) for a given gene.
#'
#' @param sgrna_summary_object a `MAGeCKGeneRRA` object.
#' @param gene name of gene to plot
#'
#' @return a ggplot2 object.
#' @import dplyr
#' @import ggplot2
#' @export plot_mageck_rra_sgrna_barplot
plot_mageck_rra_sgrna_barplot <-
  function(sgrna_summary_object = NULL,
           gene = NULL) {
    # Validate inputs
    if (is.null(sgrna_summary_object))
      stop("Cannot plot MAGeCK RRA sgRNA LFC barplot, sgrna_summary_object is null.")
    if (is.null(gene))
      stop("Cannot plot MAGeCK RRA sgRNA LFC barplot, gene is null.")
    # Get sgRNA summary result for a given gene
    sgrna_summary <- get_mageck_sgrna_gene_results(
                      object = sgrna_summary_object,
                      gene = gene)
    # Plot sgRNA LFCs
    mageck_rra_sgrna_barplot <-
      ggplot(sgrna_summary, aes(x = sgrna, y = LFC)) +
      geom_bar(stat = 'identity', position = 'dodge') +
      coord_flip() +
      xlab('') +
      ylab('Log fold change') +
      theme_classic() +
      theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
            strip.background = element_rect(color = "black", linewidth = 1))
    # Return plot
    return(mageck_rra_sgrna_barplot)
  }
cancerit/RCRISPR documentation built on April 26, 2023, 10:12 p.m.