R/plot.R

Defines functions plot_combn_gene_residuals plot_combn_residuals plot_condition_gene_residuals plot_condition_residuals plot_combn_response plot_condition_response plot_heatmap plot_samples plot_reads plot_reads_qc essential_lfc_qc plot_lfc_qc

Documented in essential_lfc_qc plot_combn_gene_residuals plot_combn_residuals plot_combn_response plot_condition_gene_residuals plot_condition_residuals plot_condition_response plot_heatmap plot_lfc_qc plot_reads plot_reads_qc plot_samples

######
# PLOTTING CODE
######

#' Plot replicate comparisons.
#' 
#' Plots replicate comparisons for all replicates in a list of screens and outputs
#' plots to a given folder.
#' 
#' @param df Reads or lfc dataframe.
#' @param screens List of screens created with \code{add_screens}.
#' @param output_folder Folder to output plots to. 
#' @param negative_controls List of negative control genes to append to default list of
#'   non-essential genes (default NULL).
#' @param control_label Label for negative control genes in plot (default "Negative controls").
#' @param plot_type Type of plot to output, one of "png" or "pdf" (default "png").
#' @param display_numbers Whether or not to include PCC values in heatmap (default TRUE).
#' @export 
plot_lfc_qc <- function(df, screens, output_folder, negative_controls = NULL, 
                        control_label = "Negative controls", plot_type = "png", 
                        display_numbers = TRUE) {
  
  # Checks for input errors
  check_screen_params(df, screens)
  
  # Gets essential gene QC metrics
  auc <- essential_lfc_qc(df, screens, negative_controls = negative_controls)
  auc_file <- file.path(output_folder, "essential_PR_QC.tsv")
  if (!is.null(auc)) {
    utils::write.table(auc, auc_file, quote = FALSE, sep = "\t",
                       row.names = FALSE, col.names = TRUE) 
  }
  
  # Checks plot type and converts to lowercase
  plot_type <- tolower(plot_type)
  if (plot_type != "png" & plot_type != "pdf") {
    stop("plot_type must be either png or pdf")
  }
  
  # Builds dataframe of replicate PCCs
  cor_df <- NULL
  
  # Adds nonessential labels to df and sets color scheme for replicate plots
  nonessentials <- traver_nonessentials
  nonessentials <- c(nonessentials, negative_controls)
  nonessential_ind <- (df$gene1 %in% nonessentials & df$gene2 %in% nonessentials) |
    (df$gene1 %in% nonessentials & df$gene2 == "NegControl") |
    (df$gene2 %in% nonessentials & df$gene1 == "NegControl")
  df$nonessentials <- "Other"
  df$nonessentials[nonessential_ind] <- control_label
  color_values <- c("Other" = "Gray")
  color_values[control_label] <- "Blue"
  
  # Compares replicates across all screens
  for (screen in screens) {
    rep_cols <- screen[["replicates"]]
    if (length(rep_cols) > 1) {
      pairs <- utils::combn(rep_cols, 2)
      for (i in 1:ncol(pairs)) {
        col1 <- pairs[1,i]
        col2 <- pairs[2,i]
        x_label <- paste0(col1, " log2 fold change")
        y_label <- paste0(col2, " log2 fold change")
        temp <- plot_samples(df, col1, col2, x_label, y_label, print_cor = TRUE,
                             color_col = "nonessentials", color_lab = "Guide type",
                             color_values = color_values)
        p <- temp[[1]]
        pcc <- temp[[2]]
        scc <- temp[[3]]
        file_name <- paste0(col1, "_vs_", col2, "_replicate_comparison.", plot_type)
        suppressWarnings(ggplot2::ggsave(file.path(output_folder, file_name), width = 10, height = 7, dpi = 300))
        
        # Stores PCC in dataframe
        if (is.null(cor_df)) {
          cor_df <- data.frame(rep1 = col1, rep2 = col2, pcc = pcc, scc = scc,
                               stringsAsFactors = FALSE)
        } else {
          cor_df <- rbind(cor_df, c(col1, col2, pcc, scc))
        }
      }   
    }
  }
  
  # Writes PCCs to file
  cor_file <- file.path(output_folder, "replicate_cor.tsv")
  utils::write.table(cor_df, cor_file, quote = FALSE, sep = "\t",
                     row.names = FALSE, col.names = TRUE)
  
  # Gets sample groups
  all_cols <- c()
  col_groups <- c()
  i <- 1
  for (screen_name in names(screens)) {
    screen <- screens[[screen_name]]
    for (col in screen[["replicates"]]) {
      all_cols <- c(all_cols, col)
      col_groups[i] <- screen_name
      i <- i +1
    }
  }
  
  # Plots heatmap of log-scaled read counts
  df <- df[,all_cols]
  heatmap_file <- file.path(output_folder, paste0("lfc_heatmap.", plot_type))
  plot_heatmap(df, col_groups, heatmap_file, display_numbers)
}

#' Computes essential gene recovery AUC.
#' 
#' Computes area under the curve for ROC curves that measure how well each technical replicate
#' recovers signal for essential-targeting guides. Only computes AUC for guides that target 
#' essential genes twice, guides that target two different essential genes, or guides that 
#' target an essential gene and an intergenic region.
#' 
#' @param df LFC dataframe.
#' @param screens List of screens generated with \code{add_screens}. 
#' @param negative_controls List of negative control genes to append to default list of
#'   non-essential genes (default NULL).
#' @return Returns a dataframe with three columns for replicate name, essential AUC relative 
#'   to all other genes, and essential AUC relative to a specified set of non-essentials.
essential_lfc_qc <- function(df, screens, negative_controls = NULL) {
  
  # Loads gene standards from internal data
  essentials <- traver_core_essentials
  nonessentials <- traver_nonessentials
  
  # Adds genes to nonessentials list if specified
  if (!is.null(negative_controls)) {
    nonessentials <- c(nonessentials, negative_controls) 
  }
  
  # Gets indices of essential-targeting guides
  essential_ind <- (df$gene1 %in% essentials & df$gene2 %in% essentials) |
    (df$gene1 %in% essentials & df$gene2 == "NegControl") |
    (df$gene2 %in% essentials & df$gene1 == "NegControl")
  
  # Gets indices of nonessential-targeting guides
  nonessential_ind <- (df$gene1 %in% nonessentials & df$gene2 %in% nonessentials) |
    (df$gene1 %in% nonessentials & df$gene2 == "NegControl") |
    (df$gene2 %in% nonessentials & df$gene1 == "NegControl")
  
  # Throws warning if too few genes in standards
  skip_nonessential <- FALSE
  if (sum(essential_ind) < 10) {
    warning(paste("too few essential-targeting guides in df, skipping all AUC computation"))
    return(NULL)
  }
  if (sum(nonessential_ind) < 10) {
    warning(paste("too few nonessential-targeting guides in df, skipping non-essential AUC computation"))
    skip_nonessential <- TRUE
  }
  
  # Gets PR curves for all essential genes relative to all other genes
  results <- data.frame(screen = NA,
                        replicate = NA, 
                        essential_AUC_all = NA,
                        essential_AUC_nonessential = NA)
  counter <- 1
  for (screen_name in names(screens)) {
    screen <- screens[[screen_name]]
    for (rep in screen[["replicates"]]) {
      
      # Gets replicate-specific indices (taking NAs into account)
      temp <- df[!is.na(df[[rep]]),]
      essential_ind_rep <- (temp$gene1 %in% essentials & temp$gene2 %in% essentials) |
        (temp$gene1 %in% essentials & temp$gene2 == "NegControl") |
        (temp$gene2 %in% essentials & temp$gene1 == "NegControl")
      nonessential_ind_rep <- (temp$gene1 %in% nonessentials & temp$gene2 %in% nonessentials) |
        (temp$gene1 %in% nonessentials & temp$gene2 == "NegControl") |
        (temp$gene2 %in% nonessentials & temp$gene1 == "NegControl")
      
      # Computes AUC relative to all genes
      auc1 <- NA
      auc2 <- NA
      if (sum(essential_ind_rep) < 10) {
        warning(paste("too few essential-targeting guides for replicate", rep, "skipping AUC computation"))
      } else {
        roc <- PRROC::roc.curve(-temp[[rep]], weights.class0 = as.numeric(essential_ind_rep), curve = TRUE)
        auc1 <- roc$auc
      }
      
      # Computes AUC relative to non-essential genes
      if (!skip_nonessential & sum(nonessential_ind_rep) > 10) {
        essential_rownames <- rownames(temp)[essential_ind_rep]
        temp <- temp[essential_ind_rep | nonessential_ind_rep,]
        temp_essential_ind <- rownames(temp) %in% essential_rownames
        roc <- PRROC::roc.curve(-temp[[rep]], weights.class0 = as.numeric(temp_essential_ind), curve = TRUE)
        auc2 <- roc$auc
      }
      results[counter,] <- c(screen_name, rep, auc1, auc2)
      counter <- counter + 1
    }
  }
  return(results)
}

#' Plot read counts for a screen.
#' 
#' Plots a histogram of read counts for each replicate of all screens. Also
#' plots total reads for all screens.
#' 
#' @param df Reads dataframe.
#' @param screens List of screens created with \code{add_screens}.
#' @param output_folder Folder to output plots to. 
#' @param log_scale If true, log-normalizes data.
#' @param pseudocount Pseudocounts to add to log-normalized data if specified (default 1).
#' @param display_numbers Whether or not to include PCC values in heatmap (default TRUE).
#' @param plot_type Type of plot to output, one of "png" or "pdf" (default "png").
#' @export
plot_reads_qc <- function(df, screens, output_folder,
                          log_scale = TRUE, pseudocount = 1,
                          display_numbers = TRUE, 
                          plot_type = "png") {
  
  # Checks for input errors
  check_screen_params(df, screens)
  
  # Checks plot type and converts to lowercase
  plot_type <- tolower(plot_type)
  if (plot_type != "png" & plot_type != "pdf") {
    stop("plot_type must be either png or pdf")
  }
  
  # Plots read count histograms for all replicates of all screens and stores total reads 
  reads_df <- NULL
  all_cols <- c()
  col_groups <- c()
  all_coverage <- c()
  i <- 1
  for (screen_name in names(screens)) {
    screen <- screens[[screen_name]]
    for (col in screen[["replicates"]]) {
      total_reads <- sum(df[,col], na.rm = TRUE)
      if (is.null(reads_df)) {
        reads_df <- data.frame(rep = col, reads = total_reads,
                               stringsAsFactors = FALSE)
      } else {
        reads_df <- rbind(reads_df, c(col, total_reads))
      }
      all_coverage <- c(all_coverage, screen[["target_coverage"]])
      all_cols <- c(all_cols, col)
      p <- plot_reads(df, col, log_scale, pseudocount)
      file_name <- paste0(col, "_raw_reads_histogram.", plot_type)
      suppressWarnings(ggplot2::ggsave(file.path(output_folder, file_name), width = 10, height = 7, dpi = 300))
      col_groups[i] <- screen_name
      i <- i + 1
    }
  }
  
  # Gets unique coverage breakpoints
  all_coverage <- unique(all_coverage)
  all_coverage <- all_coverage*nrow(df)
  
  # Plots total reads
  reads_df$reads <- as.numeric(reads_df$reads)
  p <- ggplot2::ggplot(reads_df, ggplot2::aes_string(x = "rep", y = "reads")) +
    ggplot2::geom_bar(stat = "identity", color = "Black", fill = "gray30")
  for (coverage in all_coverage) {
    p <- p + ggplot2::geom_hline(yintercept = coverage, linetype = 2, size = 1, alpha = 0.9, color = "Gray")
  }
  p <- p +
    ggplot2::xlab("Replicate") +
    ggplot2::ylab("Total reads") +
    ggplot2::scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
    ggthemes::theme_tufte(base_size = 20, base_family = "sans") +
    ggplot2::theme(axis.text.x = ggplot2:: element_text(angle = 45, hjust = 1))
  file_name <- paste0("total_reads.", plot_type)
  suppressWarnings(ggplot2::ggsave(file.path(output_folder, file_name), plot = p, width = 10, height = 7, dpi = 300))
  
  # Log-scales read counts if specified
  df <- df[,all_cols]
  if (log_scale) {
    for (col in colnames(df)) {
      df[,col] <- log2(df[,col] + 1)
    }
  }
  
  # Plots heatmap of log-scaled read counts
  heatmap_file <- file.path(output_folder, paste0("reads_heatmap.", plot_type))
  plot_heatmap(df, col_groups, heatmap_file, display_numbers)
}

#' Plot read counts.
#'
#' Plots a histogram of read counts for a given column.
#'
#' @param df Reads dataframe.
#' @param col Name of column to plot.
#' @param log_scale If true, log-normalizes data.
#' @param pseudocount Pseudocounts to add to log-normalized data if specified (default 1).
#' @return A ggplot object.
plot_reads <- function(df, col, log_scale = TRUE, pseudocount = 1) {
  x_label <- paste(col, "log-normalized read counts")
  y_label <- "Number of read counts"
  if (log_scale) {
    df[,col] <- log2(df[,col] + 1)
    y_label <- "Number of log-normalized read counts"
  }
  p <- ggplot2::ggplot(df, ggplot2::aes_string(col)) +
    ggplot2::geom_histogram(bins = 30) +
    ggplot2::xlab(x_label) +
    ggplot2::ylab(y_label) +
    ggthemes::theme_tufte(base_size = 20, base_family = "sans")
  return(p)
}

#' Plots sample comparisons.
#'
#' Pretty-plots comparisons between two samples in a scatterplot.
#
#' @param df Reads or lfc dataframe.
#' @param xcol Name of column containing values to plot on the x-axis.
#' @param ycol Name of column containing values to plot on the y-axis.
#' @param xlab X-axis label.
#' @param ylab Y-axis label.
#' @param color_col Name of column to color points by (optional).
#' @param color_lab Name of color legend (optional, defaults to color_col).
#' @param color_values Named list of discrete values in color_col mapped
#'   to their respective colors (defaults to RColorBrewer's Set2 colors
#'   with NULL).
#' @param print_cor If true, prints Pearson correlation between columns 
#'   (default FALSE).
#' @return A list of three elements. The first is a ggplot object, the
#'   second is the correlation between xcol and ycol, and the third is
#'   the Spearman correlation between xcol and ycol.
plot_samples <- function(df, xcol, ycol, xlab, ylab, 
                         color_col = NULL, color_lab = NULL,
                         color_values = NULL, print_cor = FALSE) {
  
  # Computes correlations and optionally prints Pearson correlation
  pcc <- stats::cor(df[[xcol]], df[[ycol]], use = "complete.obs")
  scc <- stats::cor(df[[xcol]], df[[ycol]], method = "spearman", use = "complete.obs")
  if (print_cor) {
    cat(paste0("Pearson correlation between ", xcol, " and ", ycol, ": ", pcc, "\n"))
  }
  
  # Makes plot
  p <- NULL
  if (is.null(color_col)) {
    p <- ggplot2::ggplot(df, ggplot2::aes_string(x = xcol, y = ycol)) +
      ggplot2::geom_abline(slope = 1, intercept = 0, color = "black", linetype = 2, size = 1) +
      ggplot2::geom_point(size = 1.5, alpha = 0.7) +
      ggplot2::xlab(xlab) +
      ggplot2::ylab(ylab) +
      ggthemes::theme_tufte(base_size = 20, base_family = "sans") 
  } else {
    
    # Gets parameters for a discrete color scale
    if (is.null(color_lab)) {
      color_lab <- color_col
    }
    if (is.null(color_values)) {
      n_discrete <- length(unique(df[[color_col]]))
      n_color <- max(3, n_discrete)
      color_values <- RColorBrewer::brewer.pal(n_color, "Set1")[1:n_discrete]
      names(color_values) <- sort(unique(df[[color_col]]))
    }
    
    # Constructs basic plot
    p <- ggplot2::ggplot() +
      ggplot2::geom_abline(slope = 1, intercept = 0, color = "black", linetype = 2, size = 1)
      
    # Adds layers of points one by one
    for (x in names(color_values)) {
      p <- p + ggplot2::geom_point(data = df[df[[color_col]] == x,], size = 1.5, alpha = 0.7,
                                   mapping = ggplot2::aes_string(x = xcol, y = ycol, color = color_col))
    }
    
    # Finishes plot
    p <- p + ggplot2::scale_color_manual(values = color_values, name = color_lab) +
      ggplot2::xlab(xlab) +
      ggplot2::ylab(ylab) +
      ggthemes::theme_tufte(base_size = 20, base_family = "sans")
  }
  return(list(p, pcc, scc))
}

#' Plots Pearson correlation heatmap.
#' 
#' Plots heatmap of Pearson correlations for a dataframe with labels
#' for groups of samples and a blue-yellow color scheme.
#' 
#' @param df Reads or lfc dataframe
#' @param col_groups List of grouping labels for each column in df. 
#' @param filename Output filename for plot. 
#' @param display_numbers Whether or not to include PCC values in heatmap (default TRUE).
#' @return Writes plot to file with no return value.
plot_heatmap <- function(df, col_groups, filename, display_numbers) {
  
  # Gets colors for different screens
  screen_colors <- NA
  n_colors <- length(unique(col_groups))
  if (n_colors < 10) {
    screen_colors <- list(group = RColorBrewer::brewer.pal(length(unique(col_groups)), "Set1"))
  } else {
    pal <- RColorBrewer::brewer.pal(9, "Set1")
    pal <- grDevices::colorRampPalette(pal)(n_colors)
    screen_colors <- list(group = pal)
  }
  names(screen_colors$group) <- unique(col_groups)
  
  # Gets PCCs for heatmap
  cor_mat <- data.matrix(stats::cor(df, use = "complete.obs"))
  
  # Gets annotation for heatmap
  col_groups <- data.frame("Screen" = col_groups)
  rownames(col_groups) <- colnames(df)
  colnames(cor_mat) <- colnames(df)
  
  # Gets color for heatmap values
  breaks <- seq(-1, 1, by = (1/150))
  pal <- grDevices::colorRampPalette(c("#7fbf7b", "#f7f7f7", "#af8dc3"))(n = length(breaks))
  
  # Plots heatmap of raw reads to file
  pheatmap::pheatmap(cor_mat,
                     border_color = NA,
                     annotation_col = col_groups,
                     annotation_colors = screen_colors,
                     display_numbers = display_numbers,
                     color = pal, 
                     breaks = breaks,
                     filename = filename,
                     width = 10,
                     height = 10)
}

#' Plots drug response for scored data.
#' 
#' Pretty-plots response for data which does not use a derived null model (e.g. for directly
#' comparing drug response to DMSO response). Assumes that data was scored by 
#' \code{score_conditions_vs_control} and significant effects were called by 
#' \code{call_condition_hits}. Writes both a scatterplot and a volcano plot
#' to file in the specified folder.
#' 
#' @param scores Dataframe of scores returned from \code{call_condition_hits}.
#' @param control_name Name of control passed to \code{call_condition_hits}.
#' @param condition_name Name of condition passed to \code{call_condition_hits}.
#' @param output_folder Folder to output plots to. 
#' @param loess If true and data was loess-normalized, plots loess null model instead
#'   (default TRUE).
#' @param neg_type Label for significant effects with a negative differential effect
#'   passed to \code{call_condition_hits} (default "Negative").
#' @param pos_type Label for significant effects with a positive differential effect
#'   passed to \code{call_condition_hits} (default "Positive").
#' @param volcano_type One of "pval" or "FDR" to specify whether to plot -log10(pval)
#'   or -log2(FDR), respectively (default "FDR").
#' @param plot_type Type of plot to output, one of "png" or "pdf" (default "png").
#' @export
plot_condition_response <- function(scores, control_name, condition_name, output_folder,
                                    loess = TRUE, neg_type = "Negative", 
                                    pos_type = "Positive", volcano_type = "FDR",
                                    plot_type = "png") {
  
  # Makes output folder if it doesn't exist
  if (!dir.exists(output_folder)) {
    dir.create(output_folder)
  }
  
  # Manually sets colors and fill for plot
  diff_col <- paste0("differential_", condition_name, "_vs_", control_name)
  scores <- scores[order(scores[[diff_col]]),]
  response_col <- paste0("effect_type_", condition_name)
  colors <- c(neg_type = "Black", "None" = "Gray", pos_type = "Black")
  names(colors)[1] <- neg_type
  names(colors)[3] <- pos_type
  fill <- c(neg_type = "Blue", "None" = "Gray", pos_type = "Yellow")
  names(fill)[1] <- neg_type
  names(fill)[3] <- pos_type

  # Builds basic plot
  p <- ggplot2::ggplot(scores, ggplot2::aes_string(x = paste0("mean_", control_name), 
                                                   y = paste0("mean_", condition_name))) +
    ggplot2::geom_hline(yintercept = 0, linetype = 2, size = 1, alpha = 1, color = "Gray") +
    ggplot2::geom_vline(xintercept = 0, linetype = 2, size = 1, alpha = 1, color = "Gray")
  
  # Appends choice of null model to plot
  if (loess) {
  } else {
    p <- p + 
      ggplot2::geom_abline(slope = 1, intercept = 0, size = 1.5, alpha = 0.5, color = "Black")
  }
  
  # Finishes plot
  p <- p + 
    ggplot2::geom_point(ggplot2::aes_string(color = response_col, fill = response_col), shape = 21, alpha = 0.7) +
    ggplot2::scale_color_manual(values = colors) +
    ggplot2::scale_fill_manual(values = fill) +
    ggplot2::xlab(paste0(control_name, " mean LFC")) +
    ggplot2::ylab(paste0(condition_name, " mean LFC")) +
    ggplot2::labs(fill = "Significant response") +
    ggplot2::guides(color = "none", size = "none") +
    ggthemes::theme_tufte(base_size = 20, base_family = "sans") +
    ggplot2::theme(axis.text.x = ggplot2:: element_text(color = "Black", size = 16),
                   axis.text.y = ggplot2:: element_text(color = "Black", size = 16),
                   legend.text = ggplot2:: element_text(size = 16))
  
  # Saves to file
  file_name <- paste0(condition_name, "_vs_", control_name, "_scatter.", plot_type)
  suppressWarnings(ggplot2::ggsave(file.path(output_folder, file_name), width = 10, height = 7, dpi = 300))
  
  # Gets log-scaled p-value or FDR column for volcano plot
  fdr_col <- paste0("fdr_", condition_name, "_vs_", control_name)
  pval_col <- paste0("pval_", condition_name, "_vs_", control_name)
  sig_col <- "volcano_y"
  ylab <- ""
  if (volcano_type == "FDR") {
    scores[[sig_col]] <- -log2(scores[[fdr_col]])
    ylab <- "-log2(FDR)"
  } else if (volcano_type == "pval") {
    scores[[sig_col]] <- -log10(scores[[pval_col]])
    ylab <- "-log10(p)"
  } else {
    cat(paste("invalid volcano_type value specified, defaulting to -log2(FDR)"))
    scores[[sig_col]] <- -log2(scores[[fdr_col]])
  }
  
  # Makes volcano plot
  p <- ggplot2::ggplot(scores, ggplot2::aes_string(x = diff_col, y = sig_col)) +
    ggplot2::geom_point(ggplot2::aes_string(color = response_col, fill = response_col), shape = 21, alpha = 0.7) +
    ggplot2::scale_color_manual(values = colors) +
    ggplot2::scale_fill_manual(values = fill) +
    ggplot2::xlab(paste0("Differential effect")) +
    ggplot2::ylab(ylab) +
    ggplot2::labs(fill = "Significant response") +
    ggplot2::guides(color = "none", size = "none") +
    ggthemes::theme_tufte(base_size = 20, base_family = "sans") +
    ggplot2::theme(axis.text.x = ggplot2:: element_text(color = "Black", size = 16),
                   axis.text.y = ggplot2:: element_text(color = "Black", size = 16),
                   legend.text = ggplot2:: element_text(size = 16))
  
  # Saves to file
  file_name <- paste0(condition_name, "_vs_", control_name, "_volcano.", plot_type)
  suppressWarnings(ggplot2::ggsave(file.path(output_folder, file_name), width = 10, height = 7, dpi = 300))
}

#' Plots drug response for scored paired data.
#' 
#' Pretty-plots response for data which uses a derived null model (e.g. for comparing
#' dual-gene knockout effects to a multiplicative null model derived from single-gene
#' effects). Assumes that data was scored by \code{score_combn_vs_single} and 
#' significant effects were called by \code{call_combn_hits}. Writes both a scatterplot 
#' and a volcano plot to file in the specified folder.
#' 
#' @param scores Dataframe of scores returned from \code{call_combn_hits}.
#' @param condition_name Name of condition passed to \code{call_combn_hits}.
#' @param output_folder Folder to output plots to. 
#' @param filter_names If a list of column names is given, calls points as non-significant 
#'   if they are significant in the provided columsn (e.g. to remove points significant 
#'   in control screens; default NULL).
#' @param loess If true and data was loess-normalized, plots loess null model instead
#'   (default TRUE).
#' @param neg_type Label for significant effects with a negative differential effect
#'   passed to \code{call_combn_hits} (default "Negative").
#' @param pos_type Label for significant effects with a positive differential effect
#'   passed to \code{call_combn_hits} (default "Positive").
#' @param volcano_type One of "pval" or "FDR" to specify whether to plot -log10(pval)
#'   or -log2(FDR), respectively (default "FDR").
#' @param plot_type Type of plot to output, one of "png" or "pdf" (default "png").
#' @export
plot_combn_response <- function(scores, condition_name, output_folder,
                                filter_names = NULL, loess = TRUE, 
                                neg_type = "Negative", pos_type = "Positive", 
                                volcano_type = "FDR", plot_type = "png") {
  
  # Makes output folder if it doesn't exist
  if (!dir.exists(output_folder)) {
    dir.create(output_folder)
  }
  
  # If filter_names given, remove significant guides with the same effect as a control 
  # type of guides (e.g. DMSO) from plot
  response_col <- paste0("effect_type_", condition_name)
  if (!is.null(filter_names)) {
    for (name in filter_names) {
      control_response_col <- paste0("effect_type_", name)
      scores[[response_col]][
        scores[[response_col]] != "None" & 
          scores[[control_response_col]] == scores[[response_col]]] <-
        "None"
    }
  }
  
  # Manually sets colors for plot
  diff_col <- paste0("differential_combn_vs_single_", condition_name)
  scores <- scores[order(scores[[diff_col]]),]
  colors <- c(neg_type = "Black", "None" = "Gray", pos_type = "Black")
  names(colors)[1] <- neg_type
  names(colors)[3] <- pos_type
  fill <- c(neg_type = "Blue", "None" = "Gray", pos_type = "Yellow")
  names(fill)[1] <- neg_type
  names(fill)[3] <- pos_type
  
  # Plots data
  p <- ggplot2::ggplot(scores, ggplot2::aes_string(x = paste0("mean_single_", condition_name), 
                                          y = paste0("mean_combn_", condition_name))) +
    ggplot2::geom_hline(yintercept = 0, linetype = 2, size = 1, alpha = 1, color = "Gray") +
    ggplot2::geom_vline(xintercept = 0, linetype = 2, size = 1, alpha = 1, color = "Gray")
  
  # Appends choice of null model to plot
  if (loess) {
  } else {
    p <- p + 
      ggplot2::geom_abline(slope = 1, intercept = 0, size = 1.5, alpha = 0.5, color = "Black")
  }
  
  # Finishes plot
  p <- p +
    ggplot2::geom_point(ggplot2::aes_string(color = response_col, fill = response_col), shape = 21, alpha = 0.7) +
    ggplot2::scale_color_manual(values = colors) +
    ggplot2::scale_fill_manual(values = fill) +
    ggplot2::xlab(paste0(condition_name, " mean expected single LFC")) +
    ggplot2::ylab(paste0(condition_name, " mean observed combinatorial LFC")) +
    ggplot2::labs(fill = "Significant response") +
    ggplot2::guides(color = "none", size = "none") +
    ggthemes::theme_tufte(base_size = 20, base_family = "sans") +
    ggplot2::theme(axis.text.x = ggplot2:: element_text(color = "Black", size = 16),
                   axis.text.y = ggplot2:: element_text(color = "Black", size = 16),
                   legend.text = ggplot2:: element_text(size = 16))
  
  # Saves to file
  file_name <- paste0(condition_name, "_combn_scatter.", plot_type)
  suppressWarnings(ggplot2::ggsave(file.path(output_folder, file_name), width = 10, height = 7, dpi = 300))
  
  # Gets log-scaled p-value or FDR columns for volcano plot
  fdr_col1 <- paste0("fdr1_combn_vs_single_", condition_name)
  fdr_col2 <- paste0("fdr2_combn_vs_single_", condition_name)
  pval_col1 <- paste0("pval1_combn_vs_single_", condition_name)
  pval_col2 <- paste0("pval2_combn_vs_single_", condition_name)
  sig_col <- "volcano_y"
  ylab <- ""
  if (volcano_type == "FDR") {
    scores[[sig_col]] <- -log2(rowMeans(scores[,c(fdr_col1, fdr_col2)], na.rm = TRUE))
    ylab <- "Mean -log2(FDR) across orientations"
  } else if (volcano_type == "pval") {
    scores[[sig_col]] <- -log10(rowMeans(scores[,c(fdr_col1, fdr_col2)], na.rm = TRUE))
    ylab <- "Mean -log10(p) across orientations"
  } else {
    cat(paste("invalid volcano_type value specified, defaulting to -log2(FDR)"))
    scores[[sig_col]] <- -log2(rowMeans(scores[[fdr_col1]], scores[[fdr_col2]], na.rm = TRUE))
  }
  
  # Makes volcano plot
  p <- ggplot2::ggplot(scores, ggplot2::aes_string(x = diff_col, y = sig_col)) +
    ggplot2::geom_point(ggplot2::aes_string(color = response_col, fill = response_col), shape = 21, alpha = 0.7) +
    ggplot2::scale_color_manual(values = colors) +
    ggplot2::scale_fill_manual(values = fill) +
    ggplot2::xlab(paste0("Differential effect")) +
    ggplot2::ylab(ylab) +
    ggplot2::labs(fill = "Significant response") +
    ggplot2::guides(color = "none", size = "none") +
    ggthemes::theme_tufte(base_size = 20, base_family = "sans") +
    ggplot2::theme(axis.text.x = ggplot2:: element_text(color = "Black", size = 16),
                   axis.text.y = ggplot2:: element_text(color = "Black", size = 16),
                   legend.text = ggplot2:: element_text(size = 16))
  
  # Saves to file
  file_name <- paste0(condition_name, "_combn_volcano.", plot_type)
  suppressWarnings(ggplot2::ggsave(file.path(output_folder, file_name), width = 10, height = 7, dpi = 300))
}

#' Plot guide-level residuals for all gene pairs.
#' 
#' Plots replicate comparisons for all replicates in a list of screens and outputs
#' plots to a given folder. Works for data returned from \code{call_condition_hits}.
#' 
#' @param scores Dataframe of scores returned from \code{call_condition_hits}.
#' @param residuals Residuals returned with the return_residuals argument set to true
#'   from \code{call_condition_hits}.
#' @param control_name Name of control passed to \code{call_condition_hits}.
#' @param condition_name Name of condition passed to \code{call_condition_hits}.
#' @param output_folder Folder to output plots to. 
#' @param neg_type Label for significant effects with a negative differential effect
#'   passed to \code{call_condition_hits} (default "Negative").
#' @param pos_type Label for significant effects with a positive differential effect
#'   passed to \code{call_condition_hits} (default "Positive").
#' @param plot_type Type of plot to output, one of "png" or "pdf" (default "png").
#' @export 
plot_condition_residuals <- function(scores, residuals, control_name, 
                                     condition_name, output_folder,
                                     neg_type = "Negative", pos_type = "Positive", 
                                     plot_type = "png") {
  
  # Checks plot type and converts to lowercase
  plot_type <- tolower(plot_type)
  if (plot_type != "png" & plot_type != "pdf") {
    stop("plot_type must be either png or pdf")
  }
  
  # Makes output folder if it doesn't exist
  if (!dir.exists(output_folder)) {
    dir.create(output_folder)
  }
  
  # Gets top hits
  response_col <- paste0("effect_type_", condition_name)
  control_col <- paste0("mean_", control_name)
  condition_col <- paste0("mean_", condition_name)
  diff_col <- paste0("differential_", condition_name, "_vs_", control_name)
  scores <- scores[scores[[response_col]] != "None",]
  residuals <- residuals[residuals$n %in% as.numeric(rownames(scores)),]
  residuals$lfc <- residuals[[condition_col]] - residuals[[control_col]]
  
  # Gets ranking of top hits
  neg_order <- order(scores[[diff_col]])
  scores$neg_rank <- NA
  scores$pos_rank <- NA
  scores$neg_rank[neg_order] <- 1:nrow(scores)
  scores$pos_rank[neg_order] <- nrow(scores):1
  
  # Makes LFC plots for all top hits
  for (i in unique(residuals$n)) {
    
    # Gets data and gene names
    df <- residuals[residuals$n == i,]
    ind <- which(as.numeric(rownames(scores)) == i)
    gene1 <- scores$gene1[ind]
    gene2 <- scores$gene2[ind]
    x_label <- paste0("Guides")
    y_label <- paste0("Guide-level differential LFC for ", gene1, "/", gene2)
    
    # Adds ID column for plotting
    df$ID <- paste("Guide", 1:nrow(df))
    
    # Plots data
    p <- ggplot2::ggplot(df) +
      ggplot2::xlab(x_label) +
      ggplot2::ylab(y_label) +
      ggplot2::geom_bar(ggplot2::aes_string(x = "ID", y = "lfc"), stat = "identity", color = "Black", 
                        fill = ggplot2::alpha(c("gray30"), 1)) +
      ggplot2::geom_hline(yintercept = 1, linetype = 2, size = 1, alpha = 0.75, color = "Yellow") +
      ggplot2::geom_hline(yintercept = 0, linetype = 2, size = 1, alpha = 0.75, color = "Gray") +
      ggplot2::geom_hline(yintercept = -1, linetype = 2, size = 1, alpha = 0.75, color = "Blue") +
      ggplot2::coord_flip() +
      ggthemes::theme_tufte(base_size = 20, base_family = "sans")
    
    # Gets type and rank of effect
    effect <- ""
    rank <- 0
    effect_type <- scores[[response_col]][ind]
    if (effect_type == neg_type) {
      effect <- "neg"
      rank <- scores$neg_rank[ind]
    } else {
      effect <- "pos"
      rank <- scores$pos_rank[ind]
    }
    
    # Saves to file
    file_name <- paste0(effect, "_", rank, "_", gene1, "_", gene2, ".", plot_type)
    suppressWarnings(ggplot2::ggsave(file.path(output_folder, file_name), width = 10, height = 7, dpi = 300))
  }
}

#' Plot guide-level residuals for a given gene.
#' 
#' Plots guide-level residuals for a given gene and returns the plot. Works for data 
#' returned from \code{call_condition_hits}.
#' 
#' @param scores Dataframe of scores returned from \code{call_condition_hits}.
#' @param residuals Residuals returned with the return_residuals argument set to true
#'   from \code{call_condition_hits}.
#' @param gene Gene name for guides to plot.
#' @param control_name Name of control passed to \code{call_condition_hits}.
#' @param condition_name Name of condition passed to \code{call_condition_hits}.
#' @return A ggplot object.
#' @export 
plot_condition_gene_residuals <- function(scores, residuals, gene, control_name, condition_name) {
  
  # Checks that genes present in dataframe
  if (!(gene %in% scores$gene1)) {
    stop(paste("entry for gene", gene, "not in scores"))
  }
  
  # Gets guide-level residuals for the given gene
  ind <- which(scores$gene1 == gene)
  df <- residuals[residuals$n == ind,]
  x_label <- paste0("Guides")
  y_label <- paste0("Guide-level differential LFC for ", gene)
  
  # Computes residuals
  control_col <- paste0("mean_", control_name)
  condition_col <- paste0("mean_", condition_name)
  df$lfc <- df[[condition_col]] - df[[control_col]]
  
  # Adds ID column for plotting
  df$ID <- paste("Guide", 1:nrow(df))
  
  # Plots data and returns plot
  p <- ggplot2::ggplot(df) +
    ggplot2::xlab(x_label) +
    ggplot2::ylab(y_label) +
    ggplot2::geom_bar(ggplot2::aes_string(x = "ID", y = "lfc"), stat = "identity", color = "Black", 
                      fill = ggplot2::alpha(c("gray30"), 1)) +
    ggplot2::geom_hline(yintercept = 1, linetype = 2, size = 1, alpha = 0.75, color = "Yellow") +
    ggplot2::geom_hline(yintercept = 0, linetype = 2, size = 1, alpha = 0.75, color = "Gray") +
    ggplot2::geom_hline(yintercept = -1, linetype = 2, size = 1, alpha = 0.75, color = "Blue") +
    ggplot2::coord_flip() +
    ggthemes::theme_tufte(base_size = 20, base_family = "sans")
  return(p)
}

#' Plot guide-level LFCs for all gene pairs.
#' 
#' Plots replicate comparisons for all replicates in a list of screens and outputs
#' plots to a given folder. Works for data returned from \code{call_combn_hits}.
#' 
#' @param scores Dataframe of scores returned from \code{call_combn_hits}.
#' @param residuals Residuals returned with the return_residuals argument set to true
#'   from \code{call_combn_hits}.
#' @param condition_name Name of condition passed to \code{call_combn_hits}.
#' @param output_folder Folder to output plots to. 
#' @param neg_type Label for significant effects with a negative differential effect
#'   passed to \code{call_combn_hits} (default "Negative").
#' @param pos_type Label for significant effects with a positive differential effect
#'   passed to \code{call_combn_hits} (default "Positive").
#' @param plot_type Type of plot to output, one of "png" or "pdf" (default "png").
#' @export 
plot_combn_residuals <- function(scores, residuals, condition_name, output_folder,
                                 neg_type = "Negative", pos_type = "Positive",
                                 plot_type = "png") {
  
  # Checks plot type and converts to lowercase
  plot_type <- tolower(plot_type)
  if (plot_type != "png" & plot_type != "pdf") {
    stop("plot_type must be either png or pdf")
  }
  
  # Makes output folder if it doesn't exist
  if (!dir.exists(output_folder)) {
    dir.create(output_folder)
  }
  
  # Gets top hits across both orientations
  response_col <- paste0("effect_type_", condition_name)
  single_col <- paste0("mean_single_", condition_name)
  combn_col <- paste0("mean_combn_", condition_name)
  diff_col <- paste0("differential_combn_vs_single_", condition_name)
  scores <- scores[scores[[response_col]] != "None",]
  residuals1 <- residuals[[1]]
  residuals2 <- residuals[[2]]
  residuals1 <- residuals1[residuals1$n %in% as.numeric(rownames(scores)),]
  residuals1$lfc <- residuals1[[combn_col]] - residuals1[[single_col]]
  residuals1$orientation <- "Orientation 1"
  
  ignored_orientation <- all(residuals2 == 0)
  if (!ignored_orientation) {
    residuals2 <- residuals2[residuals2$n %in% as.numeric(rownames(scores)),]
    residuals2$lfc <- residuals2[[combn_col]] - residuals2[[single_col]]
    residuals2$orientation <- "Orientation 2" 
  }
  
  # Gets ranking of top hits
  neg_order <- order(scores[[diff_col]])
  scores$neg_rank <- NA
  scores$pos_rank <- NA
  scores$neg_rank[neg_order] <- 1:nrow(scores)
  scores$pos_rank[neg_order] <- nrow(scores):1
  
  # Makes LFC plots for all top hits
  for (i in unique(residuals1$n)) {
    
    # Gets data
    df <- NULL
    if (!ignored_orientation) {
      df1 <- residuals1[residuals1$n == i,]
      df2 <- residuals2[residuals2$n == i,]
      df1$orientation <- paste0(df1$orientation[1], " (n = ", nrow(df1), ")")
      df2$orientation <- paste0(df2$orientation[1], " (n = ", nrow(df2), ")")
      df <- rbind(df1, df2)
    } else {
      df1 <- residuals1[residuals1$n == i,]
      df1$orientation <- paste0(df1$orientation[1], " (n = ", nrow(df1), ")")
      df <- df1
    }
    
    # Gets gene names and sets axis labels
    ind <- which(as.numeric(rownames(scores)) == i)
    gene1 <- scores$gene1[ind]
    gene2 <- scores$gene2[ind]
    x_label <- paste0("Guides")
    y_label <- paste0("Guide-level differential LFC for ", gene1, "/", gene2)
    
    # Adds ID column for plotting
    df$ID <- nrow(df):1
    
    # Plots data
    p <- ggplot2::ggplot(df) +
      ggplot2::xlab(x_label) +
      ggplot2::ylab(y_label) +
      ggplot2::geom_bar(ggplot2::aes_string(x = "ID", y = "lfc"), stat = "identity", color = "Black", 
                        fill = ggplot2::alpha(c("gray30"), 1)) +
      ggplot2::geom_hline(yintercept = 1, linetype = 2, size = 1, alpha = 0.75, color = "Yellow") +
      ggplot2::geom_hline(yintercept = 0, linetype = 2, size = 1, alpha = 0.75, color = "Gray") +
      ggplot2::geom_hline(yintercept = -1, linetype = 2, size = 1, alpha = 0.75, color = "Blue") +
      ggplot2::coord_flip() +
      ggplot2::facet_grid(. ~ orientation) +
      ggthemes::theme_tufte(base_size = 20, base_family = "sans") +
      ggplot2::theme(axis.text.y = ggplot2::element_blank(),
                     axis.ticks.y = ggplot2::element_blank(),
                     panel.border = ggplot2::element_rect(fill = NA, color = "black"))
    
    # Gets type and rank of effect
    effect <- ""
    rank <- 0
    effect_type <- scores[[response_col]][ind]
    if (effect_type == neg_type) {
      effect <- "neg"
      rank <- scores$neg_rank[ind]
    } else {
      effect <- "pos"
      rank <- scores$pos_rank[ind]
    }
      
    # Saves to file
    file_name <- paste0(effect, "_", rank, "_", gene1, "_", gene2, ".", plot_type)
    suppressWarnings(ggplot2::ggsave(file.path(output_folder, file_name), width = 10, height = 7, dpi = 300))
  }
}

#' Plot guide-level residuals for a given gene.
#' 
#' Plots guide-level residuals for a given gene pair and returns the plot. Works for data 
#' returned from \code{call_combn_hits}.
#' 
#' @param scores Dataframe of scores returned from \code{call_combn_hits}.
#' @param residuals Residuals returned with the return_residuals argument set to true
#'   from \code{call_combn_hits}.
#' @param gene1 First gene name for guides to plot.
#' @param gene2 Second gene name for guides to plot.
#' @param condition_name Name of condition passed to \code{call_combn_hits}.
#' @return A ggplot object.
#' @export 
plot_combn_gene_residuals <- function(scores, residuals, gene1, gene2, condition_name) {
  
  # Checks that genes present in dataframe
  if (!((gene1 %in% scores$gene1) & (gene2 %in% scores$gene2)) &
      !((gene1 %in% scores$gene2) & (gene2 %in% scores$gene1))) {
    stop(paste("entry for gene", gene1, "and", gene2, "not in scores"))
  }

  # Splits and preps residuals dataframes
  residuals1 <- residuals[[1]]
  residuals2 <- residuals[[2]]
  residuals1$orientation <- "Orientation 1"
  residuals2$orientation <- "Orientation 2"

  # Gets guide-level residuals for the given gene
  ind <- max(which(scores$gene1 == gene1 & scores$gene2 == gene2),
             which(scores$gene1 == gene2 & scores$gene2 == gene1))
  df1 <- residuals1[residuals1$n == ind,]
  df2 <- residuals2[residuals2$n == ind,]
  df1$orientation <- paste0(df1$orientation[1], " (n = ", nrow(df1), ")")
  df2$orientation <- paste0(df2$orientation[1], " (n = ", nrow(df2), ")")
  df <- rbind(df1, df2)
  x_label <- paste0("Guides")
  y_label <- paste0("Guide-level differential LFC for ", gene1, "/", gene2)

  # Computes residuals
  single_col <- paste0("mean_single_", condition_name)
  combn_col <- paste0("mean_combn_", condition_name)
  df$lfc <- df[[combn_col]] - df[[single_col]]

  # Adds ID column for plotting
  df$ID <- paste("Guide", 1:nrow(df))
  
  # Plots data and returns plot
  p <- ggplot2::ggplot(df) +
    ggplot2::xlab(x_label) +
    ggplot2::ylab(y_label) +
    ggplot2::geom_bar(ggplot2::aes_string(x = "ID", y = "lfc"), stat = "identity", color = "Black", 
                      fill = ggplot2::alpha(c("gray30"), 1)) +
    ggplot2::geom_hline(yintercept = 1, linetype = 2, size = 1, alpha = 0.75, color = "Yellow") +
    ggplot2::geom_hline(yintercept = 0, linetype = 2, size = 1, alpha = 0.75, color = "Gray") +
    ggplot2::geom_hline(yintercept = -1, linetype = 2, size = 1, alpha = 0.75, color = "Blue") +
    ggplot2::coord_flip() +
    ggplot2::facet_grid(. ~ orientation) +
    ggthemes::theme_tufte(base_size = 20, base_family = "sans") +
    ggplot2::theme(axis.text.y = ggplot2::element_blank(),
                   axis.ticks.y = ggplot2::element_blank(),
                   panel.border = ggplot2::element_rect(fill = NA, color = "black"))
  return(p)
}
HenryWard/orthrus documentation built on June 2, 2023, 10:28 p.m.