R/enrichment_functions.R

Defines functions .enrichment .unique_IG_pairs enrichment_by_significance enrichment_by_annotation

Documented in .enrichment enrichment_by_annotation enrichment_by_significance .unique_IG_pairs

#' Generic enrichment function
#'
#' Calculates relative risk given the number of outliers with a variant, the number of outliers, the number
#' of non-outliers with a variant, and the number of non outliers.
#'
#' It is used in a bunch of different things and is internal
.enrichment <-
  function(n.outliers.w.var,
           n.outliers,
           n.non.outliers.w.var,
           n.non.outliers) {
    ratio <-
      (n.outliers.w.var / n.outliers) / (n.non.outliers.w.var / n.non.outliers)
    q <-
      sqrt(1 / n.outliers.w.var - 1 / n.outliers + 1 / n.non.outliers.w.var - 1 /
             n.non.outliers)
    lower.q = ratio * exp(-1.96 * q)
    upper.q = ratio * exp(1.96 * q)
    n.outliers.w.var = n.outliers.w.var
    n.w.var <- n.outliers.w.var + n.non.outliers.w.var

    return(
      data.frame(
        ratio,
        lower.q,
        upper.q,
        n.outliers.w.var,
        n.outliers,
        n.non.outliers.w.var,
        n.non.outliers
      )
    )
  }

#' Unique individual-gene pairs in a data-frame
#'
#' Internal function that returns the number of unique individual-gene pairs from a data-frame
#'
#' @import tidyverse
.unique_IG_pairs <- function(x) {
  x %>%
    dplyr::select(SampleName, GeneID) %>%
    unique %>%
    nrow
}

#' Enrichment of rare variants stratified by outlier significance threshold
#'
#' @param outlier.calls A data frame with columns \code{GeneID}, \code{SampleName}, and \code{outlier.score}. The outlier.score is the
#' result of any expression based test which designates a gene as an outlier at some threshold.
#' @param rare.variants A data frame that lists all rare variants found near individual-gene pairs. Must columsn titled
#' \code{SampleName}, \code{GeneID}, \code{chr}, \code{start}, and \code{end}
#' @param outlier.thresholds Defaults to `c(0.05, 1e-2, 1e-3, 1e-4, 1e-5, 1e-7)`.
#' This script will calculate enrichment at every threshold.
#' @param limit.to.genes.w.outliers Default to `TRUE`. Should I remove genes that are never outliers in any individual?
#' @param base.significance.cutoff Default to `0.05`. Only needed if `limit.to.genes.w.outliers` is true. Use this threshold
#' for deciding whether to exclude genes that are never outliers.
#' @param verbose Defaults to `TRUE` Should I print annoying but helpful messages?
#'
#' @return A data frame with enrichment scores at each significance level.
#'
#' This function takes a data frame of outlier calls and a data frame of rare variant genotype data and produces
#' a data frame with enrichment values representing the relative risk of having a rare variant given outlier status
#' at different significance levels.
#'
#' @export
enrichment_by_significance <- function(outlier.calls,
                                       rare.variants,
                                       outlier.thresholds = c(0.05, 1e-2, 1e-3, 1e-4, 1e-5, 1e-7),
                                       limit.to.genes.w.outliers = T,
                                       base.significance.cutoff = .05,
                                       draw.plot = T,
                                       verbose = T) {
  # if(names(outlier.calls)) != c("GeneID", "SampleName", "outlier.score")){
  #  break("The column names of outlier.calls must be [GeneID, SampleName, outlier.score]")

  # Choose if you want to only look at genes that have at least one outlier.
  if (limit.to.genes.w.outliers) {
    if (verbose)
      cat(
        paste(
          "Only considering genes with at least one outlier at",
          base.significance.cutoff,
          "\n"
        )
      )
    sig.genes <- outlier.calls %>%
      filter(outlier.score < base.significance.cutoff) %>%
      .$GeneID %>%
      unique

    outlier.calls <- filter(outlier.calls, GeneID %in% sig.genes)
  } else {
    if (verbose)
      cat("Considering all genes, even those that are never significant\n")
  }

  if (verbose)
    cat("Checking to only include individuals that have genotype data!!\n")

  indvs.w.scores <- unique(outlier.calls$SampleName)
  indvs.w.RV.calls <- unique(rare.variants$SampleName)
  usable.indvs <- dplyr::intersect(indvs.w.scores, indvs.w.RV.calls)

  outlier.calls.w.RVs <-
    outlier.calls %>%
    dplyr::filter(SampleName %in% usable.indvs) %>%
    dplyr::left_join(rare.variants)

  if(verbose)
    cat("Calculating enrichment scores")

  enrichment.output <- data.frame()
  for (sig in outlier.thresholds) {
    enrichment.output <- rbind(enrichment.output,
                               cbind(
                                 .enrichment(
                                   n.outliers.w.var = outlier.calls.w.RVs %>%
                                     filter(outlier.score < sig & !is.na(chr)) %>%
                                     .unique_IG_pairs,
                                   n.outliers = outlier.calls.w.RVs %>%
                                     filter(outlier.score < sig) %>%
                                     .unique_IG_pairs,
                                   n.non.outliers.w.var = outlier.calls.w.RVs %>%
                                     filter(outlier.score > sig & !is.na(chr)) %>%
                                     .unique_IG_pairs,
                                   n.non.outliers = outlier.calls.w.RVs %>%
                                     filter(outlier.score > sig) %>%
                                     .unique_IG_pairs
                                 )
                                 , sig = sig
                               )
    )
  }


  if(draw.plot){
    plt.tbl <- enrichment.output
    plt.tbl <- plt.tbl %>%
      arrange(desc(sig)) %>%
      filter(n.outliers.w.var > 3) %>%
      mutate(sig = as.factor(sig))
    print(
      ggplot(plt.tbl, aes(sig, ratio)) +
        theme_linedraw() +
        geom_pointrange(aes(ymin = lower.q, ymax = upper.q)) +
        geom_hline(yintercept = 1, color = "red") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
        scale_x_discrete(limits = rev(levels(plt.tbl$sig)))
    )
  }

  return(enrichment.output)
}




#' Enrichment of rare variants stratified by their annotation
#'
#' @param outlier.calls A data frame with columns \code{GeneID}, \code{SampleName}, and \code{outlier.score}. The outlier.score is the
#' result of any expression based test which designates a gene as an outlier at some threshold.
#' @param rare.variants A data frame that lists all rare variants found near individual-gene pairs. Must columsn titled
#' \code{SampleName}, \code{GeneID}, \code{chr}, \code{start}, and \code{end}
#' @param annotations The corresponding annotations for the `rare.variants` table. This will be joined together.
#' @param outlier.thresholds Defaults to `c(0.05, 1e-2, 1e-3, 1e-4, 1e-5, 1e-7)`.
#' This script will calculate enrichment at every threshold.
#' @param limit.to.genes.w.outliers Default to `TRUE`. Should I remove genes that are never outliers in any individual?
#' @param base.significance.cutoff Default to `0.05`. Only needed if `limit.to.genes.w.outliers` is true. Use this threshold
#' for deciding whether to exclude genes that are never outliers.
#' @param verbose Defaults to `TRUE` Should I print annoying but helpful messages?
#'
#' @return A data frame with enrichment scores at each significance level.
#'
#' This function takes a data frame of outlier calls and a data frame of rare variant genotype data and produces
#' a data frame with enrichment values representing the relative risk of having a rare variant given outlier status
#' at different significance levels.
#'
#' @export
enrichment_by_annotation <- function(outlier.calls,
                                     rare.variants,
                                     limit.to.genes.w.outliers = T,
                                     sig = .05,
                                     verbose = T,
                                     draw.plot = T) {

  # Check that the provided rare.variants file has the required annotations
  if (!(c("SampleName", "GeneID", "chr", "start", "consdetail") %in% names(rare.variants)))
    break("Check the supplied file has a column for the rare variant's annotation")


  # Choose if you want to only look at genes that have at least one outlier.
  if (limit.to.genes.w.outliers) {
    if (verbose)
      cat(
        paste(
          "Only considering genes with at least one outlier at",
          sig,
          "\n"
        )
      )
    sig.genes <- outlier.calls %>%
      filter(outlier.score < sig) %>%
      .$GeneID %>%
      unique

    outlier.calls <- filter(outlier.calls, GeneID %in% sig.genes)
  } else {
    if (verbose)
      cat("Considering all genes, even those that are never significant\n")
  }

  if (verbose)
    cat("Checking to only include individuals that have genotype data!!\n")

  indvs.w.scores <- unique(outlier.calls$SampleName)
  indvs.w.RV.calls <- unique(rare.variants$SampleName)
  usable.indvs <- dplyr::intersect(indvs.w.scores, indvs.w.RV.calls)

  if (verbose)
    cat("Combining outlier calls with rare variant information\n")
  outlier.calls.w.RVs <-
    outlier.calls %>%
    dplyr::filter(SampleName %in% usable.indvs) %>%
    dplyr::left_join(rare.variants)

  # Split ambiguous consdetails
  outlier.calls.w.RVs <-
    outlier.calls.w.RVs %>%
    mutate(consdetail = str_split(consdetail, ",")) %>%
    unnest

  # Get all possible annotations
  all.annotations <- unique(outlier.calls.w.RVs$consdetail)
  all.annotations <- all.annotations[!is.na(all.annotations)]

  if(verbose)
    cat("Calculating enrichment scores")

  enrichment.output <- data.frame()

  n.outliers = outlier.calls.w.RVs %>%
    filter(outlier.score < sig) %>%
    .unique_IG_pairs
  n.non.outliers = outlier.calls.w.RVs %>%
    filter(outlier.score > sig) %>%
    .unique_IG_pairs

  for (anno in all.annotations) {
    enrichment.output <- rbind(enrichment.output,
                               cbind(
                                 .enrichment(
                                   n.outliers.w.var = outlier.calls.w.RVs %>%
                                     filter(outlier.score < sig & consdetail == anno) %>%
                                     .unique_IG_pairs,
                                   n.outliers = n.outliers,
                                   n.non.outliers.w.var = outlier.calls.w.RVs %>%
                                     filter(outlier.score > sig & consdetail == anno) %>%
                                     .unique_IG_pairs,
                                   n.non.outliers = n.non.outliers
                                 )
                                 , anno = anno
                               )
    )
  }

  if(draw.plot){
    plt.tbl <- enrichment.output
    plt.tbl <- plt.tbl %>%
      arrange(desc(ratio)) %>%
      filter(n.outliers.w.var > 3) %>%
      mutate(anno = factor(anno, levels = anno))

    print(
      ggplot(plt.tbl, aes(anno, ratio)) +
        theme_linedraw() +
        geom_pointrange(aes(ymin = lower.q, ymax = upper.q)) +
        geom_hline(yintercept = 1, color = "red") +
        scale_y_continuous(trans='log2') +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
    )
  }
  return(enrichment.output)
}
jeinson/ExOutBench documentation built on Nov. 4, 2019, 2:38 p.m.