R/geneEnrichment.R

Defines functions EnrichmentVolcano geneEnrichment

Documented in EnrichmentVolcano geneEnrichment

#' geneEnrichment
#'
#' Function to calculate fold change enrichment in a set of snv calls correcting for gene length
#' @description Calculate the enrichment of snv hits in length-corrected genes
#' A 'gene_lengths' file must be provided with the following fields (cols 1..6 required)
#' gene length chrom    start      end      tss scaling_factor
#' This can be generated using the script 'script/genomic_features.pl' and a genome .gtf file
#' The defualt genome length is set to the mappable regions of the Drosophila melanogastor Dmel6.12 genome (GEM mappability score > .5)
#' (118274340). The full, assembled genome legnth for chroms 2/3/4/X/Y is 137547960
#' @param gene_lengths File containing all genes and their lengths (as generated by 'script/genomefeatures.pl') [Default 'data/gene_lengths.txt']
#' @param n The number of times we need to have seen a gene in our snv_data to view its enrichment score [Default 3]
#' @param genome_length The total legnth of the genome [Default 137547960 (chroms 2, 3, 4, X & Y for Drosophila melanogastor Dmel6.12)]
#' @keywords enrichment
#' @import dplyr
#' @import ggpubr
#' @return A snv_data frame with FC scores for all genes seen at least n times in snv snv_data
#' @export
geneEnrichment <- function(..., snv_data=NULL, gene_lengths_in="data/gene_lengths.txt", n=10, genome_length=118274340){
  if(missing(snv_data)) snv_data <- getData(...)

  snv_data <- snv_data %>%
    dplyr::filter(...,
                  gene != "intergenic") %>%
    droplevels()

  snv_count <- nrow(snv_data)

  gene_lengths <- read.delim(gene_lengths_in, header = T)

  gene_lengths <- gene_lengths %>%
    dplyr::filter(length > 1000) %>%
    dplyr::select(gene, length) %>%
    droplevels()

  genes <- setNames(as.list(gene_lengths$length), gene_lengths$gene)

  snv_data <- join(gene_lengths, snv_data, 'gene', type = 'left')

  snv_data$fpkm <- ifelse(snv_data$fpkm=='NULL' | snv_data$fpkm=='NA' | is.na(snv_data$fpkm), 0, snv_data$fpkm)
  snv_data$observed <- ifelse(is.numeric(snv_data$observed), snv_data$observed, 0)

  hit_genes <- table(factor(snv_data$gene, levels = levels(snv_data$gene) ))
  expression <- setNames(as.list(snv_data$fpkm), snv_data$gene)

  fun <- function(g) {
    # Calculate the fraction of geneome occupied by each gene
    genefraction<-genes[[g]]/genome_length

    # How many times should we expect to see this gene hit in our snv_data (given number of obs. and fraction of genome)?
    gene_expect<-snv_count*(genefraction)

    # observed/expected
    fc<-hit_genes[[g]]/gene_expect
    log2FC = log2(fc)

    if (hit_genes[[g]] >= gene_expect) {
      stat <- binom.test(x = hit_genes[[g]], n = snv_count, p = genefraction, alternative = "greater")
      test <- "enrichment"
    } else {
      stat <- binom.test(x = hit_genes[[g]], n = snv_count, p = genefraction, alternative = "less")
      test <- "depletion"
    }

    sig_val <- ifelse(stat$p.value <= 0.001, "***",
                      ifelse(stat$p.value <= 0.01, "**",
                             ifelse(stat$p.value <= 0.05, "*", "")))

    sig_val <- ifelse(stat$p.value > 0.05, "-", sig_val)

    p_val <- format.pval(stat$p.value, digits = 3)

    gene_expect <- round(gene_expect,digits=3)
    list(gene = g, length = genes[[g]], fpkm = expression[[g]], test=test, observed = hit_genes[g], expected = gene_expect, fc = fc, log2FC = log2FC, sig_val=sig_val, p_val=p_val)
  }

  enriched <- lapply(levels(snv_data$gene), fun)
  enriched <- do.call(rbind, enriched)
  genesFC <- as.data.frame(enriched)
  # Filter for genes with few observations

  genesFC <- genesFC %>%
    dplyr::filter(observed >= n) %>%
    dplyr::mutate(expected = round(as.numeric(expected),digits=3)) %>%
    dplyr::mutate(log2FC = round(as.numeric(log2FC),digits=2)) %>%
    dplyr::mutate(p_val = as.numeric(p_val)) %>%
    dplyr::mutate(eScore = abs(log2FC) * -log10(p_val)) %>%
    dplyr::mutate(eScore = round(as.numeric(eScore),digits=2)) %>%
    dplyr::select(gene, observed, expected, log2FC, test, sig_val, p_val, eScore) %>%
    dplyr::arrange(-eScore, p_val, -abs(log2FC)) %>%
    droplevels()

  return(genesFC)
}


# EnrichmentVolcano
#'
#' Plot the enrichment of SNVs in genes features
#' @keywords enrichment
#' @import dplyr
#' @import plotly
#' @export
EnrichmentVolcano <- function(d){
  gene_enrichment <- d

  minPval <- min(gene_enrichment$p_val[gene_enrichment$p_val>0])

  gene_enrichment$p_val <- ifelse(gene_enrichment$p_val==0, minPval/abs(gene_enrichment$log2FC), gene_enrichment$p_val)

  gene_enrichment$p_val <- ifelse(gene_enrichment$p_val==0, minPval, gene_enrichment$p_val)


  maxLog2 <- max(abs(gene_enrichment$log2FC[is.finite(gene_enrichment$log2FC)]))
  maxLog2 <- as.numeric(round_any(maxLog2, 1, ceiling))

  ax <- list(
    size = 25
  )

  ti <- list(
    size = 25
  )

  p <- plot_ly(data = gene_enrichment,
               x = ~log2FC,
               y = ~-log10(p_val),
               type = 'scatter',
               # showlegend = FALSE,
               mode = 'markers',
               # height = 1200,
               # width = 1000,
               # frame = ~p_val,
               text = ~paste("Gene: ", gene, "\n",
                             "Observed: ", observed, "\n",
                             "Expected: ", expected, "\n",
                             "P-val: ", p_val, "\n"),
               color = ~log10(p_val),
               colors = "Spectral",
               size = ~-log10(p_val)
  ) %>%
    layout(
      xaxis = list(title="Log2(FC)", titlefont = ax, range = c(-maxLog2, maxLog2)),
      yaxis = list(title="-Log10(p)", titlefont = ax)
    )
  p
}
nriddiford/mutationProfiles documentation built on Nov. 7, 2021, 12:14 a.m.