#' geneEnrichment
#'
#' Function to calculate fold change enrichment in a set of snv calls correcting for gene length
#' @description Calculate the enrichment of snv hits in length-corrected genes
#' A 'gene_lengths' file must be provided with the following fields (cols 1..6 required)
#' gene length chrom start end tss scaling_factor
#' This can be generated using the script 'script/genomic_features.pl' and a genome .gtf file
#' The defualt genome length is set to the mappable regions of the Drosophila melanogastor Dmel6.12 genome (GEM mappability score > .5)
#' (118274340). The full, assembled genome legnth for chroms 2/3/4/X/Y is 137547960
#' @param gene_lengths File containing all genes and their lengths (as generated by 'script/genomefeatures.pl') [Default 'data/gene_lengths.txt']
#' @param n The number of times we need to have seen a gene in our snv_data to view its enrichment score [Default 3]
#' @param genome_length The total legnth of the genome [Default 137547960 (chroms 2, 3, 4, X & Y for Drosophila melanogastor Dmel6.12)]
#' @keywords enrichment
#' @import dplyr
#' @import ggpubr
#' @return A snv_data frame with FC scores for all genes seen at least n times in snv snv_data
#' @export
geneEnrichment <- function(..., snv_data=NULL, gene_lengths_in="data/gene_lengths.txt", n=10, genome_length=118274340){
if(missing(snv_data)) snv_data <- getData(...)
snv_data <- snv_data %>%
dplyr::filter(...,
gene != "intergenic") %>%
droplevels()
snv_count <- nrow(snv_data)
gene_lengths <- read.delim(gene_lengths_in, header = T)
gene_lengths <- gene_lengths %>%
dplyr::filter(length > 1000) %>%
dplyr::select(gene, length) %>%
droplevels()
genes <- setNames(as.list(gene_lengths$length), gene_lengths$gene)
snv_data <- join(gene_lengths, snv_data, 'gene', type = 'left')
snv_data$fpkm <- ifelse(snv_data$fpkm=='NULL' | snv_data$fpkm=='NA' | is.na(snv_data$fpkm), 0, snv_data$fpkm)
snv_data$observed <- ifelse(is.numeric(snv_data$observed), snv_data$observed, 0)
hit_genes <- table(factor(snv_data$gene, levels = levels(snv_data$gene) ))
expression <- setNames(as.list(snv_data$fpkm), snv_data$gene)
fun <- function(g) {
# Calculate the fraction of geneome occupied by each gene
genefraction<-genes[[g]]/genome_length
# How many times should we expect to see this gene hit in our snv_data (given number of obs. and fraction of genome)?
gene_expect<-snv_count*(genefraction)
# observed/expected
fc<-hit_genes[[g]]/gene_expect
log2FC = log2(fc)
if (hit_genes[[g]] >= gene_expect) {
stat <- binom.test(x = hit_genes[[g]], n = snv_count, p = genefraction, alternative = "greater")
test <- "enrichment"
} else {
stat <- binom.test(x = hit_genes[[g]], n = snv_count, p = genefraction, alternative = "less")
test <- "depletion"
}
sig_val <- ifelse(stat$p.value <= 0.001, "***",
ifelse(stat$p.value <= 0.01, "**",
ifelse(stat$p.value <= 0.05, "*", "")))
sig_val <- ifelse(stat$p.value > 0.05, "-", sig_val)
p_val <- format.pval(stat$p.value, digits = 3)
gene_expect <- round(gene_expect,digits=3)
list(gene = g, length = genes[[g]], fpkm = expression[[g]], test=test, observed = hit_genes[g], expected = gene_expect, fc = fc, log2FC = log2FC, sig_val=sig_val, p_val=p_val)
}
enriched <- lapply(levels(snv_data$gene), fun)
enriched <- do.call(rbind, enriched)
genesFC <- as.data.frame(enriched)
# Filter for genes with few observations
genesFC <- genesFC %>%
dplyr::filter(observed >= n) %>%
dplyr::mutate(expected = round(as.numeric(expected),digits=3)) %>%
dplyr::mutate(log2FC = round(as.numeric(log2FC),digits=2)) %>%
dplyr::mutate(p_val = as.numeric(p_val)) %>%
dplyr::mutate(eScore = abs(log2FC) * -log10(p_val)) %>%
dplyr::mutate(eScore = round(as.numeric(eScore),digits=2)) %>%
dplyr::select(gene, observed, expected, log2FC, test, sig_val, p_val, eScore) %>%
dplyr::arrange(-eScore, p_val, -abs(log2FC)) %>%
droplevels()
return(genesFC)
}
# EnrichmentVolcano
#'
#' Plot the enrichment of SNVs in genes features
#' @keywords enrichment
#' @import dplyr
#' @import plotly
#' @export
EnrichmentVolcano <- function(d){
gene_enrichment <- d
minPval <- min(gene_enrichment$p_val[gene_enrichment$p_val>0])
gene_enrichment$p_val <- ifelse(gene_enrichment$p_val==0, minPval/abs(gene_enrichment$log2FC), gene_enrichment$p_val)
gene_enrichment$p_val <- ifelse(gene_enrichment$p_val==0, minPval, gene_enrichment$p_val)
maxLog2 <- max(abs(gene_enrichment$log2FC[is.finite(gene_enrichment$log2FC)]))
maxLog2 <- as.numeric(round_any(maxLog2, 1, ceiling))
ax <- list(
size = 25
)
ti <- list(
size = 25
)
p <- plot_ly(data = gene_enrichment,
x = ~log2FC,
y = ~-log10(p_val),
type = 'scatter',
# showlegend = FALSE,
mode = 'markers',
# height = 1200,
# width = 1000,
# frame = ~p_val,
text = ~paste("Gene: ", gene, "\n",
"Observed: ", observed, "\n",
"Expected: ", expected, "\n",
"P-val: ", p_val, "\n"),
color = ~log10(p_val),
colors = "Spectral",
size = ~-log10(p_val)
) %>%
layout(
xaxis = list(title="Log2(FC)", titlefont = ax, range = c(-maxLog2, maxLog2)),
yaxis = list(title="-Log10(p)", titlefont = ax)
)
p
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.