library(knitr) library(rmdformats) library(RColorBrewer) ## Global options options(max.print="75") display_code <- FALSE knitr::opts_knit$set(global.par = TRUE, width=75) knitr::opts_chunk$set(echo=display_code, cache=FALSE, prompt=FALSE, tidy=TRUE, comment=NA, message=FALSE, warning=FALSE) library(devtools) load_all()
par(mgp = c(2, 0.5, 0)) par(mar = par()$mar * 0.7) palette(brewer.pal(6, "Paired")) genome <- "Mus_musculus.GRCm38.94"
#Set a cut-off for the minimum number of times a category must appear in the 100 tests. min_appearances <- 30
In the mouse genome, most genes have only one transcript, but some have many more.
Do sets of genes with high numbers of transcripts produce significant results in GO
overrepresentation analysis?
The pre-processed gene info file can be used to plot the distribution of the number of transcripts per gene
in the r genome
genome.
genfo_file_name <- paste0("../data/", genome, "_gene_info.txt") genfo <- read.delim(genfo_file_name) bg_genes <- as.vector(unique(genfo$gene_name)) hist(genfo$no_of_transcripts, main = paste0("number of transcripts per gene in ", genome), xlab = "number of transcripts", col = 1 )
As expected, most genes have very low numbers of transcripts, so we don't get much information from this plot. We can look at the information in a table format to get actual numbers.
table(genfo$no_of_transcripts)
r ceiling(table(genfo$no_of_transcripts)["1"] / nrow(genfo) *100)
%
of genes only have one transcript.
r ceiling(table(genfo$no_of_transcripts)["2"] / nrow(genfo) *100)
%
of genes have two transcripts.
I'm not interested in these so I'll replot the data and see how it looks without
these values.
Above 30 there are only a handful of genes so these can be removed from the plot to
see it better.
#hist(genfo$no_of_transcripts[genfo$no_of_transcripts > 2], # main = paste0("number of transcripts (> 2) per gene in ", genome), # xlab = "number of transcripts (> 2)", # breaks = 12, # col = 1, # xlim = c(3, 30) # ) library("ggplot2") genfo_temp <- genfo[genfo$no_of_transcripts > 2 & genfo$no_of_transcripts <= 30,] ggplot(genfo_temp , aes(x = no_of_transcripts)) + geom_histogram(fill = "seagreen", binwidth = 1, color = "darkblue")
Originally, I selected arbitrary thresholds of minimum 10, 15 and 20 transcripts per gene. These seemed like reasonable numbers to choose and plenty of functional categories were identified in the overrepresentation tests.
It is not ideal having hardcoded values when it comes to generating further sets of genes. Selecting thresholds dependent on the distribution of the data is a more robust method for applying to different species and future versions of this genome.
probabilities <- c(0.9, 0.95, 0.99) transcript_thresholds <- quantile(genfo$no_of_transcripts, probs = probabilities) gene_counts <- sapply(transcript_thresholds, function(x) sum(genfo$no_of_transcripts >= x)) thresholds <- data.frame(transcript_thresholds, probabilities, gene_counts)
#It worked nicely for the gene lengths to plot out the numbers on graphs but the #distribution here doesn't plot quite so well. dens <- density(genfo$no_of_transcripts, adjust = 5) plot(dens, xlim = c(-2,16))
all_genes_in_group <- sapply(thresholds$transcript_thresholds, function(x){ genfo$gene_name[genfo$no_of_transcripts >= x] }) names(all_genes_in_group) <- thresholds$probabilities biased_transcripts <- lapply(names(all_genes_in_group), function(x){ sapply(1:100, function(i){ all_genes_in_group[[x]][ceiling(runif(200, min = 0, max = length(all_genes_in_group[[x]]) - 1))] }) }) names(biased_transcripts) <- names(all_genes_in_group)
# We don't want to run the analysis each time the document is knitted as it takes # too long. The code was run once and the data saved as an .rda object that can be # quickly loaded in to the R session. save(biased_transcripts, file = "../data/biased_transcripts.rda") x <- overrep_test(all_go_categories, query_genes = biased_transcripts[[1]][,1], bg_genes) transcript_results <- lapply(biased_transcripts, function(transcript_subset){ apply(transcript_subset, 2, function(query){ overrep_test(all_go_categories, query, bg_genes) }) }) names(transcript_results) <- names(biased_transcripts) save(transcript_results, file = "../data/transcript_results.rda")
The biased sets of genes were run through a GO over-representation analysis.
load("../data/transcript_results.rda") number_of_results <- lapply(transcript_results, function(x){ nulls_removed <- x[lapply(x,length) != 0] vapply(nulls_removed, nrow, FUN.VALUE = numeric(1)) })
p_and_q <- lapply(transcript_results, function(x){ pvals <- unlist(sapply(x, `[[`, "pval")) qvals <- unlist(sapply(x, `[[`, "adj_pval")) data.frame(pvals, qvals) }) par(mfrow = c(2, 2)) plot_density_highlight <- function(data_values, xlabel = "", threshold = 0.05, title = "", colour = 1 ){ dens <- density(data_values) filt <- dens$x < threshold plot(dens, main = title, xlab = xlabel, ylim = c(0, max(dens$y) * 1.2) ) polygon( c(dens$x[filt], threshold), c(dens$y[filt], 0), col = colour ) text_label = paste0("n = ", sum(data_values < threshold)) text(dens$x[length(dens$x)/10], y = max(dens$y) * 1.1, labels = text_label, font = 2, col = "red2" ) } sapply(names(p_and_q), function(x) { x_suffix <- paste0("values, N = ", nrow(p_and_q[[x]])) plot_density_highlight(p_and_q[[x]]$pvals, title = x, xlabel = paste0("p ", x_suffix) ) plot_density_highlight(p_and_q[[x]]$qvals, title = x, xlabel = paste0("corrected p ", x_suffix) ) }) #x_suffix <- paste0("values, N = ", nrow(min20_results)) #plot_density_highlight(min20_results$pval, xlabel = paste0("corrected p ", x_suffix), title = "min20") #plot_density_highlight(min20_results$adj_pval, xlabel = paste0("corrected p ", x_suffix), title = "min20")
ordered_categories <- lapply(transcript_results, function(transcript_subset){ all_sig_categories <- unlist(sapply(transcript_subset, rownames)) tabled_categories <- table(all_sig_categories) tabled_categories[order(tabled_categories, decreasing = TRUE)] })
par(mfrow = c(1,2)) sapply(names(ordered_categories), function(x){ plot(density(ordered_categories[[x]]), main = x) })
par(mfrow = c(1,2)) beanplot( ordered_categories, what = c(0,1,0,1), col = c("#1B9E77","#06086d"), ll = 0.02, method = "jitter", border = "#06086d", las = 1, log = "", ylim = c(0, 120), main = "GO category appearances during the 100 tests", ylab = "no of times GO category appeared" ) text(1:length(ordered_categories), y = 115, cex = 0.8, labels = paste0("n = ", sapply(ordered_categories, length)) ) filtered_categories <- lapply(ordered_categories, function(x) x[x >= min_appearances]) beanplot( filtered_categories, what = c(0,1,0,1), col = c("#1B9E77","#06086d"), ll = 0.03, method = "jitter", border = "#06086d", las = 1, log = "", ylim = c(10, 120), main = paste0("GO category appearances (>", min_appearances, ")"), ylab = "no of times GO category appeared" ) text(1:length(filtered_categories), y = 115, cex = 0.8, labels = paste0("n = ", sapply(filtered_categories, length)) )
Replot just showing the data for categories that appeared >= r min_appearances
times
Now we could do with some stats to pick a cutoff for the number of times a category appears. Let's just select an arbitrary value for now....
Create a dataset that contains these suspect set of categories
suspects <- lapply(ordered_categories, function(x){ names(x[x >= min_appearances]) })
print_overlaps <- function(set1, set2, not_in = FALSE) { ifelse( not_in, sum(!set1 %in% set2), sum(set1 %in% set2) ) } #print_overlaps(suspects$`0.99`, suspects$`0.95`) #print_overlaps(suspects$`0.99`, suspects$`0.9`) #print_overlaps(suspects$`0.95`, suspects$`0.9`) #There are `r print_overlaps(suspects$`0.95`, suspects$`0.9`)` overlapping categories #between 0.95 and 0.9.
There are r print_overlaps(suspects[["0.99"]], suspects[["0.95"]])
overlapping
categories between 0.99 and 0.95.
There are r print_overlaps(suspects[["0.99"]], suspects[["0.9"]])
overlapping categories
between 0.99 and 0.9.
There are r print_overlaps(suspects[["0.95"]], suspects[["0.9"]])
overlapping categories
between 0.95 and 0.9.
The duplicate terms will be removed from the more stringent categories.
suspects$`0.99` <- with(suspects, `0.99`[!`0.99` %in% `0.95`]) suspects$`0.99` <- with(suspects, `0.99`[!`0.99` %in% `0.9`]) suspects$`0.95` <- with(suspects, `0.95`[!`0.95` %in% `0.9`])
The number of terms remaining after deduplication
sapply(suspects, length)
Rename the categories
names(suspects) <- c("high_transcripts", "very_high_transcripts", "vv_high_transcripts")
#eval=FALSE, echo = FALSE} # write out file with unix line endings for (i in 1:length(suspects)) { filename <- paste0("../data/", names(suspects)[i], ".txt") output_file <- file(filename, "wb") write.table(file = output_file, x = suspects[[i]], row.names = FALSE, col.names = FALSE, quote = FALSE) close(output_file) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.