library(knitr) library(rmdformats) ## Global options options(max.print="75") display_code <- FALSE knitr::opts_knit$set(global.par = TRUE, width=75) knitr::opts_chunk$set(echo=display_code, #cache=TRUE, prompt=FALSE, tidy=TRUE, comment=NA, message=FALSE, warning=FALSE) library(rmdformats) library(devtools) load_all()
library(RColorBrewer) palette(brewer.pal(6, "Set2")) par(mgp = c(2, 0.5, 0)) par(mar = par()$mar * 0.7) genome <- "Mus_musculus.GRCm38.94"
# Set a cut-off for the minimum number of times a category must appear in the 100 tests. min_appearances <- 30
Generate lists of mouse genes biased by the length of the genes.
genfo_file_name <- paste0("../data/", genome, "_gene_info.txt") genfo <- read.delim(genfo_file_name) bg_genes <- as.vector(unique(genfo$gene_name))
The gene info file contains the lengths of all the genes in the mouse genome r genome
.
These can be plotted so that some thresholds for length categories can be determined.
Plot out the length distribution for all the genes.
par(mfrow = c(1,2)) plot(density(log2(genfo$length)), lwd = 2, main = "", xlab = "log2 gene length") boxplot(log2(genfo$length), ylab = "log2 gene length")
This data is displayed on a log scale as there are some very long genes that distort
the graph if it is plotted on a linear scale.
The plot shows a set of short genes between r 2^5
and r 2^7.6
(~5-7.5 on log2 scale).
This provides a natural set of lengths to use, and can be split into a "very short"
and a "short1" category. We'll also take some of the genes that are slightly longer
but are below the next peak in the distribution.
As the distribution of lengths for the long genes is smooth, this provides no obvious
thresholds to select, so we can take the top 5% and 10% of lengths.
long <- quantile(log2(genfo$length), probs = c(0.9, 0.95)) names(long) <- c("long", "very_long")
par(mfrow = c(1,1)) plot(density(log2(genfo$length)), lwd = 2, main = "", xlab = "log2 gene length" ) thresholds <- c( very_short = 6.8, short1 = 7.6, short2 = 9.7, long ) more_less <- c( very_short = "less", short1 = "less", short2 = "interval", long = "more", very_long = "more" ) sapply(names(thresholds), function(x){ colours <- as.factor(more_less) abline(v = thresholds[x], col = colours[x], lwd = 2, lty = 2) })
thresholds
The number of genes that remain after filtering for each length category.
dens <- density(log2(genfo$length)) par(mfrow = c(2, 3)) sapply(names(thresholds), function(length_cat){ plot(dens, lwd = 2, xlab = "log2 gene length", main = length_cat) if (more_less[length_cat] == "less") { filt <- dens$x < thresholds[length_cat] polygon( c(dens$x[filt], thresholds[length_cat]), c(dens$y[filt], 0), col = "grey" ) n_genes <- sum(log2(genfo$length) < thresholds[length_cat]) } else if (more_less[length_cat] == "more") { filt <- dens$x > thresholds[length_cat] polygon( c(dens$x[filt], thresholds[length_cat]), c(dens$y[filt], 0), col = "grey" ) n_genes <- sum(log2(genfo$length) > thresholds[length_cat]) } else { # hard code the interval plot filt <- dens$x > thresholds["short1"] & dens$x < thresholds["short2"] polygon( c(thresholds["short1"], dens$x[filt], thresholds["short2"]), c(0, dens$y[filt], 0), col = "grey" ) n_genes <- sum(log2(genfo$length) > thresholds["short1"] & log2(genfo$length) < thresholds["short2"] ) } label_text <- paste0("n = ", n_genes) text(x = 19, y = 0.09, labels = label_text, cex = 1.5) })
biased_lengths <- lapply(names(thresholds), function(category){ if (more_less[category] == "less") { filtered_genes <- genfo$gene_name[log2(genfo$length) < thresholds[category]] } else if (more_less[category] == "more") { filtered_genes <- genfo$gene_name[log2(genfo$length) > thresholds[category]] } else { # hard code the interval plot filtered_genes <- genfo$gene_name[log2(genfo$length) > thresholds["short1"] & log2(genfo$length) < thresholds["short2"]] } sapply(1:100, function(i){ filtered_genes[ceiling(runif(200, min = 0, max = length(filtered_genes) - 1))] }) }) names(biased_lengths) <- names(thresholds)
save(biased_lengths, file = "M:/temp/length/length_genelists.rda")
Check they all look right
par(mfrow = c(1, 1)) lengths <- lapply(biased_lengths, function(x) { log2(genfo$length[match(unlist(x), genfo$gene_name)]) }) boxplot(lengths, col = 1)
The odd gene that doesn't fall within the expected size range may be due to name duplication.
Run the gene lists through a GO overrepresentation analysis.
x <- overrep_test(all_go_categories, query_genes = biased_lengths[[4]][,1], bg_genes) gene_length_results <- lapply(biased_lengths, function(length_subset){ apply(length_subset, 2, function(query){ overrep_test(all_go_categories, query, bg_genes) }) }) save(gene_length_results, file = "../data/gene_length_results.rda")
We don't want to run the GO analysis each time the document is knitted as it takes too long. The code was run once and the data saved as an .rda object that can be quickly loaded in to the R session.
See how many significant categories were returned.
load("../data/gene_length_results.rda") number_of_results <- lapply(gene_length_results, function(x){ nulls_removed <- x[lapply(x,length) != 0] vapply(nulls_removed, nrow, FUN.VALUE = numeric(1)) })
The number of gene sets that returned significant results from the GO overrepresentation analysis
sapply(number_of_results, length)
The bean plots show the number of categories returned from each gene list. The null results have been removed.
par(mfrow = c(1,1)) library(beanplot) options(scipen = 999) # disable the scientific notation beanplot( number_of_results, what = c(0,1,0,1), col = c("#1B9E77","#06086d"), ll = 0.03, method = "jitter", border = "#06086d", las = 1, main = "number of significant categories per gene list returned from GO analysis" )
#There are loads of results returned for the long genes, we'll plot out the p #and q values. p_and_q <- lapply(gene_length_results, function(x){ pvals <- unlist(sapply(x, `[[`, "pval")) qvals <- unlist(sapply(x, `[[`, "adj_pval")) data.frame(pvals, qvals) }) par(mfrow = c(length(p_and_q), 2)) plot_density_highlight <- function(data_values, xlabel = "", threshold = 0.05, title = "", colour = 3 ){ dens <- density(data_values) filt <- dens$x < threshold plot(dens, main = title, xlab = xlabel, ylim = c(0, max(dens$y) * 1.2) ) polygon( c(dens$x[filt], threshold), c(dens$y[filt], 0), col = colour ) text_label = paste0("n = ", sum(data_values < threshold)) text(dens$x[length(dens$x)/10], y = max(dens$y) * 1.1, labels = text_label, font = 2, col = "red2" ) } sapply(names(p_and_q), function(x) { x_suffix <- paste0("values, N = ", nrow(p_and_q[[x]])) plot_density_highlight(p_and_q[[x]]$pvals, title = x, xlabel = paste0("p ", x_suffix) ) plot_density_highlight(p_and_q[[x]]$qvals, title = x, xlabel = paste0("corrected p ", x_suffix) ) })
ordered_categories <- lapply(gene_length_results, function(length_subset){ all_sig_categories <- unlist(sapply(length_subset, rownames)) tabled_categories <- table(all_sig_categories) tabled_categories[order(tabled_categories, decreasing = TRUE)] })
Plot how many times a category appeared.
par(mfrow = c(2, 3)) sapply(names(ordered_categories), function(x){ plot(density(ordered_categories[[x]]), main = x) })
par(mfrow = c(1,1)) beanplot( ordered_categories, what = c(0,1,0,1), col = c("#1B9E77","#06086d"), ll = 0.03, method = "jitter", border = "#06086d", las = 1, log = "", ylim = c(0, 120), main = "number of times a GO category appeared during the 100 tests", ylab = "no of times GO category appeared" ) text(1:length(ordered_categories), y = 115, cex = 0.8, labels = paste0("n = ", sapply(ordered_categories, length)) )
filtered_categories <- lapply(ordered_categories, function(x) x[x >= min_appearances])
# convert list of table to list of vectors convert_tbl_vec <- function(list_of_tables){ list_of_vec <- lapply(names(list_of_tables), function(x){ vector <- as.vector(list_of_tables[[x]]) names(vector) <- names(list_of_tables[[x]]) vector }) names(list_of_vec) <- names(list_of_tables) list_of_vec }
filtered_categories <- lapply(ordered_categories, function(x) x[x >= min_appearances]) filtered_categories_vec <- convert_tbl_vec(filtered_categories) boxplot(filtered_categories_vec, main = paste0("GO category appearances (>", min_appearances, ")"), ylab = "no of times GO category appeared", pch = 16, cex = 0.5, col = 1, las = 1, cex.axis = 0.8, ylim = c(10, 110) ) text(1:22, y = 108, cex = 0.8, col = 2, labels = sapply(filtered_categories_vec, length) )
Replot just showing the data for categories that appeared >= r min_appearances
times
filtered_categories <- lapply(ordered_categories, function(x) x[x >= min_appearances]) beanplot( filtered_categories, what = c(0,1,0,1), col = c("#1B9E77","#06086d"), ll = 0.03, method = "jitter", border = "#06086d", las = 1, log = "", ylim = c(10, 120), main = paste0("GO category appearances (>", min_appearances, ")"), ylab = "no of times GO category appeared" ) text(1:length(ordered_categories), y = 115, cex = 0.8, labels = paste0("n = ", sapply(filtered_categories, length)) )
Now we could do with some stats to pick a cutoff for the number of times a category appears. Let's just select an arbitrary value for now....
Create a dataset that contains these suspect set of categories
suspects <- lapply(ordered_categories, function(x) names(x[x >= min_appearances])) sapply(suspects, length)
print_overlaps <- function(set1, set2, not_in = FALSE) { ifelse( not_in, print(sum(!set1 %in% set2)), print(sum(set1 %in% set2)) ) } with(suspects, { print_overlaps(very_short, short1) #print_overlaps(very_short, short2) print_overlaps(very_long, long) })
There are r print_overlaps(suspects[["very_short"]], suspects[["short1"]])
overlapping
categories between very_short and short1.
There are r print_overlaps(suspects[["very_long"]], suspects[["long"]])
overlapping categories
between very_long and long.
The duplicate terms will be removed from the more stringent categories.
suspects$very_long <- with(suspects, very_long[!very_long %in% long]) suspects$very_short <- with(suspects, very_short[!very_short %in% short1]) sapply(suspects, length)
Rename "short1" to "short"
names(suspects) <- gsub(names(suspects), pattern = "short1", replacement = "short")
suspects <- suspects[sapply(suspects, length) > 0] sapply(suspects, length)
# write out file with unix line endings for (i in 1:length(suspects)) { filename <- paste0("../data/", names(suspects)[i], ".txt") output_file <- file(filename, "wb") write.table( file = output_file, x = suspects[[i]], row.names = FALSE, col.names = FALSE, quote = FALSE ) close(output_file) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.