```r
library(knitr)
library(rmdformats)

## Global options
options(max.print = "75")
display_code <- FALSE
knitr::opts_knit$set(global.par = TRUE, width = 75)
knitr::opts_chunk$set(echo = display_code,
                      #cache = TRUE,
                      prompt  = FALSE,
                      tidy    = TRUE,
                      comment = NA,
                      message = FALSE,
                      warning = FALSE)
library(devtools)
load_all()
```
```r
par(mgp = c(2, 0.5, 0))
par(mar = par()$mar * 0.7)
library(RColorBrewer)
palette(brewer.pal(6, "Set2"))
genome <- "Mus_musculus.GRCm38.94"
```
```r
# Set a cut-off for the minimum number of times a category must appear in the 100 tests.
min_appearances <- 30
```
To check whether any gene ontology (GO) or other functional categories were associated with high or low GC content, lists of genes biased by GC content were generated.
Preliminary work that used intervals spanning the whole range of GC content in the genome only identified categories for genes with high or low GC content, not for the intermediate ranges, so here we focus on the ends of the distribution.
```r
genfo_file_name <- paste0("../data/", genome, "_gene_info.txt")
genfo <- read.delim(genfo_file_name)
bg_genes <- as.vector(unique(genfo$gene_name))
```
```r
plot(density(genfo$GC_content),
     lwd  = 2,
     main = "",
     xlab = "GC content",
     xlim = c(0.25, 0.75))
```
For the preliminary work, arbitrary GC content thresholds of 0.35, 0.4, 0.5, 0.55 and 0.6 were used.
However, selecting thresholds from the distribution of the data is a more robust method that can be applied to different species and to future versions of this genome. The GC content thresholds will therefore be based on the distribution of GC content over all genes, using the quantiles at probabilities 0.01, 0.05, 0.1, 0.9, 0.95 and 0.99.
```r
library("magrittr")

probabilities <- c(0.01, 0.05, 0.1, 0.9, 0.95, 0.99)
thresholds <- quantile(genfo$GC_content, probs = probabilities)

more_less <- c(
  vv_low_GC    = "less",
  very_low_GC  = "less",
  low_GC       = "less",
  high_GC      = "more",
  very_high_GC = "more",
  vv_high_GC   = "more"
)
names(thresholds)    <- names(more_less)
names(probabilities) <- names(more_less)

gene_counts <- sapply(names(thresholds), function(category_filter) {
  if (more_less[category_filter] == "less") {
    sum(genfo$GC_content <= thresholds[category_filter])
  } else {
    sum(genfo$GC_content >= thresholds[category_filter])
  }
})

gene_counts_tibble <- tibble::enframe(gene_counts,
                                      name  = "category",
                                      value = "gene count") %>%
  dplyr::mutate(probabilities = probabilities)
```
```r
par(mfrow = c(2, 3))
dens <- density(genfo$GC_content)

sapply(names(thresholds), function(category_filter) {
  if (more_less[category_filter] == "less") {
    filt       <- dens$x < thresholds[category_filter]
    colour     <- 1
    n_genes    <- sum(genfo$GC_content < thresholds[category_filter])
    x_location <- 0.3
    title_text <- " <= "
  } else {
    filt       <- dens$x > thresholds[category_filter]
    colour     <- 2
    n_genes    <- sum(genfo$GC_content > thresholds[category_filter])
    x_location <- 0.65
    title_text <- " >= "
  }
  plot(dens,
       lwd  = 2,
       xlab = "GC content",
       xlim = c(0.2, 0.75),
       main = paste0(category_filter,
                     ", GC", title_text, thresholds[category_filter],
                     ", p", title_text, probabilities[category_filter]))
  polygon(c(dens$x[filt], thresholds[category_filter]),
          c(dens$y[filt], 0),
          col = colour)
  label_text <- paste0("n = ", n_genes)
  text(x = x_location, y = 4, labels = label_text, cex = 1.5, col = colour, font = 2)
})
```
The thresholds are used to filter the set of genes, and 200 genes are then randomly selected from the filtered set. This is repeated 100 times to give 100 lists, each containing 200 genes.
```r
GC_genelists <- lapply(names(thresholds), function(category) {
  if (more_less[category] == "less") {
    filtered_genes <- genfo$gene_name[genfo$GC_content < thresholds[category]]
  } else {
    filtered_genes <- genfo$gene_name[genfo$GC_content > thresholds[category]]
  }
  # Randomly select 200 genes (without replacement) from the filtered set, 100 times
  sapply(1:100, function(i) {
    sample(filtered_genes, size = 200)
  })
})
names(GC_genelists) <- names(thresholds)
#save(GC_genelists, file = "M:/temp/GC/GC_genelists.rda")
```
```r
par(mfrow = c(1, 1))
GC <- lapply(GC_genelists, function(x) {
  genfo$GC_content[match(unlist(x), genfo$gene_name)]
})
boxplot(GC,
        col  = 1,
        las  = 1,
        main = "Distribution of GC content for 100 sets of GC biased genes")
```
The 100 sets of 200 genes for each GC category were run through a GO overrepresentation analysis.
```r
go_results <- overrep_test(all_go_categories, GC_genelists[[1]][, 1], bg_genes)
head(go_results)

GC_results <- lapply(GC_genelists, function(subset) {
  apply(subset, MARGIN = 2, function(query) {
    overrep_test(all_go_categories, query, bg_genes) #, mult_test = FALSE)
  })
})

# Any categories with 0 significant results can be removed.
GC_results <- GC_results[!sapply(GC_results, is.null)]

# We don't want to run the GO analysis each time the document is knitted as it takes
# too long. The code was run once and the data saved as an .rda object that can be
# quickly loaded in to the R session.
save(GC_results, file = "../data/GC_results.rda")
```
```r
# See how many significant categories were returned for each gene list.
load("../data/GC_results.rda")

number_of_results <- sapply(GC_results, function(x) {
  nulls_removed <- x[lapply(x, length) != 0]
  vapply(nulls_removed, nrow, FUN.VALUE = numeric(1))
})
```
The number of gene sets (out of 100) that returned significant results from the GO overrepresentation analysis.
```r
tibble::enframe(sapply(number_of_results, length),
                name  = "category",
                value = "results") %>%
  DT::datatable(options = list(dom = 't'), rownames = FALSE)
```
The bean plots show the number of categories returned from each gene list. The null results have been removed.
```r
par(mfrow = c(1, 1))
library(beanplot)
options(scipen = 999) # disable the scientific notation

beanplot(number_of_results,
         what   = c(0, 1, 0, 1),
         col    = c("#1B9E77", "#06086d"),
         ll     = 0.1,
         method = "jitter",
         border = "#06086d",
         las    = 1,
         main   = "number of significant categories per gene list returned from GO analysis")

# Label each bean with the number of gene lists that returned significant results
text(seq_along(number_of_results),
     y      = 250,
     cex    = 0.8,
     col    = 1,
     labels = sapply(number_of_results, length))
```
All 100 sets of genes in the low GC categories returned significant results, but only 11, 14 and 5 sets returned significant results for the 3 high GC categories. We can plot out the distribution of the number of categories that were identified for each of the sets. Each horizontal line in the bean is the number of results identified for a gene list.
```r
# Plot out the p and q values.
# There are loads of results returned for the low GC gene lists, so we'll plot out the p
# and q values.
p_and_q <- lapply(GC_results, function(x) {
  pvals <- unlist(sapply(x, `[[`, "pval"))
  qvals <- unlist(sapply(x, `[[`, "adj_pval"))
  data.frame(pvals, qvals)
})

par(mfrow = c(length(p_and_q), 2))

plot_density_highlight <- function(data_values,
                                   xlabel    = "",
                                   threshold = 0.05,
                                   title     = "",
                                   colour    = 3) {
  dens <- density(data_values)
  filt <- dens$x < threshold
  plot(dens,
       main = title,
       xlab = xlabel,
       ylim = c(0, max(dens$y) * 1.2))
  polygon(c(dens$x[filt], threshold),
          c(dens$y[filt], 0),
          col = colour)
  text_label <- paste0("n = ", sum(data_values < threshold))
  text(dens$x[length(dens$x) / 10],
       y      = max(dens$y) * 1.1,
       labels = text_label,
       font   = 2,
       col    = "red2")
}

sapply(names(p_and_q), function(x) {
  x_suffix <- paste0("values, N = ", nrow(p_and_q[[x]]))
  plot_density_highlight(p_and_q[[x]]$pvals,
                         title  = x,
                         xlabel = paste0("p ", x_suffix))
  plot_density_highlight(p_and_q[[x]]$qvals,
                         title  = x,
                         xlabel = paste0("corrected p ", x_suffix))
})
```
```r
ordered_categories <- lapply(GC_results, function(length_subset) {
  all_sig_categories <- unlist(sapply(length_subset, rownames))
  tabled_categories  <- table(all_sig_categories)
  tabled_categories[order(tabled_categories, decreasing = TRUE)]
})
#lapply(ordered_categories, head, n = 10)
```
```r
# Convert a list of tables to a list of named vectors
convert_tbl_vec <- function(list_of_tables) {
  list_of_vec <- lapply(names(list_of_tables), function(x) {
    vector <- as.vector(list_of_tables[[x]])
    names(vector) <- names(list_of_tables[[x]])
    vector
  })
  names(list_of_vec) <- names(list_of_tables)
  list_of_vec
}
```
We can now see whether any functional categories appeared multiple times across the 100 tests. A category can only appear once per test, so the maximum number of appearances is 100.
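As a quick sanity check (a minimal sketch using the `ordered_categories` object computed above), we can confirm that no single category was counted more than 100 times in any GC set:

```r
# Maximum number of appearances of any single category within each GC set
# (should never exceed 100, i.e. once per test)
sapply(ordered_categories, function(x) {
  if (length(x) == 0) 0 else max(x)
})
```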
```r
par(mfrow = c(1, 1))
ordered_categories_vec <- convert_tbl_vec(ordered_categories)
#ordered_categories_vec <- ordered_categories_vec[match(chr_order, names(ordered_categories_vec))]

boxplot(ordered_categories_vec,
        main     = "number of times a GO category appeared during the 100 tests",
        ylab     = "no of times GO category appeared",
        xlab     = "GC category",
        pch      = 16,
        cex      = 0.5,
        col      = 1,
        cex.axis = 0.8,
        ylim     = c(0, 110))

text_labels <- sapply(ordered_categories_vec, length)
mtext("n categories", side = 2, line = -4.2, at = c(1, 108), las = 2, cex = 0.9)

# Number of distinct categories per GC set, printed above each box
text(seq_along(ordered_categories_vec),
     y      = 108,
     cex    = 0.8,
     col    = 1,
     labels = text_labels)
```
Replot just showing the data for categories that appeared >= `r min_appearances` times.
```r
filtered_categories <- lapply(ordered_categories, function(x) x[x >= min_appearances])
filtered_categories_vec <- convert_tbl_vec(filtered_categories)

boxplot(filtered_categories_vec,
        main     = paste0("GO category appearances (>=", min_appearances, ")"),
        ylab     = "no of times GO category appeared",
        xlab     = "GC",
        pch      = 16,
        cex      = 0.5,
        col      = 1,
        las      = 1,
        cex.axis = 0.8,
        ylim     = c(10, 110))

text(seq_along(filtered_categories_vec),
     y      = 108,
     cex    = 0.8,
     col    = 2,
     labels = sapply(filtered_categories_vec, length))
```
Create a dataset containing this suspect set of categories.
```r
suspects <- lapply(ordered_categories, function(x) names(x[x >= min_appearances]))
sapply(suspects, length)
```
We don't need empty files, so remove GC categories with no results.
```r
suspects <- suspects[sapply(suspects, length) > 0]
sapply(suspects, length)
```
Deduplicate
```r
# Count the categories in set1 that are (or, with not_in = TRUE, are not) also in set2
print_overlaps <- function(set1, set2, not_in = FALSE) {
  if (not_in) {
    print(sum(!set1 %in% set2))
  } else {
    print(sum(set1 %in% set2))
  }
}

with(suspects, {
  print_overlaps(vv_low_GC, very_low_GC)
  print_overlaps(vv_low_GC, low_GC)
  print_overlaps(very_low_GC, low_GC)
})
```
There are `r print_overlaps(suspects$vv_low_GC, suspects$very_low_GC)` overlapping categories between vv_low_GC and very_low_GC.

There are `r print_overlaps(suspects$vv_low_GC, suspects$low_GC)` overlapping categories between vv_low_GC and low_GC.

There are `r print_overlaps(suspects$very_low_GC, suspects$low_GC)` overlapping categories between very_low_GC and low_GC.
The duplicate terms will be removed from the more stringent categories.
```r
suspects$vv_low_GC   <- with(suspects, vv_low_GC[!vv_low_GC %in% very_low_GC])
suspects$vv_low_GC   <- with(suspects, vv_low_GC[!vv_low_GC %in% low_GC])
suspects$very_low_GC <- with(suspects, very_low_GC[!very_low_GC %in% low_GC])
sapply(suspects, length)
```
```r
# Write out one file per GC category, with unix line endings
for (i in seq_along(suspects)) {
  filename <- paste0("../data/", names(suspects)[i], ".txt")
  # Opening the connection in binary mode ("wb") stops Windows CRLF line endings being written
  output_file <- file(filename, "wb")
  write.table(file      = output_file,
              x         = suspects[[i]],
              row.names = FALSE,
              col.names = FALSE,
              quote     = FALSE)
  close(output_file)
}
```
No significant categories were returned using GC > 0.55 or 0.6, although some were found in the preliminary work; it may be that filters such as the maximum number of genes are too strict.
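As a first check (a minimal sketch, assuming the `genfo` object read in earlier is still loaded), we can count how many genes are even available above each high GC cut-off before questioning the filters in the overrepresentation test:

```r
# How many genes pass each high GC cut-off used in the preliminary work?
# (assumes genfo has been read in as above)
sapply(c(0.5, 0.55, 0.6), function(cutoff) sum(genfo$GC_content > cutoff))
```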
```bash
for i in {1..100}; do
  ../filter_gene_info.pl --max_GC 0.6 ../Mus_musculus.GRCm38.94_gene_info.txt \
    --number_of_genes 200 --output_file highGC_0.6_${i}
done
```
Extract the gene names and remove header
```bash
for i in highGC*; do
  cut -f2 $i | tail -n 200 > ${i}_just_genes.txt
done
```