library(knitr)
library(rmdformats)
library(RColorBrewer)

## Global options
options(max.print="75")
display_code <- FALSE
knitr::opts_knit$set(global.par = TRUE, width=75)
knitr::opts_chunk$set(echo=display_code,
                 cache=FALSE,
               prompt=FALSE,
               tidy=TRUE,
               comment=NA,
               message=FALSE,
               warning=FALSE)

library(devtools)
load_all()
par(mgp = c(2, 0.5, 0))
par(mar = par()$mar * 0.7)

palette(brewer.pal(6, "Paired"))

genome <- "Mus_musculus.GRCm38.94"
#Set a cut-off for the minimum number of times a category must appear in the 100 tests.
min_appearances <- 30

Generate lists of mouse genes by number of transcripts.

In the mouse genome, most genes have only one transcript, but some have many more.
Do sets of genes with high numbers of transcripts produce significant results in GO overrepresentation analysis?

Number of transcripts per gene

The pre-processed gene info file can be used to plot the distribution of the number of transcripts per gene in the r genome genome.

genfo_file_name <- paste0("../data/", genome, "_gene_info.txt")

genfo <- read.delim(genfo_file_name)
bg_genes <- as.vector(unique(genfo$gene_name))

hist(genfo$no_of_transcripts, 
     main = paste0("number of transcripts per gene in ", genome), 
     xlab = "number of transcripts", 
     col = 1
     )

As expected, most genes have very low numbers of transcripts, so we don't get much information from this plot. We can look at the information in a table format to get actual numbers.

table(genfo$no_of_transcripts)

r ceiling(table(genfo$no_of_transcripts)["1"] / nrow(genfo) *100)% of genes only have one transcript. r ceiling(table(genfo$no_of_transcripts)["2"] / nrow(genfo) *100)% of genes have two transcripts. I'm not interested in these so I'll replot the data and see how it looks without these values. Above 30 there are only a handful of genes so these can be removed from the plot to see it better.

#hist(genfo$no_of_transcripts[genfo$no_of_transcripts > 2], 
#     main = paste0("number of transcripts (> 2) per gene in ", genome), 
#     xlab = "number of transcripts (> 2)",
#     breaks = 12,
#     col = 1,
#     xlim = c(3, 30)
#     )

library("ggplot2")
genfo_temp <- genfo[genfo$no_of_transcripts > 2 & genfo$no_of_transcripts <= 30,]
ggplot(genfo_temp , aes(x = no_of_transcripts)) + geom_histogram(fill = "seagreen", binwidth = 1, color = "darkblue")

Thresholds

Originally, I selected arbitrary thresholds of minimum 10, 15 and 20 transcripts per gene. These seemed like reasonable numbers to choose and plenty of functional categories were identified in the overrepresentation tests.

It is not ideal having hardcoded values when it comes to generating further sets of genes. Selecting thresholds dependent on the distribution of the data is a more robust method for applying to different species and future versions of this genome.

probabilities <-  c(0.9, 0.95, 0.99)
transcript_thresholds <- quantile(genfo$no_of_transcripts, probs = probabilities)
gene_counts <- sapply(transcript_thresholds, function(x) sum(genfo$no_of_transcripts >= x))

thresholds <- data.frame(transcript_thresholds, probabilities, gene_counts)
wzxhzdk:7

#It worked nicely for the gene lengths to plot out the numbers on graphs but the 
#distribution here doesn't plot quite so well.
dens <- density(genfo$no_of_transcripts, adjust = 5)
plot(dens, xlim = c(-2,16))

Generating the biased gene sets

all_genes_in_group <- sapply(thresholds$transcript_thresholds, function(x){
  genfo$gene_name[genfo$no_of_transcripts >= x]
})
names(all_genes_in_group) <- thresholds$probabilities

biased_transcripts <- lapply(names(all_genes_in_group), function(x){
  sapply(1:100, function(i){
    all_genes_in_group[[x]][ceiling(runif(200, min = 0, max = length(all_genes_in_group[[x]]) - 1))]
  })
})
names(biased_transcripts) <- names(all_genes_in_group)


1. Select the subset of genes with high numbers of transcripts.

2. Randomly select 200 genes.

3. Repeat 100 times.

Repeat this for each transcript category.

The boxplot shows the distribution of transcript numbers for the genes in the biased sets.
wzxhzdk:10



GO over representation analysis

# We don't want to run the analysis each time the document is knitted as it takes
# too long. The code was run once and the data saved as an .rda object that can be 
# quickly loaded in to the R session.

save(biased_transcripts, file = "../data/biased_transcripts.rda")

x <- overrep_test(all_go_categories, query_genes = biased_transcripts[[1]][,1], bg_genes)

transcript_results <- lapply(biased_transcripts, function(transcript_subset){

  apply(transcript_subset, 2, function(query){
    overrep_test(all_go_categories, query, bg_genes)
  })
})

names(transcript_results) <- names(biased_transcripts)

save(transcript_results, file = "../data/transcript_results.rda")

The biased sets of genes were run through a GO over-representation analysis.

load("../data/transcript_results.rda")

number_of_results <- lapply(transcript_results, function(x){
  nulls_removed <- x[lapply(x,length) != 0]
  vapply(nulls_removed, nrow, FUN.VALUE = numeric(1))
})  

The table shows the number of gene sets that returned significant results from the GO overrepresentation analysis wzxhzdk:13
The bean plot shows the number of significant categories per gene list returned from the GO analysis.
wzxhzdk:14
p_and_q <- lapply(transcript_results, function(x){
  pvals <- unlist(sapply(x, `[[`, "pval"))
  qvals <- unlist(sapply(x, `[[`, "adj_pval"))
  data.frame(pvals, qvals)
})

par(mfrow = c(2, 2))

plot_density_highlight <- function(data_values, 
                                   xlabel    = "", 
                                   threshold = 0.05, 
                                   title     = "", 
                                   colour    = 1
                                   ){

  dens <- density(data_values)
  filt <- dens$x < threshold 

  plot(dens, 
    main = title, 
    xlab = xlabel,
    ylim = c(0, max(dens$y) * 1.2) 
  )
  polygon(
      c(dens$x[filt], threshold), 
      c(dens$y[filt], 0), 
      col = colour
    )
  text_label = paste0("n = ", sum(data_values < threshold))
  text(dens$x[length(dens$x)/10], 
       y = max(dens$y) * 1.1, 
       labels = text_label, 
       font = 2, 
       col  = "red2"
       )
}

sapply(names(p_and_q), function(x) {  

  x_suffix <- paste0("values, N = ", nrow(p_and_q[[x]]))

  plot_density_highlight(p_and_q[[x]]$pvals, 
                         title  = x, 
                         xlabel = paste0("p ", x_suffix)
  )

  plot_density_highlight(p_and_q[[x]]$qvals, 
                         title  = x, 
                         xlabel = paste0("corrected p ", x_suffix)
  )
})

#x_suffix <- paste0("values, N = ", nrow(min20_results))

#plot_density_highlight(min20_results$pval, xlabel = paste0("corrected p ", x_suffix), title = "min20")
#plot_density_highlight(min20_results$adj_pval, xlabel = paste0("corrected p ", x_suffix), title = "min20")
ordered_categories <- lapply(transcript_results, function(transcript_subset){

  all_sig_categories <- unlist(sapply(transcript_subset, rownames))
  tabled_categories  <- table(all_sig_categories)
  tabled_categories[order(tabled_categories, decreasing = TRUE)]
})

GO category occurrences

par(mfrow = c(1,2))
sapply(names(ordered_categories), function(x){
  plot(density(ordered_categories[[x]]), main = x)
})
par(mfrow = c(1,2))

beanplot(
  ordered_categories, 
  what   = c(0,1,0,1), 
  col    = c("#1B9E77","#06086d"), 
  ll     = 0.02, 
  method = "jitter", 
  border = "#06086d",
  las    = 1,
  log    = "",
  ylim   = c(0, 120),
  main  = "GO category appearances during the 100 tests",
  ylab  = "no of times GO category appeared"
)

text(1:length(ordered_categories), 
     y   = 115, 
     cex = 0.8, 
     labels = paste0("n = ", sapply(ordered_categories, length))
     )

filtered_categories <- lapply(ordered_categories, function(x) x[x >= min_appearances])

beanplot(
  filtered_categories, 
  what   = c(0,1,0,1), 
  col    = c("#1B9E77","#06086d"), 
  ll     = 0.03, 
  method = "jitter", 
  border = "#06086d",
  las    = 1,
  log    = "",
  ylim   = c(10, 120),
  main   = paste0("GO category appearances (>", min_appearances, ")"),
  ylab   = "no of times GO category appeared"
)

text(1:length(filtered_categories), 
     y   = 115, 
     cex = 0.8, 
     labels = paste0("n = ", sapply(filtered_categories, length))
     )

Replot just showing the data for categories that appeared >= r min_appearances times

Now we could do with some stats to pick a cutoff for the number of times a category appears. Let's just select an arbitrary value for now....

Create a dataset that contains these suspect set of categories

suspects <- lapply(ordered_categories, function(x){
  names(x[x >= min_appearances])
})  

Deduplicate

print_overlaps <- function(set1, set2, not_in = FALSE) {
  ifelse(
    not_in, 
    sum(!set1 %in% set2), 
    sum(set1 %in% set2)
  )
}
#print_overlaps(suspects$`0.99`, suspects$`0.95`)
#print_overlaps(suspects$`0.99`, suspects$`0.9`)
#print_overlaps(suspects$`0.95`, suspects$`0.9`)


#There are `r print_overlaps(suspects$`0.95`, suspects$`0.9`)` overlapping categories
#between 0.95 and 0.9.

There are r print_overlaps(suspects[["0.99"]], suspects[["0.95"]]) overlapping categories between 0.99 and 0.95.
There are r print_overlaps(suspects[["0.99"]], suspects[["0.9"]]) overlapping categories between 0.99 and 0.9.
There are r print_overlaps(suspects[["0.95"]], suspects[["0.9"]]) overlapping categories between 0.95 and 0.9.

The duplicate terms will be removed from the more stringent categories.

suspects$`0.99` <- with(suspects, `0.99`[!`0.99` %in% `0.95`])
suspects$`0.99` <- with(suspects, `0.99`[!`0.99` %in% `0.9`])
suspects$`0.95` <- with(suspects, `0.95`[!`0.95` %in% `0.9`])

The number of terms remaining after deduplication

sapply(suspects, length)

Rename the categories

names(suspects) <- c("high_transcripts", "very_high_transcripts", "vv_high_transcripts")
#eval=FALSE, echo = FALSE}
# write out file with unix line endings
for (i in 1:length(suspects)) {

  filename <- paste0("../data/", names(suspects)[i], ".txt")
  output_file <- file(filename, "wb")

  write.table(file = output_file, 
              x = suspects[[i]], 
              row.names = FALSE,
              col.names = FALSE, 
              quote = FALSE)

  close(output_file)
}


laurabiggins/GOcategoryStats documentation built on Oct. 27, 2019, 11:36 a.m.