library(knitr)
library(rmdformats)
## Global options
options(max.print="75")
display_code <- FALSE
knitr::opts_knit$set(global.par = TRUE, width=75)
knitr::opts_chunk$set(echo=display_code,
                 #cache=TRUE,
               prompt=FALSE,
               tidy=TRUE,
               comment=NA,
               message=FALSE,
               warning=FALSE)

library(rmdformats)
library(devtools)
load_all()
library(RColorBrewer)
palette(brewer.pal(6, "Set2"))

par(mgp = c(2, 0.5, 0))
par(mar = par()$mar * 0.7)

genome <- "Mus_musculus.GRCm38.94"
# Set a cut-off for the minimum number of times a category must appear in the 100 tests.
min_appearances <- 30

Generate lists of mouse genes biased by the length of the genes.

genfo_file_name <- paste0("../data/", genome, "_gene_info.txt")
genfo <- read.delim(genfo_file_name)
bg_genes <- as.vector(unique(genfo$gene_name))

Lengths of all genes

The gene info file contains the lengths of all the genes in the mouse genome r genome. These can be plotted so that some thresholds for length categories can be determined. Plot out the length distribution for all the genes.

par(mfrow = c(1,2))
plot(density(log2(genfo$length)), lwd = 2, main = "", xlab = "log2 gene length")
boxplot(log2(genfo$length), ylab = "log2 gene length")

This data is displayed on a log scale as there are some very long genes that distort the graph if it is plotted on a linear scale. The plot shows a set of short genes between r 2^5 and r 2^7.6 (~5-7.5 on log2 scale). This provides a natural set of lengths to use, and can be split into a "very short" and a "short1" category. We'll also take some of the genes that are slightly longer but are below the next peak in the distribution. As the distribution of lengths for the long genes is smooth, this provides no obvious thresholds to select, so we can take the top 5% and 10% of lengths.

long <- quantile(log2(genfo$length), probs = c(0.9, 0.95))
names(long) <- c("long", "very_long")

Select gene length thresholds.

par(mfrow = c(1,1))

plot(density(log2(genfo$length)), 
     lwd  = 2, 
     main = "", 
     xlab = "log2 gene length"
)

thresholds <- c(
  very_short = 6.8, 
  short1     = 7.6, 
  short2     = 9.7, 
  long
)

more_less <- c(
  very_short = "less", 
  short1     = "less", 
  short2     = "interval", 
  long       = "more", 
  very_long  = "more"
)

sapply(names(thresholds), function(x){
  colours <- as.factor(more_less)
  abline(v = thresholds[x], col = colours[x], lwd = 2, lty = 2)
})

Thresholds

thresholds

The number of genes that remain after filtering for each length category.

dens <- density(log2(genfo$length))

par(mfrow = c(2, 3))

sapply(names(thresholds), function(length_cat){

  plot(dens, lwd = 2, xlab = "log2 gene length", main = length_cat)

  if (more_less[length_cat] == "less") {
    filt <- dens$x < thresholds[length_cat]
    polygon(
      c(dens$x[filt], thresholds[length_cat]), 
      c(dens$y[filt], 0), 
      col = "grey"
    )
    n_genes <- sum(log2(genfo$length) < thresholds[length_cat])

  } else if (more_less[length_cat] == "more") {
    filt <- dens$x > thresholds[length_cat]
    polygon(
      c(dens$x[filt], thresholds[length_cat]), 
      c(dens$y[filt], 0), 
      col = "grey"
    )
    n_genes <- sum(log2(genfo$length) > thresholds[length_cat])

  } else {
    # hard code the interval plot
     filt <- dens$x > thresholds["short1"] & dens$x < thresholds["short2"]
     polygon(
       c(thresholds["short1"], dens$x[filt], thresholds["short2"]), 
       c(0, dens$y[filt], 0), 
       col = "grey"
     )
     n_genes <- sum(log2(genfo$length) > thresholds["short1"] & log2(genfo$length) < thresholds["short2"] )
  }

  label_text <- paste0("n = ", n_genes)
  text(x = 19, y = 0.09, labels = label_text, cex = 1.5)
})

Generating the biased gene lists

biased_lengths <- lapply(names(thresholds), function(category){

  if (more_less[category] == "less") {
    filtered_genes <- genfo$gene_name[log2(genfo$length) < thresholds[category]]
  }
  else if (more_less[category] == "more") {
    filtered_genes <- genfo$gene_name[log2(genfo$length) > thresholds[category]]
  }
  else {
      # hard code the interval plot
     filtered_genes <- genfo$gene_name[log2(genfo$length) > thresholds["short1"] & 
                                         log2(genfo$length) < thresholds["short2"]]
  }
  sapply(1:100, function(i){
    filtered_genes[ceiling(runif(200, min = 0, max = length(filtered_genes) - 1))]
  })
})  

names(biased_lengths) <- names(thresholds)
save(biased_lengths, file = "M:/temp/length/length_genelists.rda")

Check they all look right

par(mfrow = c(1, 1))

lengths <- lapply(biased_lengths, function(x) {
  log2(genfo$length[match(unlist(x), genfo$gene_name)])
})

boxplot(lengths, col = 1)

The odd gene that doesn't fall within the expected size range may be due to name duplication.

GO overrepresentation analysis

Run the gene lists through a GO overrepresentation analysis.

x <- overrep_test(all_go_categories, query_genes = biased_lengths[[4]][,1], bg_genes)

gene_length_results <- lapply(biased_lengths, function(length_subset){

  apply(length_subset, 2, function(query){  
    overrep_test(all_go_categories, query, bg_genes)
  })
})

save(gene_length_results, file = "../data/gene_length_results.rda")

We don't want to run the GO analysis each time the document is knitted as it takes too long. The code was run once and the data saved as an .rda object that can be quickly loaded in to the R session.

See how many significant categories were returned.

load("../data/gene_length_results.rda")

number_of_results <- lapply(gene_length_results, function(x){
  nulls_removed <- x[lapply(x,length) != 0]
  vapply(nulls_removed, nrow, FUN.VALUE = numeric(1))
})  

The number of gene sets that returned significant results from the GO overrepresentation analysis

sapply(number_of_results, length)

The bean plots show the number of categories returned from each gene list. The null results have been removed.

par(mfrow = c(1,1))

library(beanplot)
options(scipen = 999) # disable the scientific notation

beanplot(
  number_of_results, 
  what   = c(0,1,0,1), 
  col    = c("#1B9E77","#06086d"), 
  ll     = 0.03, 
  method = "jitter", 
  border = "#06086d",
  las    = 1,
  main   = "number of significant categories per gene list returned from GO analysis"
)
#There are loads of results returned for the long genes, we'll plot out the p 
#and q values.

p_and_q <- lapply(gene_length_results, function(x){
  pvals <- unlist(sapply(x, `[[`, "pval"))
  qvals <- unlist(sapply(x, `[[`, "adj_pval"))
  data.frame(pvals, qvals)
})

par(mfrow = c(length(p_and_q), 2))


plot_density_highlight <- function(data_values, 
                                   xlabel    = "", 
                                   threshold = 0.05, 
                                   title     = "", 
                                   colour    = 3
                                   ){

  dens <- density(data_values)
  filt <- dens$x < threshold 

  plot(dens, 
    main = title, 
    xlab = xlabel,
    ylim = c(0, max(dens$y) * 1.2) 
  )
  polygon(
      c(dens$x[filt], threshold), 
      c(dens$y[filt], 0), 
      col = colour
    )
  text_label = paste0("n = ", sum(data_values < threshold))
  text(dens$x[length(dens$x)/10], 
       y = max(dens$y) * 1.1, 
       labels = text_label, 
       font = 2, 
       col  = "red2"
       )
}

sapply(names(p_and_q), function(x) {  

  x_suffix <- paste0("values, N = ", nrow(p_and_q[[x]]))

  plot_density_highlight(p_and_q[[x]]$pvals, 
                         title  = x, 
                         xlabel = paste0("p ", x_suffix)
  )

  plot_density_highlight(p_and_q[[x]]$qvals, 
                         title  = x, 
                         xlabel = paste0("corrected p ", x_suffix)
  )
})
ordered_categories <- lapply(gene_length_results, function(length_subset){

  all_sig_categories <- unlist(sapply(length_subset, rownames))
  tabled_categories  <- table(all_sig_categories)
  tabled_categories[order(tabled_categories, decreasing = TRUE)]
})

Plot how many times a category appeared.

par(mfrow = c(2, 3))
sapply(names(ordered_categories), function(x){
  plot(density(ordered_categories[[x]]), main = x)
})
par(mfrow = c(1,1))

beanplot(
  ordered_categories, 
  what   = c(0,1,0,1), 
  col    = c("#1B9E77","#06086d"), 
  ll     = 0.03, 
  method = "jitter", 
  border = "#06086d",
  las    = 1,
  log    = "",
  ylim   = c(0, 120),
  main  = "number of times a GO category appeared during the 100 tests",
  ylab  = "no of times GO category appeared"
)

text(1:length(ordered_categories), 
     y   = 115, 
     cex = 0.8, 
     labels = paste0("n = ", sapply(ordered_categories, length))
     )
filtered_categories <- lapply(ordered_categories, function(x) x[x >= min_appearances])
# convert list of table to list of vectors
convert_tbl_vec <- function(list_of_tables){
  list_of_vec <- lapply(names(list_of_tables), function(x){

    vector <- as.vector(list_of_tables[[x]])
    names(vector) <- names(list_of_tables[[x]])
    vector
  })
  names(list_of_vec) <- names(list_of_tables)
  list_of_vec
}
filtered_categories <- lapply(ordered_categories, function(x) x[x >= min_appearances])

filtered_categories_vec <- convert_tbl_vec(filtered_categories) 

boxplot(filtered_categories_vec,
        main = paste0("GO category appearances (>", min_appearances, ")"),
        ylab = "no of times GO category appeared",
        pch  = 16,
        cex  = 0.5,
        col  = 1,
        las  = 1,
        cex.axis = 0.8,
        ylim = c(10, 110)
        )

text(1:22, 
     y   = 108, 
     cex = 0.8, 
     col = 2,
     labels = sapply(filtered_categories_vec, length)
     )

Replot just showing the data for categories that appeared >= r min_appearances times

filtered_categories <- lapply(ordered_categories, function(x) x[x >= min_appearances])

beanplot(
  filtered_categories, 
  what   = c(0,1,0,1), 
  col    = c("#1B9E77","#06086d"), 
  ll     = 0.03, 
  method = "jitter", 
  border = "#06086d",
  las    = 1,
  log    = "",
  ylim   = c(10, 120),
  main   = paste0("GO category appearances (>", min_appearances, ")"),
  ylab   = "no of times GO category appeared"
)

text(1:length(ordered_categories), 
     y   = 115, 
     cex = 0.8, 
     labels = paste0("n = ", sapply(filtered_categories, length))
     )

Now we could do with some stats to pick a cutoff for the number of times a category appears. Let's just select an arbitrary value for now....

Create a dataset that contains these suspect set of categories

suspects <- lapply(ordered_categories, function(x) names(x[x >= min_appearances]))  

sapply(suspects, length)
print_overlaps <- function(set1, set2, not_in = FALSE) {
  ifelse(
    not_in, 
    print(sum(!set1 %in% set2)), 
    print(sum(set1 %in% set2))
  )
}

with(suspects, {
  print_overlaps(very_short, short1)
  #print_overlaps(very_short, short2)
  print_overlaps(very_long, long)
})

There are r print_overlaps(suspects[["very_short"]], suspects[["short1"]]) overlapping categories between very_short and short1.
There are r print_overlaps(suspects[["very_long"]], suspects[["long"]]) overlapping categories between very_long and long.

The duplicate terms will be removed from the more stringent categories.

suspects$very_long <- with(suspects, very_long[!very_long %in% long])
suspects$very_short <- with(suspects, very_short[!very_short %in% short1])

sapply(suspects, length)

Rename "short1" to "short"

names(suspects) <- gsub(names(suspects), pattern = "short1", replacement = "short")
suspects <- suspects[sapply(suspects, length) > 0]
sapply(suspects, length)
# write out file with unix line endings
for (i in 1:length(suspects)) {

  filename <- paste0("../data/", names(suspects)[i], ".txt")

  output_file <- file(filename, "wb")

  write.table(
    file      = output_file,
    x         = suspects[[i]], 
    row.names = FALSE,
    col.names = FALSE, 
    quote     = FALSE
  )

  close(output_file)
}


laurabiggins/GOcategoryStats documentation built on Oct. 27, 2019, 11:36 a.m.