library(knitr)
library(rmdformats)
## Global options
options(max.print="75")
display_code <- FALSE
knitr::opts_knit$set(global.par = TRUE, width=75)
knitr::opts_chunk$set(echo=display_code,
                 #cache=TRUE,
               prompt=FALSE,
               tidy=TRUE,
               comment=NA,
               message=FALSE,
               warning=FALSE)

library(devtools)
load_all()
library("devtools")
load_all()
par(mgp = c(2, 0.5, 0))
par(mar = par()$mar * 0.7)

library(RColorBrewer)
palette(brewer.pal(6, "Set2"))

genome <- "Mus_musculus.GRCm38.94"
# Set a cut-off for the minimum number of times a category must appear in the 100 tests.
min_appearances <- 30

To check whether any gene ontology or other functional categories were associated with high or low GC content, lists of genes were generated biased by GC content.

Preliminary work looking at GC biases using intervals over the whole range of GC content in the genome only identified categories using genes with high or low GC content, not the intermediate ranges, so we're focusing on the ends of the distribution.

GC distribution for all the genes in Mus_musculus.GRCm38.94

genfo_file_name <- paste0("../data/", genome, "_gene_info.txt")

genfo <- read.delim(genfo_file_name)
bg_genes <- as.vector(unique(genfo$gene_name))
plot(density(genfo$GC_content), lwd = 2, main = "", xlab = "GC content", xlim = c(0.25,0.75))



Selecting GC thresholds

For the preliminary work, arbitrary thresholds of 0.35, 0.4, 0.5, 0.55, 0.6 GC were used.

However, selecting thresholds dependent on the distribution of the data is a more robust method for applying to different species and future versions of this genome. The GC content thresholds will be based on the distribution of GC content over all genes. The probabilities selected will be 0.01, 0.05, 0.1, 0.9, 0.95, 0.99.

library("magrittr")
probabilities <-  c(0.01, 0.05, 0.1, 0.9, 0.95, 0.99)
thresholds <- quantile(genfo$GC_content, probs = probabilities)

more_less <- c(
  vv_low_GC    = "less",
  very_low_GC  = "less", 
  low_GC       = "less", 
  high_GC      = "more", 
  very_high_GC = "more", 
  vv_high_GC   = "more"
)

names(thresholds)    <- names(more_less)
names(probabilities) <- names(more_less)

gene_counts <- sapply(names(thresholds), function(category_filter){
  if (more_less[category_filter] == "less") {
    sum(genfo$GC_content <= thresholds[category_filter])
  }
  else {
     sum(genfo$GC_content >= thresholds[category_filter])
  }
})

gene_counts_tibble <- tibble::enframe(gene_counts, name = "category", value = "gene count") %>%
  dplyr::mutate(probabilities = probabilities)
wzxhzdk:6
wzxhzdk:7



par(mfrow = c(2, 3))

dens <- density(genfo$GC_content)

sapply(names(thresholds), function(category_filter){

  if (more_less[category_filter] == "less") {

    filt <- dens$x < thresholds[category_filter]
    colour <- 1
    n_genes <- sum(genfo$GC_content < thresholds[category_filter])
    x_location <- 0.3
    title_text <- " <= "

    } else {

    filt <- dens$x > thresholds[category_filter]
    colour <- 2
    n_genes <- sum(genfo$GC_content > thresholds[category_filter])
    x_location <- 0.65
    title_text <- " >= "

    }

    plot(dens, 
       lwd  = 2, 
       xlab = "GC content", 
       xlim = c(0.2,0.75),
       main = paste0(
         category_filter, 
         ", GC",
         title_text,
         thresholds[category_filter],
         ", p",
         title_text,
         probabilities[category_filter])
    )

    polygon(
      c(dens$x[filt], thresholds[category_filter]), 
      c(dens$y[filt], 0), 
      col = colour
    )
   label_text <- paste0("n = ", n_genes)
   text(x = x_location, y = 4, labels = label_text, cex = 1.5, col = colour, font = 2)

})  



Generating the biased gene lists

The thresholds are used to filter the set of genes. 200 genes are then randomly selected from the filtered set. This is repeated 100 times to get 100 lists each containing 200 genes.

GC_genelists <- lapply(names(thresholds), function(category){

  if (more_less[category] == "less") {
    filtered_genes <- genfo$gene_name[genfo$GC_content < thresholds[category]]
  }
  else{
    filtered_genes <- genfo$gene_name[genfo$GC_content > thresholds[category]]
  }
  sapply(1:100, function(i){
    filtered_genes[ceiling(runif(200, min = 0, max = length(filtered_genes) - 1))]
  })
})  

names(GC_genelists) <- names(thresholds)

#save(GC_genelists, file = "M:/temp/GC/GC_genelists.rda")
par(mfrow = c(1, 1))

GC <- lapply(GC_genelists, function(x) {
  genfo$GC_content[match(unlist(x), genfo$gene_name)]
})

boxplot(GC, col = 1, las = 1, main = "Distribution of GC content for 100 sets of GC 
        biased genes")

GO overrepresentation analysis

The 100 sets of 200 genes for each GC category were run through a GO overrepresentation analysis.

go_results <- overrep_test(all_go_categories, GC_genelists[[1]][,1], bg_genes)
head(go_results)

GC_results <- lapply(GC_genelists, function(subset){

  apply(subset, MARGIN = 2, function(query){
    overrep_test(all_go_categories, query, bg_genes)#, mult_test = FALSE)
  })
})

#Any categories with 0 significant results can be removed.
GC_results <- GC_results[!sapply(GC_results, is.null)]

# We don't want to run the GO analysis each time the document is knitted as it takes
# too long. The code was run once and the data saved as an .rda object that can be 
# quickly loaded in to the R session.
save(GC_results, file = "../data/GC_results.rda")
# See how many significant categories were returned for each gene list.
load("../data/GC_results.rda")

number_of_results <- sapply(GC_results, function(x){
  nulls_removed <- x[lapply(x,length) != 0]
  vapply(nulls_removed, nrow, FUN.VALUE = numeric(1))
})  

The number of gene sets (out of 100) that returned significant results from the GO overrepresentation analysis.

tibble::enframe(sapply(number_of_results, length), name = "category", value = "results") %>%
  DT::datatable(options = list(dom = 't'), rownames = FALSE)



The bean plots show the number of categories returned from each gene list. The null results have been removed.

par(mfrow = c(1,1))

library(beanplot)
options(scipen = 999) # disable the scientific notation

beanplot(
  number_of_results, 
  what   = c(0,1,0,1), 
  col    = c("#1B9E77","#06086d"), 
  ll     = 0.1, 
  method = "jitter", 
  border = "#06086d",
  las    = 1,
  main   = "number of significant categories per gene list returned from GO analysis"
)

text(1:22, 
     y   = 250, 
     cex = 0.8, 
     col = 1,
     labels = sapply(number_of_results, length)
     )

All 100 sets of genes in the low GC categories returned significant results, but only 11, 14 and 5 sets returned significant results for the 3 high GC categories. We can plot out the distribution of the number of categories that were identified for each of the sets. Each horizontal line in the bean is the number of results identified for a gene list.

#Plot out the p and q values
#There are loads of results returned for the long genes, we'll plot out the p 
#and q values.
p_and_q <- lapply(GC_results, function(x){
  pvals <- unlist(sapply(x, `[[`, "pval"))
  qvals <- unlist(sapply(x, `[[`, "adj_pval"))
  data.frame(pvals, qvals)
})

par(mfrow = c(length(p_and_q), 2))


plot_density_highlight <- function(data_values, 
                                   xlabel    = "", 
                                   threshold = 0.05, 
                                   title     = "", 
                                   colour    = 3
                                   ){

  dens <- density(data_values)
  filt <- dens$x < threshold 

  plot(dens, 
    main = title, 
    xlab = xlabel,
    ylim = c(0, max(dens$y) * 1.2) 
  )
  polygon(
      c(dens$x[filt], threshold), 
      c(dens$y[filt], 0), 
      col = colour
    )
  text_label = paste0("n = ", sum(data_values < threshold))
  text(dens$x[length(dens$x)/10], 
       y = max(dens$y) * 1.1, 
       labels = text_label, 
       font = 2, 
       col  = "red2"
       )
}

sapply(names(p_and_q), function(x) {  

  x_suffix <- paste0("values, N = ", nrow(p_and_q[[x]]))

  plot_density_highlight(p_and_q[[x]]$pvals, 
                         title  = x, 
                         xlabel = paste0("p ", x_suffix)
  )

  plot_density_highlight(p_and_q[[x]]$qvals, 
                         title  = x, 
                         xlabel = paste0("corrected p ", x_suffix)
  )
})
ordered_categories <- lapply(GC_results, function(length_subset){

  all_sig_categories <- unlist(sapply(length_subset, rownames))
  tabled_categories  <- table(all_sig_categories)
  tabled_categories[order(tabled_categories, decreasing = TRUE)]
})

#lapply(ordered_categories, head, n = 10)
# convert list of table to list of vectors
convert_tbl_vec <- function(list_of_tables){
  list_of_vec <- lapply(names(list_of_tables), function(x){

    vector <- as.vector(list_of_tables[[x]])
    names(vector) <- names(list_of_tables[[x]])
    vector
  })
  names(list_of_vec) <- names(list_of_tables)
  list_of_vec
}

We can now see whether any functional categories appeared multiple times during the 100 tests. They would only appear once per test, so the maximum number of appearances should be 100.

par(mfrow = c(1,1))
ordered_categories_vec <- convert_tbl_vec(ordered_categories) 
#ordered_categories_vec <- ordered_categories_vec[match(chr_order, names(ordered_categories_vec))]

boxplot(ordered_categories_vec, 
        main = "number of times a GO category appeared during the 100 tests",
        ylab = "no of times GO category appeared",
        xlab = "GC category",
        pch = 16,
        cex = 0.5,
        col = 1,
        cex.axis = 0.8,
        ylim = c(0, 110)
        )

text_labels <- sapply(ordered_categories_vec, length)

mtext("n categories", side = 2, line = -4.2, at = c(1, 108), las = 2, cex = 0.9)
text(1:22, 
     y   = 108, 
     cex = 0.8, 
     col = 1,
     labels = text_labels
     )

Replot just showing the data for categories that appeared >= r min_appearances times

filtered_categories <- lapply(ordered_categories, function(x) x[x >= min_appearances])

filtered_categories_vec <- convert_tbl_vec(filtered_categories) 

boxplot(filtered_categories_vec,
        main = paste0("GO category appearances (>", min_appearances, ")"),
        ylab = "no of times GO category appeared",
        xlab = "GC",
        pch  = 16,
        cex  = 0.5,
        col  = 1,
        las  = 1,
        cex.axis = 0.8,
        ylim = c(10, 110)
        )

text(1:22, 
     y   = 108, 
     cex = 0.8, 
     col = 2,
     labels = sapply(filtered_categories_vec, length)
     )

Create a dataset that contains these suspect set of categories

suspects <- lapply(ordered_categories, function(x) names(x[x >= min_appearances]))  

sapply(suspects, length)

We don't need empty files so remove GC categories with no results.

suspects <- suspects[sapply(suspects, length) > 0]
sapply(suspects, length)

Deduplicate

print_overlaps <- function(set1, set2, not_in = FALSE) {
  ifelse(
    not_in, 
    print(sum(!set1 %in% set2)), 
    print(sum(set1 %in% set2))
  )
}

with(suspects, {
  print_overlaps(vv_low_GC, very_low_GC)
  print_overlaps(vv_low_GC, low_GC)
  print_overlaps(very_low_GC, low_GC)
})

There are r print_overlaps(suspects$vv_low_GC, suspects$very_low_GC) overlapping categories between vv low_GC and very_low_GC. There are r print_overlaps(suspects$vv_low_GC, suspects$low_GC) overlapping categories between vv low_GC and low_GC. There are r print_overlaps(suspects$very_low_GC, suspects$low_GC) overlapping categories between very low_GC and low_GC.

The duplicate terms will be removed from the more stringent categories.

suspects$vv_low_GC <- with(suspects, vv_low_GC[!vv_low_GC %in% very_low_GC])
suspects$vv_low_GC <- with(suspects, vv_low_GC[!vv_low_GC %in% low_GC])
suspects$very_low_GC <- with(suspects, very_low_GC[!very_low_GC %in% low_GC])
sapply(suspects, length)
# write out file with unix line endings
for (i in 1:length(suspects)) {

  filename <- paste0("../data/", names(suspects)[i], ".txt")

  output_file <- file(filename, "wb")

  write.table(
    file      = output_file,
    x         = suspects[[i]], 
    row.names = FALSE,
    col.names = FALSE, 
    quote     = FALSE
  )

  close(output_file)
}

Didn't get any significant categories using GC > 0.55 or 0.6, though I did before - it may be that the filters such as the max number of genes is too strict?

for i in {1..100}; do ../filter_gene_info.pl --max_GC 0.6 ../Mus_musculus.GRCm38.94_gene_info.txt --number_of_genes 200 --output_file highGC_0.6_${i}; done

Extract the gene names and remove header for i in highGC*; do cut -f2 $i | tail -n 200 > ${i}_just_genes.txt; done



laurabiggins/GOcategoryStats documentation built on Oct. 27, 2019, 11:36 a.m.