inst/doc/benchmarks.R

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----load_libs, warning = FALSE, message = FALSE, include = TRUE--------------
library(GAPGOM)
library(profvis)
library(GO.db)
library(graph)
library(ggplot2)
library(reshape2)
library(Biobase)

## ---- eval=FALSE--------------------------------------------------------------
#  # Example with default dataset, take a look at the data documentation
#  # to fully grasp what's going on with making of the filter etc. (Biobase
#  # ExpressionSet)
#  
#  # keep everything that is a protein coding gene
#  filter_vector <- expset@featureData@data[(expset@featureData@data$GeneType=="protein_coding"),]$GeneID
#  # set gid and run.
#  gid <- "ENSG00000228630"
#  
#  p <- profvis::profvis({GAPGOM::expression_prediction(gid,
#                                          GAPGOM::expset,
#                                          "human",
#                                          "BP",
#                                          id_translation_df = GAPGOM::id_translation_df,
#                                          id_select_vector = filter_vector,
#                                          method = "combine", verbose = TRUE,
#                                          filter_pvals = TRUE
#                         )})
#  time <- max(p$x$message$prof$time)*10
#  mem <- max(p$x$message$prof$memalloc)

## -----------------------------------------------------------------------------
# laptop
# time
310
# mem
124.91
# ---
# server
# time
560
# mem
72.47

## ---- eval=FALSE--------------------------------------------------------------
#  # prepare the godata for mouse and some other calculations later needed in benchmarking
#  organism <- "human"
#  ontology <- "BP"
#  go_data <- GAPGOM::set_go_data(organism, ontology)

## ----topotitj, eval=FALSE-----------------------------------------------------
#  # grab 15 random GOs (for term algorithm)
#  ## sample(unique(go_data@geneAnno$GO), 15)
#  random_gos <- c("GO:0030177", "GO:0001771", "GO:0045715", "GO:0044330", "GO:0098780",
#  "GO:1901006", "GO:0061143", "GO:0060025", "GO:0015695", "GO:0090074",
#  "GO:0035445", "GO:0008595", "GO:1903634", "GO:1903826", "GO:0048389"
#  )
#  # print them for reproducability
#  ## dput(random_gos)
#  # now compare all unique random GO pairs. (105 uniques).
#  unique_pairs <- GAPGOM:::.unique_combos(random_gos, random_gos)
#  
#  times <- c()
#  mem_usages <- c()
#  for (i in seq_len(nrow(unique_pairs))) {
#    prof_toptitj <- profvis({
#      pair <- unique_pairs[i]
#      go1 <- pair[[1]]
#      go2 <- pair[[2]]
#      GAPGOM::topo_ic_sim_term(organism, ontology, go1, go2, go_data = go_data)
#    })
#    time <- max(prof_toptitj$x$message$prof$time)*10
#    mem <- max(prof_toptitj$x$message$prof$memalloc)
#    mem_usages <- c(mem_usages, mem)
#    times <- c(times, time)
#    gc()
#  }
#  times_term <- times
#  mems_term <- mem_usages

## ---- fig.width=3, fig.height=3-----------------------------------------------
times_term_df <- data.frame(GAPGOM:::benchmarks$server_times_term, 
                          GAPGOM:::benchmarks$laptop_times_term, 
                          seq_len(105)
                          )
colnames(times_term_df) <- c("server", "laptop", "n")
times_term_df_melted <- melt(times_term_df, id="n")
colnames(times_term_df_melted) <- c("n", "machine", "milliseconds")
ggplot(times_term_df_melted, aes(x=machine, y=milliseconds, colour=machine)) + 
  geom_boxplot(notch = FALSE) + 
  scale_y_continuous(breaks=pretty(times_term_df_melted$milliseconds, n = 5)) + 
  labs(title = paste(strwrap("Speed of term algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))

## ---- fig.width=3, fig.height=3-----------------------------------------------
mems_term_df <- data.frame(GAPGOM:::benchmarks$server_mems_term, 
                          GAPGOM:::benchmarks$laptop_mems_term, 
                          seq_len(105)
                          )
colnames(mems_term_df) <- c("server", "laptop", "n")
mems_term_df_melted <- melt(times_term_df, id="n")
colnames(mems_term_df_melted) <- c("n", "machine", "RAM")
ggplot(mems_term_df_melted, aes(x=machine, y=RAM, colour=machine)) + 
  geom_boxplot(notch = FALSE) + 
  scale_y_continuous(breaks=pretty(mems_term_df_melted$RAM, n = 5)) + 
  labs(title = paste(strwrap("RAM usage for gene algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))

## ---- eval=FALSE--------------------------------------------------------------
#  ## dput(sample(unique(go_data@geneAnno$ENTREZID), 5))
#  random_genes <- c("3848", "2824", "65108", "3988", "10800")
#  
#  unique_pairs <- GAPGOM:::.unique_combos(random_genes, random_genes)
#  
#  times <- c()
#  mem_usages <- c()
#  for (i in seq_len(nrow(unique_pairs))) {
#    prof_topg1g2 <- profvis({
#      pair <- unique_pairs[i]
#      gene1 <- pair[[1]]
#      gene2 <- pair[[2]]
#      GAPGOM::topo_ic_sim_genes(organism, ontology, gene1, gene2, go_data=go_data)
#    })
#    time <- max(prof_topg1g2$x$message$prof$time)*10
#    mem <- max(prof_topg1g2$x$message$prof$memalloc)
#    mem_usages <- c(mem_usages, mem)
#    times <- c(times, time)
#    gc()
#  }
#  times
#  mem_usages
#  times_gen <- times
#  mems_gen <- mem_usages

## ---- fig.width=3, fig.height=3-----------------------------------------------
times_gene_df <- data.frame(GAPGOM:::benchmarks$server_times_gen, 
                          GAPGOM:::benchmarks$laptop_times_gen, 
                          seq_len(10)
                          )
colnames(times_gene_df) <- c("server", "laptop", "n")
times_gene_df_melted <- melt(times_gene_df, id="n")
times_gene_df_melted$value <- times_gene_df_melted$value/1000 
colnames(times_gene_df_melted) <- c("n", "machine", "seconds")
ggplot(times_gene_df_melted, aes(x=machine, y=seconds, colour=machine)) + 
  geom_boxplot(notch = FALSE) + 
  labs(title = paste(strwrap("Speed of gene algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))

## ---- fig.width=3, fig.height=3-----------------------------------------------
mems_gene_df <- data.frame(GAPGOM:::benchmarks$server_mems_gen, 
                          GAPGOM:::benchmarks$laptop_mems_gen, 
                          seq_len(10)
                          )
colnames(mems_gene_df) <- c("server", "laptop", "n")
mems_gene_df_melted <- melt(mems_gene_df, id="n")
colnames(mems_gene_df_melted) <- c("n", "machine", "RAM")
ggplot(mems_gene_df_melted, aes(x=machine, y=RAM, colour=machine)) + 
  geom_boxplot(notch = FALSE) + 
  scale_y_continuous(breaks=pretty(mems_gene_df_melted$RAM, n = 5)) + 
  labs(title = paste(strwrap("RAM usage for gene algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))

## ---- eval=FALSE--------------------------------------------------------------
#  list1=c("126133","221","218","216","8854","220","219","160428","224","222","8659","501","64577","223","217","4329","10840","7915","5832")
#  times <- c()
#  mem_usages <- c()
#  for (i in seq(length(list1)-1)) {
#    sampled_list <- list1[1:(i+1)]
#    print(sampled_list)
#    p <- profvis({
#       GAPGOM::topo_ic_sim_genes(organism, ontology, sampled_list, sampled_list, drop=NULL, go_data=go_data)
#     })
#    time <- max(p$x$message$prof$time)*10
#    mem <- max(p$x$message$prof$memalloc)
#    mem_usages <- c(mem_usages, mem)
#    times <- c(times, time)
#    gc()
#  }
#  times
#  mem_usages
#  times_genset <- times
#  mems_genset <- mem_usages

## ---- fig.width=6, fig.height=3-----------------------------------------------
final_times <- data.frame(GAPGOM:::benchmarks$server_times_genset, 
                          GAPGOM:::benchmarks$laptop_times_genset, 
                          seq_along(GAPGOM:::benchmarks$laptop_times_genset)
                          )
colnames(final_times) <- c("server", "laptop", "n")
final_times_melted <- melt(final_times, id="n")
final_times_melted$value <- final_times_melted$value/1000/60
colnames(final_times_melted) <- c("Genes in geneset", "Machine", "Minutes")
p <- ggplot(data=final_times_melted, aes(x=`Genes in geneset`, y=Minutes, colour=Machine)) + 
  geom_point() + 
  labs(title = paste(strwrap("Speed of TopoICSim geneset algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))
p
p + stat_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1)

## ---- fig.width=3, fig.height=3-----------------------------------------------
mems_geneset_df <- data.frame(GAPGOM:::benchmarks$server_mems_genset, 
                          GAPGOM:::benchmarks$laptop_mems_genset, 
                          seq_len(18)
                          )
colnames(mems_geneset_df) <- c("server", "laptop", "n")
mems_geneset_df_melted <- melt(mems_geneset_df, id="n")
colnames(mems_geneset_df_melted) <- c("Genes in geneset", "machine", "RAM")
ggplot(mems_geneset_df_melted, aes(x=machine, y=RAM, colour=machine)) + 
  geom_boxplot(notch = FALSE) + 
  scale_y_continuous(breaks=pretty(mems_geneset_df_melted$RAM, n = 5)) + 
  labs(title = paste(strwrap("RAM usage for geneset algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))

## ---- fig.width=6, fig.height=3-----------------------------------------------
p <- ggplot(mems_geneset_df_melted, aes(x=`Genes in geneset`, y=RAM, colour=machine)) + 
  geom_point()  + 
  scale_y_continuous(breaks=pretty(mems_geneset_df_melted$RAM, n = 5)) + 
  labs(title = paste(strwrap("RAM usage for geneset algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))
p
p + stat_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1)

## ---- eval=FALSE--------------------------------------------------------------
#  ## set stringsasfactors to false to ensure data is loaded properly.
#  options(stringsAsFactors = FALSE)
#  
#  ## load data
#  #
#  expdata <- read.table("~/Downloads/GSE63733_m.txt")
#  # http://software.broadinstitute.org/gsea/msigdb/cards/HALLMARK_GLYCOLYSIS.html
#  # GLYCOLOSIS HALLMARK
#  geneset <- read.table("~/Downloads/geneset_glyc.txt", sep="\n")
#  
#  colnames(expdata)[1:2] <- c("ENSEMBL", "SYMBOL") # Set important colnames
#  
#  ## make an expressionset
#  
#  expression_matrix <- as.matrix(expdata[,3:ncol(expdata)])
#  rownames(expression_matrix) <- expdata[[1]]
#  featuredat <- as.data.frame(expdata[,1:2])
#  rownames(featuredat) <- expdata[[1]]
#  expset <- ExpressionSet(expression_matrix,
#                          featureData = new("AnnotatedDataFrame",
#                          data=featuredat))
#  
#  ## make a selection on the geneset
#  geneset <- as.vector(geneset[3:nrow(geneset),])
#  geneset_ensembl <- expdata[expdata[[2]] %in% geneset,][[1]]
#  # select on the 10 genes with the most variance
#  geneset_ensembl_topvar <- names(rev(sort(apply(
#    expression_matrix[geneset_ensembl,], 1, var)))[1:10])
#  # convert back to symbol
#  geneset_selection <- expdata[expdata$ENSEMBL %in% geneset_ensembl_topvar,][[2]]
#  
#  ## predict annotation of the geneset selection (per ontology).
#  
#  ontologies <- c("MF", "BP", "CC")
#  bench_results <- list()
#  for (ontology in ontologies) {
#    # print("---")
#    # print(ontology)
#    predicted_annotations <- list()
#    for (gene in geneset_ensembl_topvar) {
#      symbol <- expdata[expdata$ENSEMBL==gene,]$SYMBOL
#  
#      result <- GAPGOM::expression_prediction(gene,
#                                            expset,
#                                            "human",
#                                            ontology,
#                                            idtype = "ENSEMBL")
#      go_annotation_prediction <- unique(as.vector(result$GOID))
#      predicted_annotations[[symbol]] <- go_annotation_prediction
#    }
#    print("Prediction finished! Calculating sims now")
#  
#    ## compare custom genes to original genes
#    tmp_go_data <- set_go_data("human", ontology, computeIC = TRUE,
#                               keytype = "SYMBOL")
#    bench_results[[ontology]] <- mapply(
#      function(gos, name) {
#        custom_gene <- list()
#        custom_gene[[paste0(name, "_pred")]] <- gos
#  
#        name_orig <- unlist(strsplit(name, "_"),
#                       FALSE, FALSE)[1]
#  
#        # print(name_orig)
#  
#        return(GAPGOM::topo_ic_sim_genes("human", ontology, c(), name_orig,
#                                     custom_genes1 = custom_gene,
#                                     idtype = "SYMBOL",
#                                     progress_bar = FALSE, go_data = tmp_go_data)$GeneSim)
#      }, gos = predicted_annotations, name = names(predicted_annotations)
#    )
#  }
#  # save(bench_results, file = "~/tmp_bench.RData")

## -----------------------------------------------------------------------------
bench_results <- GAPGOM:::benchmarks$bench_results
ontologies <- c("MF", "BP", "CC")

for (ontology in ontologies) {
  dat <- data.frame(Gene = names(bench_results[[ontology]]), 
                    sim = bench_results[[ontology]])
  brplot <- ggplot(dat, aes(x=Gene, weight=sim)) + geom_bar() + coord_flip() +
    labs(title = paste(strwrap(paste0(
      "Similarities of custom genes versus originals in ", ontology, 
      " Ontology"), 
                               width = 30), collapse = "\n")) +
    theme(plot.title = element_text(hjust = 0.5)) +
    ylab("Similarity (TopoICSim)")
  print("---")
  print(ontology)
  print("mean:")
  print(mean(bench_results[[ontology]]))
  print(brplot)
}

## -----------------------------------------------------------------------------
sessionInfo()

Try the GAPGOM package in your browser

Any scripts or data that you put into this service are public.

GAPGOM documentation built on Nov. 8, 2020, 8:08 p.m.