benchmarks.R
In GAPGOM: GAPGOM (novel Gene Annotation Prediction and other GO Metrics)

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----load_libs, warning = FALSE, message = FALSE, include = TRUE--------------
library(GAPGOM)
library(profvis)
library(GO.db)
library(graph)
library(ggplot2)
library(reshape2)
library(Biobase)

## ---- eval=FALSE--------------------------------------------------------------
#  # Example with default dataset, take a look at the data documentation
#  # to fully grasp what's going on with making of the filter etc. (Biobase
#  # ExpressionSet)
#  
#  # keep everything that is a protein coding gene
#  filter_vector <- expset@featureData@data[(expset@featureData@data$GeneType=="protein_coding"),]$GeneID
#  # set gid and run.
#  gid <- "ENSG00000228630"
#  
#  p <- profvis::profvis({GAPGOM::expression_prediction(gid,
#                                          GAPGOM::expset,
#                                          "human",
#                                          "BP",
#                                          id_translation_df = GAPGOM::id_translation_df,
#                                          id_select_vector = filter_vector,
#                                          method = "combine", verbose = TRUE,
#                                          filter_pvals = TRUE
#                         )})
#  time <- max(p$x$message$prof$time)*10
#  mem <- max(p$x$message$prof$memalloc)

## -----------------------------------------------------------------------------
# laptop
# time
310
# mem
124.91
# ---
# server
# time
560
# mem
72.47

## ---- eval=FALSE--------------------------------------------------------------
#  # prepare the godata for mouse and some other calculations later needed in benchmarking
#  organism <- "human"
#  ontology <- "BP"
#  go_data <- GAPGOM::set_go_data(organism, ontology)

## ----topotitj, eval=FALSE-----------------------------------------------------
#  # grab 15 random GOs (for term algorithm)
#  ## sample(unique(go_data@geneAnno$GO), 15)
#  random_gos <- c("GO:0030177", "GO:0001771", "GO:0045715", "GO:0044330", "GO:0098780",
#  "GO:1901006", "GO:0061143", "GO:0060025", "GO:0015695", "GO:0090074",
#  "GO:0035445", "GO:0008595", "GO:1903634", "GO:1903826", "GO:0048389"
#  )
#  # print them for reproducability
#  ## dput(random_gos)
#  # now compare all unique random GO pairs. (105 uniques).
#  unique_pairs <- GAPGOM:::.unique_combos(random_gos, random_gos)
#  
#  times <- c()
#  mem_usages <- c()
#  for (i in seq_len(nrow(unique_pairs))) {
#    prof_toptitj <- profvis({
#      pair <- unique_pairs[i]
#      go1 <- pair[[1]]
#      go2 <- pair[[2]]
#      GAPGOM::topo_ic_sim_term(organism, ontology, go1, go2, go_data = go_data)
#    })
#    time <- max(prof_toptitj$x$message$prof$time)*10
#    mem <- max(prof_toptitj$x$message$prof$memalloc)
#    mem_usages <- c(mem_usages, mem)
#    times <- c(times, time)
#    gc()
#  }
#  times_term <- times
#  mems_term <- mem_usages

## ---- fig.width=3, fig.height=3-----------------------------------------------
times_term_df <- data.frame(GAPGOM:::benchmarks$server_times_term, 
                          GAPGOM:::benchmarks$laptop_times_term, 
                          seq_len(105)
                          )
colnames(times_term_df) <- c("server", "laptop", "n")
times_term_df_melted <- melt(times_term_df, id="n")
colnames(times_term_df_melted) <- c("n", "machine", "milliseconds")
ggplot(times_term_df_melted, aes(x=machine, y=milliseconds, colour=machine)) + 
  geom_boxplot(notch = FALSE) + 
  scale_y_continuous(breaks=pretty(times_term_df_melted$milliseconds, n = 5)) + 
  labs(title = paste(strwrap("Speed of term algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))

## ---- fig.width=3, fig.height=3-----------------------------------------------
mems_term_df <- data.frame(GAPGOM:::benchmarks$server_mems_term, 
                          GAPGOM:::benchmarks$laptop_mems_term, 
                          seq_len(105)
                          )
colnames(mems_term_df) <- c("server", "laptop", "n")
mems_term_df_melted <- melt(times_term_df, id="n")
colnames(mems_term_df_melted) <- c("n", "machine", "RAM")
ggplot(mems_term_df_melted, aes(x=machine, y=RAM, colour=machine)) + 
  geom_boxplot(notch = FALSE) + 
  scale_y_continuous(breaks=pretty(mems_term_df_melted$RAM, n = 5)) + 
  labs(title = paste(strwrap("RAM usage for gene algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))

## ---- eval=FALSE--------------------------------------------------------------
#  ## dput(sample(unique(go_data@geneAnno$ENTREZID), 5))
#  random_genes <- c("3848", "2824", "65108", "3988", "10800")
#  
#  unique_pairs <- GAPGOM:::.unique_combos(random_genes, random_genes)
#  
#  times <- c()
#  mem_usages <- c()
#  for (i in seq_len(nrow(unique_pairs))) {
#    prof_topg1g2 <- profvis({
#      pair <- unique_pairs[i]
#      gene1 <- pair[[1]]
#      gene2 <- pair[[2]]
#      GAPGOM::topo_ic_sim_genes(organism, ontology, gene1, gene2, go_data=go_data)
#    })
#    time <- max(prof_topg1g2$x$message$prof$time)*10
#    mem <- max(prof_topg1g2$x$message$prof$memalloc)
#    mem_usages <- c(mem_usages, mem)
#    times <- c(times, time)
#    gc()
#  }
#  times
#  mem_usages
#  times_gen <- times
#  mems_gen <- mem_usages

## ---- fig.width=3, fig.height=3-----------------------------------------------
times_gene_df <- data.frame(GAPGOM:::benchmarks$server_times_gen, 
                          GAPGOM:::benchmarks$laptop_times_gen, 
                          seq_len(10)
                          )
colnames(times_gene_df) <- c("server", "laptop", "n")
times_gene_df_melted <- melt(times_gene_df, id="n")
times_gene_df_melted$value <- times_gene_df_melted$value/1000 
colnames(times_gene_df_melted) <- c("n", "machine", "seconds")
ggplot(times_gene_df_melted, aes(x=machine, y=seconds, colour=machine)) + 
  geom_boxplot(notch = FALSE) + 
  labs(title = paste(strwrap("Speed of gene algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))

## ---- fig.width=3, fig.height=3-----------------------------------------------
mems_gene_df <- data.frame(GAPGOM:::benchmarks$server_mems_gen, 
                          GAPGOM:::benchmarks$laptop_mems_gen, 
                          seq_len(10)
                          )
colnames(mems_gene_df) <- c("server", "laptop", "n")
mems_gene_df_melted <- melt(mems_gene_df, id="n")
colnames(mems_gene_df_melted) <- c("n", "machine", "RAM")
ggplot(mems_gene_df_melted, aes(x=machine, y=RAM, colour=machine)) + 
  geom_boxplot(notch = FALSE) + 
  scale_y_continuous(breaks=pretty(mems_gene_df_melted$RAM, n = 5)) + 
  labs(title = paste(strwrap("RAM usage for gene algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))

## ---- eval=FALSE--------------------------------------------------------------
#  list1=c("126133","221","218","216","8854","220","219","160428","224","222","8659","501","64577","223","217","4329","10840","7915","5832")
#  times <- c()
#  mem_usages <- c()
#  for (i in seq(length(list1)-1)) {
#    sampled_list <- list1[1:(i+1)]
#    print(sampled_list)
#    p <- profvis({
#       GAPGOM::topo_ic_sim_genes(organism, ontology, sampled_list, sampled_list, drop=NULL, go_data=go_data)
#     })
#    time <- max(p$x$message$prof$time)*10
#    mem <- max(p$x$message$prof$memalloc)
#    mem_usages <- c(mem_usages, mem)
#    times <- c(times, time)
#    gc()
#  }
#  times
#  mem_usages
#  times_genset <- times
#  mems_genset <- mem_usages

## ---- fig.width=6, fig.height=3-----------------------------------------------
final_times <- data.frame(GAPGOM:::benchmarks$server_times_genset, 
                          GAPGOM:::benchmarks$laptop_times_genset, 
                          seq_along(GAPGOM:::benchmarks$laptop_times_genset)
                          )
colnames(final_times) <- c("server", "laptop", "n")
final_times_melted <- melt(final_times, id="n")
final_times_melted$value <- final_times_melted$value/1000/60
colnames(final_times_melted) <- c("Genes in geneset", "Machine", "Minutes")
p <- ggplot(data=final_times_melted, aes(x=`Genes in geneset`, y=Minutes, colour=Machine)) + 
  geom_point() + 
  labs(title = paste(strwrap("Speed of TopoICSim geneset algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))
p
p + stat_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1)

## ---- fig.width=3, fig.height=3-----------------------------------------------
mems_geneset_df <- data.frame(GAPGOM:::benchmarks$server_mems_genset, 
                          GAPGOM:::benchmarks$laptop_mems_genset, 
                          seq_len(18)
                          )
colnames(mems_geneset_df) <- c("server", "laptop", "n")
mems_geneset_df_melted <- melt(mems_geneset_df, id="n")
colnames(mems_geneset_df_melted) <- c("Genes in geneset", "machine", "RAM")
ggplot(mems_geneset_df_melted, aes(x=machine, y=RAM, colour=machine)) + 
  geom_boxplot(notch = FALSE) + 
  scale_y_continuous(breaks=pretty(mems_geneset_df_melted$RAM, n = 5)) + 
  labs(title = paste(strwrap("RAM usage for geneset algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))

## ---- fig.width=6, fig.height=3-----------------------------------------------
p <- ggplot(mems_geneset_df_melted, aes(x=`Genes in geneset`, y=RAM, colour=machine)) + 
  geom_point()  + 
  scale_y_continuous(breaks=pretty(mems_geneset_df_melted$RAM, n = 5)) + 
  labs(title = paste(strwrap("RAM usage for geneset algorithm", width = 20), collapse = "\n")) + 
  theme(plot.title = element_text(hjust = 0.5))
p
p + stat_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1)

## ---- eval=FALSE--------------------------------------------------------------
#  ## set stringsasfactors to false to ensure data is loaded properly.
#  options(stringsAsFactors = FALSE)
#  
#  ## load data
#  #
#  expdata <- read.table("~/Downloads/GSE63733_m.txt")
#  # http://software.broadinstitute.org/gsea/msigdb/cards/HALLMARK_GLYCOLYSIS.html
#  # GLYCOLOSIS HALLMARK
#  geneset <- read.table("~/Downloads/geneset_glyc.txt", sep="\n")
#  
#  colnames(expdata)[1:2] <- c("ENSEMBL", "SYMBOL") # Set important colnames
#  
#  ## make an expressionset
#  
#  expression_matrix <- as.matrix(expdata[,3:ncol(expdata)])
#  rownames(expression_matrix) <- expdata[[1]]
#  featuredat <- as.data.frame(expdata[,1:2])
#  rownames(featuredat) <- expdata[[1]]
#  expset <- ExpressionSet(expression_matrix,
#                          featureData = new("AnnotatedDataFrame",
#                          data=featuredat))
#  
#  ## make a selection on the geneset
#  geneset <- as.vector(geneset[3:nrow(geneset),])
#  geneset_ensembl <- expdata[expdata[[2]] %in% geneset,][[1]]
#  # select on the 10 genes with the most variance
#  geneset_ensembl_topvar <- names(rev(sort(apply(
#    expression_matrix[geneset_ensembl,], 1, var)))[1:10])
#  # convert back to symbol
#  geneset_selection <- expdata[expdata$ENSEMBL %in% geneset_ensembl_topvar,][[2]]
#  
#  ## predict annotation of the geneset selection (per ontology).
#  
#  ontologies <- c("MF", "BP", "CC")
#  bench_results <- list()
#  for (ontology in ontologies) {
#    # print("---")
#    # print(ontology)
#    predicted_annotations <- list()
#    for (gene in geneset_ensembl_topvar) {
#      symbol <- expdata[expdata$ENSEMBL==gene,]$SYMBOL
#  
#      result <- GAPGOM::expression_prediction(gene,
#                                            expset,
#                                            "human",
#                                            ontology,
#                                            idtype = "ENSEMBL")
#      go_annotation_prediction <- unique(as.vector(result$GOID))
#      predicted_annotations[[symbol]] <- go_annotation_prediction
#    }
#    print("Prediction finished! Calculating sims now")
#  
#    ## compare custom genes to original genes
#    tmp_go_data <- set_go_data("human", ontology, computeIC = TRUE,
#                               keytype = "SYMBOL")
#    bench_results[[ontology]] <- mapply(
#      function(gos, name) {
#        custom_gene <- list()
#        custom_gene[[paste0(name, "_pred")]] <- gos
#  
#        name_orig <- unlist(strsplit(name, "_"),
#                       FALSE, FALSE)[1]
#  
#        # print(name_orig)
#  
#        return(GAPGOM::topo_ic_sim_genes("human", ontology, c(), name_orig,
#                                     custom_genes1 = custom_gene,
#                                     idtype = "SYMBOL",
#                                     progress_bar = FALSE, go_data = tmp_go_data)$GeneSim)
#      }, gos = predicted_annotations, name = names(predicted_annotations)
#    )
#  }
#  # save(bench_results, file = "~/tmp_bench.RData")

## -----------------------------------------------------------------------------
bench_results <- GAPGOM:::benchmarks$bench_results
ontologies <- c("MF", "BP", "CC")

for (ontology in ontologies) {
  dat <- data.frame(Gene = names(bench_results[[ontology]]), 
                    sim = bench_results[[ontology]])
  brplot <- ggplot(dat, aes(x=Gene, weight=sim)) + geom_bar() + coord_flip() +
    labs(title = paste(strwrap(paste0(
      "Similarities of custom genes versus originals in ", ontology, 
      " Ontology"), 
                               width = 30), collapse = "\n")) +
    theme(plot.title = element_text(hjust = 0.5)) +
    ylab("Similarity (TopoICSim)")
  print("---")
  print(ontology)
  print("mean:")
  print(mean(bench_results[[ontology]]))
  print(brplot)
}

## -----------------------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
GAPGOM documentation built on Nov. 8, 2020, 8:08 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
GAPGOM
GAPGOM (novel Gene Annotation Prediction and other GO Metrics)

inst/doc/benchmarks.R
In GAPGOM: GAPGOM (novel Gene Annotation Prediction and other GO Metrics)

Try the GAPGOM package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GAPGOM GAPGOM (novel Gene Annotation Prediction and other GO Metrics)

inst/doc/benchmarks.R In GAPGOM: GAPGOM (novel Gene Annotation Prediction and other GO Metrics)

Try the GAPGOM package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GAPGOM
GAPGOM (novel Gene Annotation Prediction and other GO Metrics)

inst/doc/benchmarks.R
In GAPGOM: GAPGOM (novel Gene Annotation Prediction and other GO Metrics)