knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "man/figures/README-", out.width = "100%" )
The package GilOG
does a number of tasks:
First, this package uses calculates gene length and size.
Next, this package calculates correlation efficients and displays the data in a boxplot, scatterplot, regression lines, correlation efficients, both overall and Top 5.
It executes the processing using R Shiny's reactivity functionality using HTML coding in ui.R
and global.R
to improve the appearance of the UI.
You can install the released version of GilOG
with:
devtools::install_github("EdwardJGillian/GilOG")
You can run GilOG
by opening global.R
in the GilOG/inst
folder.
This is a basic example of the workflow with package functions. We will start by getting the data needed for the package:
library(dplyr) library(tidyr) library(ggplot2) library(GilOG) # There are two example files within the package: # set the file paths and file names org_file_path <- system.file("extdata", "organisms.csv", package = "GilOG", mustWork = TRUE) # org_file_path <- file.path(home, "extdata/organism.rds") gene_file_path <- system.file("extdata", "genes.csv", package = "GilOG", mustWork = TRUE) # load organism data and convert empty strings to NA organism <- read.csv(org_file_path, na.strings = "") # load genes data and convert empty strings to NA genes <- read.csv(gene_file_path, na.strings = "")
The GilOG
package loads two files organism
and genes
and validates them for number of columns and column headings.
Examples can be found in the inst/extdata
folder.
First, we will prepare the summary data count:
# Data summary count summary_count_df <- GilOG::summary_count_processing(organism, genes) head(summary_count_df)
{width=100%}
Next, we will prepare the genes per organism count:
# Genes per organism count gpo_count_df <- GilOG::gpo_processing(organism, genes) head(gpo_count_df)
{width=100%}
Next, we will prepare the genes lengths and sizes:
# create data frame with gene lengths and sizes gpo_length_size <- GilOG::gene_length_size_calc(organism, genes) head(gpo_length_size)
Next, we will calculate correlation coefficients:
# calculate correlation coefficients co_ef_df <- GilOG::cor_processing(gpo_length_size) head(co_ef_df)
{width=100%}
Next, we will calculate top 5 correlation coefficients:
# calculate correlation coefficients top 5 gpo_length_size_top_5 <- GilOG::cor_processing_top_5(gpo_length_size) head(gpo_length_size_top_5)
Next, we will calculate and display the box plot
# calculate and display box plot gpo_box <- gpo_length_size_top_5 %>% dplyr::mutate_at(vars(organism), as.factor) p <- ggplot2::ggplot(gpo_box, aes(x=organism, y=gene_length, fill=organism)) + geom_boxplot() + labs(title="Plot of length per Organism",x="Organism", y = "Length") p + theme( plot.title = element_text(color="red", size=14, face="bold.italic"), axis.title.x = element_text(color="blue", size=14, face="bold"), axis.title.y = element_text(color="#993333", size=14, face="bold") )
Next, we will calculate and display the scatterplot
# calculate and display correlation coefficients in a scatterplot GilOG::cor_scatterplot(gpo_length_size_top_5)
Next, we will calculate and display the Regression Curves
p <- ggplot2::ggplot(gpo_length_size_top_5, aes(x=gene_size, y=gene_length, color=organism)) + geom_point() + geom_smooth(method = "lm", fill = NA) + geom_smooth(method = 'lm',size = 1, colour = 'black', se = F, linetype = "dashed") + theme_bw() + labs(title="Regression Curves per Organism",x="Size", y = "Length") p + theme( plot.title = element_text(color="red", size=14, face="bold.italic"), axis.title.x = element_text(color="blue", size=14, face="bold"), axis.title.y = element_text(color="#993333", size=14, face="bold") )
Finally, tables of gene statistics per organism are displayed for the top 5 correlation coefficients and allows downloading of each table as a csv
file. The download buttons have HTML tags to style them different colours when they are clicked.
# calculate correlation coefficients top 5 - 1st place group_id <- as.double(1) top_5_coef_display_1st_place <- GilOG::gpo_output_processing(gpo_length_size_top_5, group_id) head(top_5_coef_display_1st_place)
{width=100%}
This flowchart shows the package workflow visually.
library(DiagrammeR) DiagrammeR::grViz("digraph{ graph[rankdir = LR, nodesep=.2, ranksep=.8] node[shape = rectangle, style = filled, fontsize=12] subgraph cluster_0 { graph[shape = rectangle] style = rounded bgcolor = Green fontsize = 72 label = 'Start Processing' node[shape = rectangle, fillcolor = LemonChiffon, margin = 0.1, fontsize=72] read_files[label = 'read_files', shape = folder, fillcolor = '#FF6347', fontsize=48] } subgraph cluster_1 { graph[shape = rectangle] style = rounded bgcolor = Green fontsize = 72 label = 'GilOG \n Processing Functions' node[shape = rectangle, fillcolor = LemonChiffon, margin = 0.25, fontsize = 48, nodesep=1.5] GilOG_proc_D[label = 'summary_count_processing()'] GilOG_proc_E[label = 'gpo_processing()'] GilOG_proc_F[label = 'gene_length_size_calc()'] GilOG_proc_G[label = 'cor_processing()'] GilOG_proc_H[label = 'cor_processing_top_5()'] GilOG_proc_I[label = 'gpo_output_processing()'] } subgraph cluster_3 { graph[shape = rectangle] style = rounded bgcolor = Green fontsize = 72 label = 'GilOG \n Display Functions' node[shape = rectangle, fillcolor = LemonChiffon, margin = 0.25] GilOG_disp_I[label = 'Data summary', shape = display, fillcolor = '#FF6347', fontsize = 48] GilOG_disp_J[label = 'Genes Per Organism', shape = display, fillcolor = '#FF6347', fontsize = 48] GilOG_disp_K[label = 'Correlation coefficients', shape = display, fillcolor = '#FF6347', fontsize = 48] GilOG_disp_L[label = 'Box Plot', shape = display, fillcolor = '#FF6347', fontsize = 48] GilOG_disp_M[label = 'Scatter Plot', shape = display, fillcolor = '#FF6347', fontsize = 48] GilOG_disp_N[label = 'Regression Curves', shape = display, fillcolor = '#FF6347', fontsize = 48] GilOG_disp_O[label = 'Genes Per Organism Stats - Top 5', shape = display, fillcolor = '#FF6347', fontsize = 48] } edge[color = black, arrowhead = vee, arrowsize = 1.25] read_files -> GilOG_proc_D GilOG_proc_D -> {GilOG_proc_E GilOG_disp_I} GilOG_proc_E -> {GilOG_proc_F GilOG_disp_J} GilOG_proc_F -> GilOG_proc_G GilOG_proc_G -> GilOG_proc_H GilOG_proc_H -> {GilOG_proc_I GilOG_disp_K GilOG_disp_L GilOG_disp_M GilOG_disp_N} GilOG_proc_I -> GilOG_disp_O }")
library(knitr) library(kableExtra) library(magrittr) input_table <- data.frame( "File" = c( "organism", "genes" ), "DataType" = c( "CSV", "CSV" ) ) input_html <- knitr::kable((input_table), "html") kableExtra::kable_styling(input_html, "striped", position = "left", font_size = 10) %>% kableExtra::row_spec(0, bold = TRUE, italic = TRUE, background = "red")
library(knitr) library(kableExtra) library(magrittr) OGP_function_table <- data.frame( "Function" = c( "summary_count_processing()", "gpo_processing()", "gene_length_size_calc()", "cor_processing()", "cor_processing_top_5()", "gpo_output_processing()" ), "Function Purpose" = c( "creates the a dataframe with the counts in various categories", "Create genes per organism count", "Calculates the gene length and size", "Performs correlation calculations", "Performs correlation calculations to find the 5 strongest relationships", "This function prepares output stats for the GPO stats tables by filtering the data by group ID" ) ) OGP_html <- knitr::kable((OGP_function_table), "html") kableExtra::kable_styling(OGP_html, "striped", position = "left", font_size = 10) %>% kableExtra::row_spec(0, bold = TRUE, italic = TRUE, background = "green")
As shown by the code below, a chained function using purrr::map2
is created to loop through two file lists of example data to test the core functions to be tested for reliable outputs. This function creates a series of reference files automatically.
This function uses expect_known_value
from testthat
edition 2
.
library(GilOG) library(testthat) test_chained_functions <- function(csv_file1, csv_file2) { # set testthat edition to 2 to run expect_known_value testthat::local_edition(2) test_that("check file values for parameters", { # naming helper tname <- function(n) { paste0(home, "/data/known_value/", csv, ".", n, ".rds" ) } # create file path to csv file examples csv_path1 <- paste0(home, "/data/known_csv_input/", csv_file1) csv <- stringr::str_remove(csv_file1, ".csv") df1 <- as.data.frame(read.csv(file=csv_path1, na.strings = "")) csv_path2 <- paste0(home, "/data/known_csv_input/", csv_file2) df2 <- read.csv(file=csv_path2, na.strings = "") organism <- GilOG::dataframe_preprocessing(df1) testthat::expect_known_value( organism, tname("organism")) genes <- GilOG::dataframe_preprocessing(df2) testthat::expect_known_value( genes, tname("genes")) # dplyr needs to be run inside testthat for function to work # generates a warning suppressWarnings(library(dplyr)) summary_count_df <- GilOG::summary_count_processing(organism, genes) testthat::expect_known_value( summary_count_df, tname("summary_count_df")) gpo_count_df <- GilOG::gpo_processing(organism, genes) testthat::expect_known_value( gpo_count_df, tname("gpo_count_df")) gpo_length_size <- GilOG::gene_length_size_calc(organism, genes) testthat::expect_known_value( gpo_length_size, tname("gpo_length_size")) co_ef_df <- GilOG::cor_processing(gpo_length_size) testthat::expect_known_value( co_ef_df, tname("co_ef_df")) gpo_length_size_top_5 <- GilOG::cor_processing_top_5(gpo_length_size) testthat::expect_known_value( gpo_length_size_top_5, tname("gpo_length_size_top_5")) }) } # create the ALS file path home <- setwd(Sys.getenv("HOME")) csv_file_path <- file.path(getwd()) csv_file_path <- file.path(home, "data/known_csv_input") # create organism as test1.csv and genes as test2 csv_files_list1 <- list.files(path = csv_file_path, pattern = "organisms*", full.names = FALSE) csv_files_list2 <- list.files(path = csv_file_path, pattern = "genes*", full.names = FALSE) # apply 1 list vector to the function # apply 2 list vector to function + purrr::map2(csv_files_list1, csv_files_list2, test_chained_functions)
This test saves the expected reference data as rds
files
so that the user can save the observed output as rds
files and then inspect each file for differences. The following shows an example from 1 expected output rds
file:
{width=100%}
expect_known_value
results:library(testthat) library(dplyr) library(magrittr) library(GilOG) test_results_raw1 <- testthat::test_file("tests/testthat/test-chained_function_known_value.R", reporter = testthat::ListReporter) test_results_individual1 <- test_results_raw1 %>% as_tibble() %>% dplyr::rename(Test = test) %>% dplyr::group_by(file, context, Test) %>% dplyr::summarise(NumTests = first(nb), Passed = sum(passed), Failed = sum(failed), Warnings = sum(warning), Errors = sum(as.numeric(error)), Skipped = sum(as.numeric(skipped)), .groups = "drop") summarise_results <- function(res) { res %>% dplyr::summarise_if(is.numeric, sum) %>% knitr::kable() } summarise_results(test_results_individual1)
As shown by the code below, a chained function using purrr::map2
is created to loop through two file lists of example data to test the core functions to be tested for reliable outputs. This function creates a series of snapshot reference files automatically.
This function uses expect_snapshot_value
from testthat
edition 3
.
library(GilOG) library(testthat) test_chained_functions <- function(csv_file1, csv_file2) { testthat::local_edition(3) test_that("check file values for parameters", { # create file path to csv file examples csv_path1 <- paste0(home, "/data/snapshot_csv_input/", csv_file1) csv <- stringr::str_remove(csv_file1, ".csv") df1 <- as.data.frame(read.csv(file=csv_path1, na.strings = "")) csv_path2 <- paste0(home, "/data/snapshot_csv_input/", csv_file2) df2 <- as.data.frame(read.csv(file=csv_path2, na.strings = "")) organism <- df1 testthat::expect_snapshot_value( organism, style = "json2") genes <- df2 testthat::expect_snapshot_value( genes, style = "json2") # dplyr needs to be run inside testthat for function to work # generates a warning suppressWarnings(library(dplyr)) summary_count_df <- GilOG::summary_count_processing(organism, genes) testthat::expect_snapshot_value( summary_count_df, style = "json2") gpo_length_size <- GilOG::gene_length_size_calc(organism, genes) testthat::expect_snapshot_value( gpo_length_size, style = "json2") co_ef_df <- GilOG::cor_processing(gpo_length_size) # round r values in co_ef_df for snapshot testing co_ef_df <- co_ef_df %>% dplyr::mutate(across(where(is.numeric), round, 5)) testthat::expect_snapshot_value( co_ef_df, style = "json2") gpo_length_size_top_5 <- GilOG::cor_processing_top_5(gpo_length_size) testthat::expect_snapshot_value( gpo_length_size_top_5, style = "json2") group_id <- as.double(1) gpo_1 <- GilOG::gpo_output_processing(gpo_length_size_top_5, group_id) testthat::expect_snapshot_value( gpo_1, style = "json2") group_id <- as.double(2) gpo_2 <- GilOG::gpo_output_processing(gpo_length_size_top_5, group_id) testthat::expect_snapshot_value( gpo_2, style = "json2") group_id <- as.double(3) gpo_3 <- GilOG::gpo_output_processing(gpo_length_size_top_5, group_id) testthat::expect_snapshot_value( gpo_3, style = "json2") group_id <- as.double(4) gpo_4 <- GilOG::gpo_output_processing(gpo_length_size_top_5, group_id) testthat::expect_snapshot_value( gpo_4, style = "json2") group_id <- as.double(5) gpo_5 <- GilOG::gpo_output_processing(gpo_length_size_top_5, group_id) testthat::expect_snapshot_value( gpo_5, style = "json2") }) } # create the ALS file path home <- setwd(Sys.getenv("HOME")) csv_file_path <- file.path(getwd()) csv_file_path <- file.path(home, "data/snapshot_csv_input") # create organism as test1.csv and genes as test2 csv_files_list1 <- list.files(path = csv_file_path, pattern = "organisms*", full.names = FALSE) csv_files_list2 <- list.files(path = csv_file_path, pattern = "genes*", full.names = FALSE) # apply 1 list vector to the function # apply 2 list vector to function + purrr::map2(csv_files_list1, csv_files_list2, test_chained_functions)
### Testthat `expect_snapshot_value` results: ```r library(testthat) library(dplyr) library(magrittr) library(GilOG) test_results_raw2 <- testthat::test_file("tests/testthat/test-chained_function_snapshot_value.R", reporter = testthat::ListReporter) test_results_individual2 <- test_results_raw2 %>% as_tibble() %>% dplyr::rename(Test = test) %>% dplyr::group_by(file, context, Test) %>% dplyr::summarise(NumTests = first(nb), Passed = sum(passed), Failed = sum(failed), Warnings = sum(warning), Errors = sum(as.numeric(error)), Skipped = sum(as.numeric(skipped)), .groups = "drop") summarise_results <- function(res) { res %>% dplyr::summarise_if(is.numeric, sum) %>% knitr::kable() } summarise_results(test_results_individual2)
You can run GilOG
by opening global.R
in the GilOG/inst
folder.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.