load_pkgs = c('cowplot', 'ggplot2', 'gtable', 'magrittr', 'purrr', 'tufte') purrr::walk(load_pkgs, ~ library(., character.only = TRUE)) devtools::load_all() knitr::opts_chunk$set( tidy = FALSE, warning = FALSE, message = FALSE, echo = FALSE, fig.fullwidth = TRUE, fig.width = 4, fig.height = 3) options(htmltools.dir.version = FALSE)
# Wrap in .cache_ from utils! E.g. gd = .cache_('gen_data', here::here('vignettes', <Rmd_filename>), function() {<evaluate code here>; return(environment())}) gd = .cache_('gen_data', here::here('vignettes', 'ltbAnalysis'), function() { TB_human_datasets_04_2018 = readRDS(here::here("data", "TB_human_datasets_04_2018.rds")) potentialDiscoveryGSEIDs = c("GSE19491", "GSE28623", "GSE54992", "GSE37250", "GSE39939.noCultureNeg", "GSE39940") potentialValidationGSEIDs = c("GSEScribaDay0to7", "GSE101705", "GSE73408", "GSE69581") case_classes = c("Latent TB") control_classes = c("Healthy", "Active TB", "Other Disease") gses = lapply(TB_human_datasets_04_2018[c(potentialDiscoveryGSEIDs, potentialValidationGSEIDs)], function(gse) { filteredGSE = .filterGSE(gse, classesOfInterest=c(case_classes, control_classes)) filteredGSEWithClass = .addClassVec(filteredGSE, caseClasses=case_classes) filteredGSEWithClass }) %>% keep(function(gse) sum(gse$class) > 1 && sum(gse$class) < length(gse$class)-1) gsesMeta = list(originalData = gses) geneFilters = .get_gene_filter(datetime = "latest") ds_by_filters = .gen_metrics_df(gsesMeta, geneFilters) %>% dplyr::mutate(disc_or_valid = ifelse(dataset %in% potentialDiscoveryGSEIDs, 'Discovery', 'Validation')) ds_labels = .gen_cohort_labels_df(gsesMeta$originalData) %>% dplyr::mutate(disc_or_valid = ifelse(dataset %in% potentialDiscoveryGSEIDs, 'Discovery', 'Validation')) ds_labels$info = "Cohort Info" rm(TB_human_datasets_04_2018, gses, gsesMeta, envir = environment()) return(environment()) })
gene_sigs = .df_from_list_of_lists(gd$geneFilters) %>% tibble::rownames_to_column(var = 'geneSignature') %>% dplyr::mutate(forwardSearchThresh = stringr::str_match(filterDescription, 'forwardThresh=([0-9.]+)')[,2]) %>% dplyr::arrange(desc(effectSizeThresh)) gene_sigs_spread = gene_sigs %>% tidyr::unnest(posGeneNames, .drop=F) %>% dplyr::mutate(effectSize = "+") %>% tidyr::spread(posGeneNames, effectSize) %>% tidyr::unnest(negGeneNames, .drop=F) %>% dplyr::mutate(effectSize = "-") %>% tidyr::spread(negGeneNames, effectSize) %>% dplyr::arrange(desc(effectSizeThresh)) ord_by_freq = names(sort(gene_sigs_spread %>% dplyr::summarize_all(function(col) {length(which(col %in% c('+', '-')))}), decreasing = TRUE))[1:34] gene_sigs_spread[,10:43] = gene_sigs_spread[,ord_by_freq] names(gene_sigs_spread)[10:43] = ord_by_freq gene_sigs_simpl = gene_sigs_spread %>% dplyr::select(-dplyr::one_of(c('posGeneNames', 'negGeneNames', 'filterDescription', 'timestamp')))
fontfamily = 'Palatino' theme_set(theme_bw(base_size = 11) + theme( text = element_text(family = fontfamily, size = 11), panel.border = element_blank(), strip.background = element_rect(fill = 'gray92', colour = 'white'), strip.text = element_text(colour = 'gray40'), axis.ticks = element_line(colour = 'gray92'), axis.text = element_text(colour = 'gray40')))
df_and_elims = .elim_unif_cols(gene_sigs_simpl, cols = colnames(gene_sigs_simpl)[1:7]) options(knitr.kable.NA = '') knitr::kable(df_and_elims$df[,1:13], align = 'r', col.names = c(.sentence_case(colnames(df_and_elims$df)[1:2], abbreviations = c('FDR'), repl_list = list(thresh = 'threshold', pval = 'p-value')), colnames(df_and_elims$df)[3:ncol(df_and_elims$df)])[1:13])
All gene signatures had the same values for r .param_sentence(df_and_elims$elims)
. 23 least common genes (r glue::collapse(colnames(df_and_elims$df)[14:36], sep = ", ", last = " and ")
) are truncated from right end of table.
p1 = ggplot(gd$ds_by_filters, aes(metric_val, gene_filter)) + geom_point() + geom_errorbarh(aes(xmin = metric_lower, xmax = metric_upper), height = 0) + .geom_text(aes(label = signif(metric_val, 2)), hjust=0.25, vjust=1.1, colour='gray40', size=3) + xlab(NULL) + ylab(NULL) + facet_grid(disc_or_valid + dataset ~ toupper(metric_name), switch = "y", scales = "free_x", labeller = labeller(dataset = function(string) { substr(string, 1, 8)})) # g1 and g2 below are manual; will try to have automated way in future g1 = ggplotGrob(p1) %>% gtable_add_row_space(unit(c(rep(0, 16), 15, rep(0, 11)), "points")) p2 = ggplot(gd$ds_labels, aes(.abbreviate_if(group, condition = function(x) { x != 'Healthy'}, minlength = 3))) + geom_bar(width = 0.05) + xlab(NULL) + ylab(NULL) + coord_flip() + facet_grid(dataset ~ info) + theme(strip.text.y = element_blank()) g2 = ggplotGrob(p2) %>% gtable_add_row_space(unit(c(rep(0, 16), 15, rep(0, 11)), "points")) plot_grid(g1, g2, nrow = 1, rel_widths = c(2, 1))
I was ecstatic that my first run through MetaIntegrator took less than an hour (Hayley, Aditya, Francesco, Tim and others had tidied the data already). My first run through iterating on even slightly customized figures, on the other hand, was much more time-consuming.
After reading Edward Tufte's The Visual Display of Quantitative Information on good data visualization practices, I only thought 10 percent of them truly mattered. But even implementing those 10% in the figures I generate in my exploratory data analysis has been more involved than I expected. I quickly found that, among other things, it was hard to keep track of data versions and tedious to repeat hard-coded visualization code. Here are some workflow practices I have tried to solve these issues and accelerate my process:
gd = .cache_('gen_data', here::here('vignettes', <Rmd_filename>), function() {<evaluate code here>; return(environment())}
.geom_text = function(...) {
geom_text(..., family = 'Palatino')
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.