knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 10, fig.height = 10, dpi = 50 # lower the limit so the vignette's file size is not too large )
# install from CRAN library(dplyr) library(readr) library(tidyr) library(purrr) library(ggplot2) library(DT) # install from Bioconductor library(qvalue) # load the midas R package library(midas) theme_set( theme_bw() + theme( axis.text = element_text(size = 14), axis.title = element_text(size = 20), title = element_text(size = 22), strip.text = element_text(size = 20), legend.title = element_text(size = 20), legend.text = element_text(size = 14) ))
midas_input_data_path <- system.file("extdata", "MIDAS_input_dataset.txt", package = "midas") raw_data <- read_delim(midas_input_data_path, delim = "\t") %>% rename(protein = Metabolite) proteins <- raw_data %>% dplyr::select(Protein_name:protein) measurements <- raw_data %>% dplyr::select(-c(Protein_name:Species)) pools <- measurements %>% slice(1) %>% select(-protein) %>% gather(metabolite, pool) if (!all(pools$pool %in% 1:4)) { stop ("The first row did not contain pool information, please update the input files") } log2_replicated_abundances <- measurements %>% slice(-1) %>% gather(metabolite, log2_abundance, -protein) %>% # remove missing observations filter(!is.na(log2_abundance)) util_pretty_khead(log2_replicated_abundances, nrows = 5, caption = "Example log2 fold-changes (protein bound - protein free)")
log2_replicated_abundances_outlier_filtered <- log2_replicated_abundances %>% group_by(protein, metabolite) %>% # calculate per-metabolite SD mutate(mean_log2_abundance = mean(log2_abundance)) %>% group_by(metabolite) %>% mutate(metabolite_sd = sqrt(mean((log2_abundance - mean_log2_abundance)^2) * 3/2)) %>% ungroup() %>% # filter up to one outlier point per metabolite/protein pair mutate(z_scores = (log2_abundance - mean_log2_abundance)/metabolite_sd) %>% group_by(protein, metabolite) %>% arrange(desc(abs(z_scores))) %>% mutate(disagree_rank = 1:n()) %>% filter(case_when(disagree_rank == 1 & abs(z_scores) > 5 ~ FALSE, TRUE ~ TRUE)) %>% ungroup() n_filtered <- (nrow(log2_replicated_abundances) - nrow(log2_replicated_abundances_outlier_filtered)) print(glue::glue( "{n_filtered} observations filtered {round(n_filtered/nrow(log2_replicated_abundances)*100,3)}% of observations" )) # collapse replicates by mean of replicate fold-changes log2_abundances <- log2_replicated_abundances_outlier_filtered %>% group_by(protein, metabolite) %>% summarize(log2_abundance = mean(log2_abundance)) %>% ungroup()
pool_data <- log2_abundances %>% # add pool information inner_join(pools, by = "metabolite") %>% # separate each pool nest(pool_data = -pool) %>% # remove top pcs separately for each pool mutate(correct_pool = map(pool_data, correct_pool)) %>% unnest_wider(correct_pool)
pool_data %>% select(pool, scree_data) %>% mutate(pool = glue::glue("pool {pool}")) %>% unnest(scree_data) %>% ggplot(aes(x = PC, y = VarEx)) + geom_point() + facet_wrap(~ pool) + scale_x_continuous("PC #") + scale_y_continuous("% variance explained", label = scales::percent)
Note: each plot is separately hierarchically clustered so ordering of metabolites and samples will change.
matrix_summary_plots(1, pool_data) matrix_summary_plots(2, pool_data) matrix_summary_plots(3, pool_data) matrix_summary_plots(4, pool_data)
# create a table of PCA-correct fold-changes log2_pc_projection_tbl <- pool_data %>% select(pool, log2_pc_projection_tbl) %>% unnest(log2_pc_projection_tbl) %>% select(-pool) enrichment_statistics <- log2_abundances %>% inner_join(log2_pc_projection_tbl, by = c("protein", "metabolite")) %>% nest(met_data = -metabolite) %>% # fit z-scores for enrichments using an estimate of SD from central # quantiles (75%-25%) of fold-changes for a given metabolite mutate(met_stats = map( met_data, generate_met_z_scores, sd_method = "central_quantiles", quantile_range = 0.5 )) %>% unnest(met_stats) %>% # calculate q-values from z-score p-values mutate(q_value = qvalue(p_value)$qvalues) ggplot(enrichment_statistics, aes(x = p_value)) + geom_histogram(breaks = seq(0,1,by=0.02)) + ggtitle("P-value distributions for all tested PMIs")
FDR_cutoff <- 0.1 # summary by protein enrichment_statistics %>% mutate(query_protein = factor(query_protein, levels = unique(query_protein))) %>% filter(q_value < FDR_cutoff) %>% mutate(direction = ifelse(log2_abundance_corrected > met_mean, "enriched", "depleted")) %>% count(query_protein, direction) %>% spread(direction, n, fill = 0) %>% datatable()
enrichment_statistics %>% mutate(q_label = ifelse(q_value < 0.1, "q-value < 0.1", "q-value > 0.1")) %>% select(query_protein, metabolite, log2_abundance, log2_abundance_corrected, q_label) %>% ggplot(aes(x = log2_abundance, y = log2_abundance_corrected)) + geom_hex(bins = 60) + facet_wrap(~ q_label, ncol = 1) + scale_fill_viridis_c(trans = "log2", option = "plasma") + coord_equal()+ scale_x_continuous("Raw fold-changes") + scale_y_continuous("PCA-corrected fold-changes") + ggtitle("Overview of All Changes")
# spot checking enrichments set.seed(1234) # metabolite levels sampled_metabolite_changes <- enrichment_statistics %>% filter(q_value < FDR_cutoff) %>% distinct(metabolite) %>% sample_n(8) enrichment_statistics %>% semi_join(sampled_metabolite_changes, by = "metabolite") %>% select(metabolite, query_protein, raw = log2_abundance, post_PCA = log2_abundance_corrected, q_value) %>% gather(variable, abundance, raw, post_PCA) %>% ggplot(aes(x = query_protein, y = abundance, fill = variable, alpha = q_value < FDR_cutoff)) + geom_bar(stat = "identity", position = "dodge") + facet_wrap(~ metabolite, scales = "free_y", ncol = 2) + scale_alpha_manual("Is discovery?", values = c("TRUE" = 1, "FALSE" = 0.5)) + theme(axis.text.x = element_blank()) + ggtitle("Changes of example metabolites")
sampled_proteins <- enrichment_statistics %>% distinct(query_protein) %>% sample_n(3) enrichment_statistics %>% semi_join(sampled_proteins, by = "query_protein") %>% select(metabolite, query_protein, raw = log2_abundance, post_PCA = log2_abundance_corrected, q_value) %>% gather(variable, abundance, raw, post_PCA) %>% ggplot(aes(x = metabolite, y = abundance, fill = variable, alpha = q_value < FDR_cutoff)) + geom_bar(stat = "identity", position = "dodge") + facet_wrap(~ query_protein, scales = "free_y", ncol = 1) + scale_alpha_manual("Is discovery?", values = c("TRUE" = 1, "FALSE" = 0.5)) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank()) + ggtitle("Changes of example proteins")
midas_pmis = enrichment_statistics %>% select(-met_data, -met_mean, -met_sd) stopifnot(nrow(midas_pmis %>% anti_join(midas::metabolites, by = "metabolite")) == 0) # save to packages with: # this data can be accessed with data(midas_pmis) # usethis::use_data(midas_pmis, overwrite = TRUE) util_pretty_khead(midas_pmis %>% filter(q_value < 0.1), nrows = 5, caption = "Example significant PMIs")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.