knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 10,
  fig.height = 10,
  dpi = 50 # lower the limit so the vignette's file size is not too large
)

Setup

Load R Packages

# install from CRAN
library(dplyr)
library(readr)
library(tidyr)
library(purrr)
library(ggplot2)
library(DT)
# install from Bioconductor
library(qvalue)
# load the midas R package
library(midas)
theme_set(
  theme_bw() + theme(
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20),
    title = element_text(size = 22),
    strip.text = element_text(size = 20),
    legend.title = element_text(size = 20),
    legend.text = element_text(size = 14)
    ))

Read and format MIDAS input data

midas_input_data_path <- system.file("extdata", "MIDAS_input_dataset.txt", package = "midas")

raw_data <- read_delim(midas_input_data_path, delim = "\t") %>%
  rename(protein = Metabolite)

proteins <- raw_data %>%
  dplyr::select(Protein_name:protein)

measurements <- raw_data %>%
  dplyr::select(-c(Protein_name:Species))

pools <- measurements %>%
  slice(1) %>%
  select(-protein) %>%
  gather(metabolite, pool)

if (!all(pools$pool %in% 1:4)) {
  stop ("The first row did not contain pool information, please update the input files")
}

log2_replicated_abundances <- measurements %>%
  slice(-1) %>%
  gather(metabolite, log2_abundance, -protein) %>%
  # remove missing observations
  filter(!is.na(log2_abundance))

util_pretty_khead(log2_replicated_abundances, nrows = 5, caption = "Example log2 fold-changes (protein bound - protein free)")

Normalization

Filter Outliers Then Collapse Technical Replicates

log2_replicated_abundances_outlier_filtered <- log2_replicated_abundances %>%
  group_by(protein, metabolite) %>%
  # calculate per-metabolite SD
  mutate(mean_log2_abundance = mean(log2_abundance)) %>%
  group_by(metabolite) %>%
  mutate(metabolite_sd = sqrt(mean((log2_abundance - mean_log2_abundance)^2) * 3/2)) %>%
  ungroup() %>%
  # filter up to one outlier point per metabolite/protein pair
  mutate(z_scores = (log2_abundance - mean_log2_abundance)/metabolite_sd) %>%
  group_by(protein, metabolite) %>%
  arrange(desc(abs(z_scores))) %>%
  mutate(disagree_rank = 1:n()) %>%
  filter(case_when(disagree_rank == 1 & abs(z_scores) > 5 ~ FALSE,
                   TRUE ~ TRUE)) %>%
  ungroup()

n_filtered <- (nrow(log2_replicated_abundances) - nrow(log2_replicated_abundances_outlier_filtered))

print(glue::glue(
  "{n_filtered} observations filtered
  {round(n_filtered/nrow(log2_replicated_abundances)*100,3)}% of observations"
))

# collapse replicates by mean of replicate fold-changes
log2_abundances <- log2_replicated_abundances_outlier_filtered %>%
  group_by(protein, metabolite) %>%
  summarize(log2_abundance = mean(log2_abundance)) %>%
  ungroup()

Correct For Non-Specific Binding

pool_data <- log2_abundances %>%
  # add pool information
  inner_join(pools, by = "metabolite") %>%
  # separate each pool
  nest(pool_data = -pool) %>%
  # remove top pcs separately for each pool
  mutate(correct_pool = map(pool_data, correct_pool)) %>%
  unnest_wider(correct_pool)

Create PCA Scree Plots

pool_data %>%
  select(pool, scree_data) %>%
  mutate(pool = glue::glue("pool {pool}")) %>%
  unnest(scree_data) %>%
  ggplot(aes(x = PC, y = VarEx)) +
  geom_point() +
  facet_wrap(~ pool) +
  scale_x_continuous("PC #") +
  scale_y_continuous("% variance explained", label = scales::percent)

Visualize PCA Projections and Post-PCA Corrected Data

Note: each plot is separately hierarchically clustered so ordering of metabolites and samples will change.

matrix_summary_plots(1, pool_data)
matrix_summary_plots(2, pool_data)
matrix_summary_plots(3, pool_data)
matrix_summary_plots(4, pool_data)

Statistical Analysis

Infer Protein-Metabolite Interactions (PMIs) from Corrected Fold-Changes

# create a table of PCA-correct fold-changes
log2_pc_projection_tbl <- pool_data %>%
  select(pool, log2_pc_projection_tbl) %>%
  unnest(log2_pc_projection_tbl) %>%
  select(-pool)

enrichment_statistics <- log2_abundances %>%
  inner_join(log2_pc_projection_tbl, by = c("protein", "metabolite")) %>%
  nest(met_data = -metabolite) %>%
  # fit z-scores for enrichments using an estimate of SD from central
  # quantiles (75%-25%) of fold-changes for a given metabolite
  mutate(met_stats = map(
    met_data,
    generate_met_z_scores,
    sd_method = "central_quantiles",
    quantile_range = 0.5
    )) %>%
  unnest(met_stats) %>%
  # calculate q-values from z-score p-values
  mutate(q_value = qvalue(p_value)$qvalues)

ggplot(enrichment_statistics, aes(x = p_value)) +
  geom_histogram(breaks = seq(0,1,by=0.02)) +
  ggtitle("P-value distributions for all tested PMIs")

Summarize FDR-controlled PMIs

FDR_cutoff <- 0.1

# summary by protein
enrichment_statistics %>%
  mutate(query_protein = factor(query_protein, levels = unique(query_protein))) %>%
  filter(q_value < FDR_cutoff) %>%
  mutate(direction = ifelse(log2_abundance_corrected > met_mean, "enriched", "depleted")) %>%
  count(query_protein, direction) %>%
  spread(direction, n, fill = 0) %>%
  datatable()
enrichment_statistics %>%
  mutate(q_label = ifelse(q_value < 0.1, "q-value < 0.1", "q-value > 0.1")) %>%
  select(query_protein, metabolite, log2_abundance, log2_abundance_corrected, q_label) %>%
  ggplot(aes(x = log2_abundance, y = log2_abundance_corrected)) +
  geom_hex(bins = 60) +
  facet_wrap(~ q_label, ncol = 1) +
  scale_fill_viridis_c(trans = "log2", option = "plasma") +
  coord_equal()+
  scale_x_continuous("Raw fold-changes") +
  scale_y_continuous("PCA-corrected fold-changes") +
  ggtitle("Overview of All Changes")
# spot checking enrichments
set.seed(1234)

# metabolite levels
sampled_metabolite_changes <- enrichment_statistics %>%
  filter(q_value < FDR_cutoff) %>%
  distinct(metabolite) %>%
  sample_n(8)

enrichment_statistics %>%
  semi_join(sampled_metabolite_changes, by = "metabolite") %>%
  select(metabolite, query_protein, raw = log2_abundance, post_PCA = log2_abundance_corrected, q_value) %>%
  gather(variable, abundance, raw, post_PCA) %>%
  ggplot(aes(x = query_protein, y = abundance, fill = variable, alpha = q_value < FDR_cutoff)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ metabolite, scales = "free_y", ncol = 2) +
  scale_alpha_manual("Is discovery?", values = c("TRUE" = 1, "FALSE" = 0.5)) +
  theme(axis.text.x = element_blank()) +
  ggtitle("Changes of example metabolites")
sampled_proteins <- enrichment_statistics %>%
  distinct(query_protein) %>%
  sample_n(3)

enrichment_statistics %>%
  semi_join(sampled_proteins, by = "query_protein") %>%
  select(metabolite, query_protein, raw = log2_abundance, post_PCA = log2_abundance_corrected, q_value) %>%
  gather(variable, abundance, raw, post_PCA) %>%
  ggplot(aes(x = metabolite, y = abundance, fill = variable, alpha = q_value < FDR_cutoff)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ query_protein, scales = "free_y", ncol = 1) +
  scale_alpha_manual("Is discovery?", values = c("TRUE" = 1, "FALSE" = 0.5)) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid.major.x = element_blank()) +
  ggtitle("Changes of example proteins")

Save Results

midas_pmis = enrichment_statistics %>%
  select(-met_data, -met_mean, -met_sd)

stopifnot(nrow(midas_pmis %>% anti_join(midas::metabolites, by = "metabolite")) == 0)

# save to packages with:
# this data can be accessed with data(midas_pmis)
# usethis::use_data(midas_pmis, overwrite = TRUE)

util_pretty_khead(midas_pmis %>% filter(q_value < 0.1), nrows = 5, caption = "Example significant PMIs")


KevinGHicks/MIDAS documentation built on May 12, 2022, 8:14 a.m.