knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

Purpose

Hunter has this idea that we can improve sample-to-sample normalization using the relative ranking of peaks by their intensities. So this is my attempt to figure out whether that will work.

See the conclusions for my final summary.

Setup

library(dplyr)
library(visualizationQualityControl)
library(smirfeTools)
library(ggplot2)
library(rlang)
library(ggridges)
theme_set(cowplot::theme_cowplot())

Load Data

The data we have is all the peaks from each sample along with their ranks, plus the assigned and voted-on peaks across samples. The peaks and their IDs (Sample_Peak) should match up between the two data sets.

all_peaks = readRDS("testing_stuff/lung_all_peaks.rds")
voted_data = readRDS("testing_stuff/lung_voted_all.rds")
imf_data = extract_imf_emf_data(voted_data, by = "IMF")
# fractional rank: rank / number of peaks, so higher intensity means a rank closer to 1
calc_rank = function(values){
  rank(values, ties.method = "random") / length(values)
}

# median fractional rank of each IMF across samples
calc_median_rank = function(peak_df, rank_column = fractional_rank){
  dplyr::group_by(peak_df, imf) %>%
    dplyr::summarise(median_rank = median({{rank_column}}))
}
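
A quick sanity check (my addition, not part of the original analysis): higher intensities get fractional ranks closer to 1.

calc_rank(c(5, 50, 500, 5000))
# [1] 0.25 0.50 0.75 1.00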

Analysis

imf_peaks = imf_data$peaks
peak_counts = apply(imf_peaks, 1, function(.x){sum(!is.na(.x))})
n_sample = ncol(imf_peaks)

peak_frac = peak_counts / n_sample
peak_frac_df = data.frame(imf = rownames(imf_peaks), fraction = peak_frac,
                          stringsAsFactors = FALSE)

imf_2_peak = purrr::map_df(rownames(imf_peaks), function(in_row){
  data.frame(imf = in_row,
             Sample_Peak = as.vector(imf_peaks[in_row, ]),
             stringsAsFactors = FALSE)
})
imf_2_peak = dplyr::filter(imf_2_peak, !is.na(Sample_Peak))
imf_2_peak = dplyr::left_join(imf_2_peak, peak_frac_df, by = "imf")
use_imf = peak_frac >= 0.9
use_peaks = imf_peaks[use_imf, ]
split_peaks = split(all_peaks, all_peaks$Sample)
all_peaks = purrr::map_df(split_peaks, function(in_peaks){
  in_peaks$fractional_rank = calc_rank(in_peaks$Height)
  in_peaks
})
n_peak_sample = dplyr::group_by(all_peaks, Sample) %>%
  dplyr::summarise(n_peak = dplyr::n())
all_peaks = dplyr::left_join(all_peaks, n_peak_sample, by = "Sample")

imf_ranks = dplyr::left_join(imf_2_peak, all_peaks[, c("Sample_Peak", "fractional_rank", "n_peak")], by = "Sample_Peak")

imf_median_rank = imf_ranks %>% 
  dplyr::group_by(imf) %>%
  dplyr::summarise(median_org = median(fractional_rank)) %>%
  dplyr::left_join(., peak_frac_df, by = "imf")

Let's see if the data that Hunter would want to use even exists!

ggplot(imf_median_rank, aes(x = median_org, y = fraction)) + geom_point(alpha = 0.5) +
  geom_vline(xintercept = 0.5, color = "red") +
  geom_hline(yintercept = 0.8, color = "red") +
  labs(subtitle = "Median Rank across Samples vs Fraction of Total Samples",
       x = "Median Rank of Peak", y = "Fraction of Samples Present")

OK, I previously messed up the rank direction within a sample: a lower number meant a peak was more highly ranked. Now it is right! A higher number means the peak is more highly ranked (more intense) within its sample.

The next step is to look at the peaks that appear in a decent number of samples, say at least 10 (a fraction >= 0.05). Below I first zoom in around a median rank of 0.5, and then slice down to peaks with a median rank across samples between 0.6 and 0.8 and a fraction >= 0.055.

ggplot(imf_median_rank, aes(x = median_org, y = fraction)) + geom_point(alpha = 0.5) +
  xlim(0.4, 0.6) + 
  geom_hline(yintercept = 0.05, color = "red") +
  labs(subtitle = "Median Rank across Samples vs Fraction of Total Samples",
       x = "Median Rank of Peak", y = "Fraction of Samples Present")
trim_ranks = dplyr::filter(imf_median_rank, dplyr::between(median_org, 0.6, 0.8),
                           fraction >= 0.055)

use_peaks = dplyr::filter(imf_2_peak, imf %in% trim_ranks$imf) %>%
  dplyr::left_join(., all_peaks, by = "Sample_Peak")

ggplot(use_peaks, aes(x = n_peak, y = fractional_rank)) + geom_point(alpha = 0.5) +
  geom_smooth(method = "lm")
cor.test(use_peaks$n_peak, use_peaks$fractional_rank)

OK, there is definitely a positive correlation between the fractional rank of a peak and the number of peaks in the sample. Now, can we correct it and make the correlation value lower?

Let's try a simple example.

tmp_data = rnorm(1000)
tmp_ranks = calc_rank(tmp_data)

rank_df = data.frame(value = tmp_data, org_rank = tmp_ranks)
rank_df2 = rank_df[order(rank_df$org_rank), ]
# drop the 199 lowest-ranked points, keeping 801 of the 1000
rank_df2 = rank_df2[200:1000, ]
rank_df2$new_rank = calc_rank(rank_df2$value)

ggplot(rank_df2, aes(new_rank, org_rank)) + geom_point() + geom_abline(slope = 0.8, intercept = 0.2,  color = "red")

Interestingly, we can create a linear model, $rank_{org} = 0.2 + 0.8 \times rank_{new}$, that relates the two ranks. The 0.8 is exactly the fraction of points we kept (801 of 1000), and the 0.2 is the fraction we dropped.

So, this corresponds to doing the following:

$$rank_{org} = (1 - \frac{P_{sample}}{P_{max}})+ \frac{P_{sample}}{P_{max}} \times rank_{new}$$
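
As a quick check (my addition, using the toy example above): 801 of the 1000 points were kept, so $P_{sample}/P_{max} = 0.801$, and applying the formula to the new ranks should recover the original ranks exactly.

p_ratio = nrow(rank_df2) / nrow(rank_df)  # 801 / 1000
rank_df2$recovered = (1 - p_ratio) + p_ratio * rank_df2$new_rank
all.equal(rank_df2$recovered, rank_df2$org_rank)
# should be TRUE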

Let's see if we can get this to work on our actual data.

imf_ranks = dplyr::filter(imf_ranks, fraction >= 0.057) %>%
  dplyr::left_join(., imf_median_rank, by = "imf")
imf_rank_corrected = dplyr::mutate(imf_ranks, max_peaks = max(n_peak),
                                   model1 = 1 - (n_peak / max_peaks), # intercept term from the formula above
                                   model2 = n_peak / max_peaks,       # slope term
                                   lc_rank = model1 + (model2 * fractional_rank))

lc_median = imf_rank_corrected %>% dplyr::group_by(imf) %>%
  dplyr::summarise(median_lc = median(lc_rank)) %>%
  dplyr::left_join(., imf_rank_corrected, by = "imf")

ggplot(dplyr::filter(lc_median, dplyr::between(median_lc, 0.65, 0.75)), aes(x = n_peak, y = lc_rank)) +
  geom_point() +
  geom_smooth(method = "lm")

cor.test(~ lc_rank + n_peak, data = dplyr::filter(lc_median, dplyr::between(median_lc, 0.65, 0.75)))

So, we did the correction, and then we calculated new medians, and made sure to evaluate based on those new median values. And it appears to be an over-correction.

This is probably because not all the peaks are lost on the low end, so the correction isn't exactly right.
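
To see why, here is a small sketch (my addition) on the toy data from above: instead of a hard cutoff, drop points with a probability weighted by rank, so some low-rank points survive, and then apply the linear correction. The corrected ranks end up at or above the originals, i.e., an over-correction.

set.seed(5678)
# keep probability grows with rank, but low-rank points can still survive
kept = as.logical(rbinom(length(tmp_data), 1, pmin(1, tmp_ranks + 0.2)))
soft_df = data.frame(value = tmp_data[kept], org_rank = tmp_ranks[kept])
soft_df$new_rank = calc_rank(soft_df$value)
p_ratio = nrow(soft_df) / length(tmp_data)
soft_df$corrected = (1 - p_ratio) + p_ratio * soft_df$new_rank
# differences are >= 0 (up to noise): the correction pushes ranks up
summary(soft_df$corrected - soft_df$org_rank)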

An alternative method is to use a fitted linear model, and the ratio of the current predicted value to the maximum predicted value. I think that should transform our correlation to something closer to 0. However, it takes advantage of the fact that we have 200 samples to do the correction with, which is not realistic.

tmp_data = as.data.frame(dplyr::filter(lc_median, dplyr::between(median_org, 0.65, 0.75)))
fit_npeak = lm(fractional_rank ~ n_peak, data = tmp_data)
rank_lookup = data.frame(n_peak = tmp_data$n_peak, fit_rank = fit_npeak$fitted.values) %>% unique(.)
rank_lookup = rank_lookup[!duplicated(rank_lookup$n_peak), ]
rank_lookup = rank_lookup[order(rank_lookup$n_peak), ]
rank_lookup = dplyr::mutate(rank_lookup, rank_ratio = fit_rank / max(fit_rank))
lc_median = dplyr::left_join(lc_median, rank_lookup, by = "n_peak")
corrected_ratiofit = dplyr::mutate(lc_median, ratiofit_rank = fractional_rank / rank_ratio)

lc_median = corrected_ratiofit %>%
  dplyr::group_by(imf) %>%
  dplyr::summarise(median_rf = median(ratiofit_rank)) %>%
  dplyr::left_join(., corrected_ratiofit, by = "imf")
ggplot(dplyr::filter(lc_median, dplyr::between(median_rf, 0.65, 0.75)), aes(x = n_peak, y = median_rf)) + geom_point() +
  geom_smooth(method = "lm")
cor.test(~ n_peak + median_rf, data = dplyr::filter(lc_median, dplyr::between(median_rf, 0.65, 0.75)))

And the correlation is gone! The 95% CI on the test is now -0.02 to 0.031.

Verify By Simulation

So I wonder if we can verify this is what happens using a simulation. If we take say 200 samples with 4500 peaks each, and then let up to 0.6 of the mostly lower-intensity peaks get dropped, do we see the same phenomenon? The nice thing is, because we are talking about ranks (essentially quantiles), they are distribution agnostic.

set.seed(1234)
library(fakeDataWithError)
base_distribution = rnorm(4500, mean = 10, sd = 1)
# note: the noisy version is immediately overwritten below, so every sample
# starts with identical intensities and only the dropout differs
rep_data = add_uniform_noise(200, base_distribution, 0.5)
rep_data = matrix(base_distribution, nrow = length(base_distribution), ncol = 200, byrow = FALSE)
# despite the name, drop_prob is the number of peaks to *keep* per sample:
# values below 1800 are shifted up by 1800, and values above 4500 are capped
drop_prob = rnorm(200, mean = 2500, sd = 800)
low_prob = drop_prob < 1800
drop_prob[low_prob] = drop_prob[low_prob] + 1800
hi_prob = drop_prob > 4500
drop_prob[hi_prob] = 4500
colnames(rep_data) = paste0("s", seq(1, ncol(rep_data)))
rownames(rep_data) = paste0("f", seq(1, nrow(rep_data)))
drop_prob = floor(drop_prob)

My strategy for this is going to be:

  * Coin toss for each peak.
  * The probability of being kept in the coin toss is directly weighted by its relative rank, so high-intensity peaks are rarely discarded.
  * Continue tossing until we reach the target number of points to keep (drop_prob above).
  * This should allow low-intensity points to be kept at random and also remove random points above a rank of 0.5.

removed_points = purrr::map_df(seq(1, ncol(rep_data)), function(in_col){
  #message(in_col)
  tmp_data = rep_data[, in_col]
  tmp_rank = calc_rank(tmp_data)
  names(tmp_rank) = names(tmp_data)

  get_loc = 1
  # drop_prob[in_col] is the number of peaks to keep in this sample
  if (drop_prob[in_col] > length(tmp_data)/2) {
    # keeping more than half: start with everything kept and discard peaks,
    # where the chance of staying kept is the peak's fractional rank
    keep_or_drop = rep(TRUE, length(tmp_rank))
    while (sum(keep_or_drop) > drop_prob[in_col]) {
      if (get_loc > length(tmp_rank)) {
        get_loc = 1
      }
      if (keep_or_drop[get_loc]) {
        keep_or_drop[get_loc] = sample(c(TRUE, FALSE), 1, prob = c(tmp_rank[get_loc], 1 - tmp_rank[get_loc]))
      }
      get_loc = get_loc + 1
    }
  } else {
    # keeping less than half: start with everything dropped and add peaks back,
    # again weighting the chance of keeping by the fractional rank
    keep_or_drop = rep(FALSE, length(tmp_rank))
    while (sum(keep_or_drop) <= drop_prob[in_col]) {

      if (get_loc > length(tmp_rank)) {
        get_loc = 1
      }
      if (!(keep_or_drop[get_loc])) {
        keep_or_drop[get_loc] = sample(c(TRUE, FALSE), 1, prob = c(tmp_rank[get_loc], 1 - tmp_rank[get_loc]))
      }

      get_loc = get_loc + 1
    #message(sum(keep_or_drop))
    }
  }

  out_data = data.frame(Peak = names(tmp_rank),
                        Sample = colnames(rep_data)[in_col],
                        Height = tmp_data,
                        org_rank = tmp_rank,
                        stringsAsFactors = FALSE)
  out_data$Sample_Peak = paste0(out_data$Sample, "_", out_data$Peak)
  out_data = out_data[keep_or_drop, ]
  out_data$new_rank = calc_rank(out_data$Height)
  out_data
})

OK, and now we can check things out!

removed_npeak = dplyr::group_by(removed_points, Sample) %>%
  dplyr::summarize(n_peak = dplyr::n())
removed_points = dplyr::left_join(removed_points, removed_npeak, by = "Sample")
n_insample = dplyr::group_by(removed_points, Peak) %>%
  dplyr::summarise(n_sample = dplyr::n(),
                   median_rank = median(new_rank))
keep_removed = dplyr::filter(n_insample, n_sample >= 10)
removed_keep = dplyr::left_join(keep_removed, removed_points, by = "Peak")
ggplot(dplyr::filter(removed_keep, dplyr::between(median_rank, 0.65, 0.75)), aes(x = n_peak, y = new_rank)) + geom_point() +
  geom_smooth(method = "lm")

Cool!! It matches, at least somewhat, what we were seeing previously.

Correction of Simulated Data

Now we can try applying the same correction we derived above on our simulated data.

corrected_keep = dplyr::mutate(removed_keep, max_peaks = max(n_peak),
                             model1 = 1 - (n_peak / max_peaks),
                             model2 = n_peak / max_peaks)
corrected_keep = dplyr::mutate(corrected_keep, lc_rank = model1 + (model2 * new_rank))

removed_keep = corrected_keep %>%
  dplyr::group_by(Peak) %>%
  dplyr::summarise(median_lc = median(lc_rank)) %>%
  dplyr::left_join(., corrected_keep, by = "Peak")
ggplot(dplyr::filter(removed_keep, dplyr::between(median_lc, 0.65, 0.75)), aes(x = n_peak, y = lc_rank)) + geom_point() + geom_smooth(method = "lm")
cor.test(~ n_peak + lc_rank, dplyr::filter(removed_keep, dplyr::between(median_lc, 0.65, 0.75)))

Yeah, still not working.

What if we try to find the ratio $\frac{n_{peak}}{max_{peak}}$ first?

removed_keep = dplyr::mutate(removed_keep, ratio_peaks = n_peak / max_peaks,
                             ratio_rank = new_rank / org_rank)
ggplot(dplyr::filter(removed_keep, dplyr::between(median_rank, 0.65, 0.75)), aes(x = ratio_peaks, y = ratio_rank)) + geom_point() + geom_smooth(method = "lm")

ratio_fit = lm(ratio_rank ~ ratio_peaks, data = dplyr::filter(removed_keep, dplyr::between(median_rank, 0.65, 0.75)))
# the 0.61 intercept and 0.42 slope are hard-coded from the ratio_fit coefficients above
rr_corrected_keep = dplyr::mutate(removed_keep, rr_rank = new_rank / (0.61 + 0.42*ratio_peaks))

removed_keep = rr_corrected_keep %>%
  dplyr::group_by(Peak) %>%
  dplyr::summarise(rr_median = median(rr_rank)) %>%
  dplyr::left_join(., rr_corrected_keep, by = "Peak")

ggplot(dplyr::filter(removed_keep, dplyr::between(rr_median, 0.65, 0.75)), aes(x = n_peak, y = rr_rank)) + geom_point() + geom_smooth(method = "lm")
cor.test(~ n_peak + rr_rank, dplyr::filter(removed_keep, dplyr::between(rr_median, 0.65, 0.75)))

Good thing Hunter realized I wasn't plotting what I thought I was, because this last solution doesn't work. Crud.

So right now the best option is the one that involves the fit to the whole data, I think. Let's see if that works here.

tmp_sim = dplyr::filter(removed_keep, dplyr::between(median_rank, 0.65, 0.75))
tmp_fit = lm(new_rank ~ n_peak, data = tmp_sim)
sim_lookup = data.frame(n_peak = tmp_sim$n_peak, fit_rank = tmp_fit$fitted.values) %>% unique(.)
sim_lookup = sim_lookup[!duplicated(sim_lookup$n_peak), ]
sim_lookup = dplyr::mutate(sim_lookup, rank_ratio = fit_rank / max(fit_rank))
removed_keep = dplyr::left_join(removed_keep, sim_lookup, by = "n_peak")

corrected_lookup = dplyr::mutate(removed_keep, ll_rank = new_rank / rank_ratio)
removed_keep = corrected_lookup %>%
  dplyr::group_by(Peak) %>%
  dplyr::summarise(ll_median = median(ll_rank)) %>%
  dplyr::left_join(., corrected_lookup, by = "Peak")

ggplot(dplyr::filter(removed_keep, dplyr::between(ll_median, 0.65, 0.75)), aes(x = n_peak, y = ll_rank)) + 
  geom_point() + geom_smooth(method = "lm")

Nope, that doesn't even work on the simulated data. That doesn't necessarily mean it's a bad option, though; it might be related to the structure of our simulated data compared to the real data.

How Does This Affect Normalization?

We are going to calculate some metrics after we do normalization a couple of different ways.

Metrics

  1. Relative Standard Deviation (RSD) of peak intensities
  2. P-values of non-cancer vs cancer
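
For reference, the RSD for each IMF within a class is just the standard deviation scaled by the mean, exactly what the rsd = sd / mean lines in the code below compute:

$$RSD = \frac{sd}{mean}$$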

Normalization Methods

  1. Median peak intensity (median)
  2. Peak ranked at 0.5 in a sample (n50)
  3. Peak ranked at 0.7 in a sample (n70)
  4. Peak ranked most consistently at 0.5 / 0.7 (close50/70)
  5. Sum of up to 200 peaks that are <= 0.9 rank (under90), using either:
     - original ranks (org)
     - ratiofit ranks (fit)
     - analytical correction (anal)

Do It!

lc_median_intensity = dplyr::left_join(lc_median, all_peaks[, c("Sample_Peak", "Height", "Sample")], by = "Sample_Peak")
lc_median_intensity = dplyr::mutate(lc_median_intensity,
                                    type = dplyr::case_when(
                                      grepl("Npos.*", Sample, ignore.case = TRUE) ~ "non-cancer",
                                      TRUE ~ "cancer"
                                    ))
raw_rsd = lc_median_intensity %>% 
  dplyr::group_by(imf, type) %>%
  dplyr::summarise(mean = mean(Height),
                   sd = sd(Height),
                   rsd = sd / mean,
                   which = "raw")
lung_json = readRDS("testing_stuff/lung_all_json.rds")
lung_norm = purrr::map_df(lung_json, function(in_json){
  data.frame(Value = in_json$peak$other_info$median_intensity,
             Sample = gsub(".zip$", "", in_json$zip$file),
             stringsAsFactors = FALSE)
})

mediannorm_rsd = dplyr::left_join(lc_median_intensity, lung_norm, by = "Sample") %>%
  dplyr::mutate(Height_mediannorm = Height / Value) %>%
  dplyr::group_by(imf, type) %>%
  dplyr::summarise(mean = mean(Height_mediannorm),
                   sd = sd(Height_mediannorm),
                   rsd = sd / mean,
                   which = "median")
# normalize each sample by the peak whose ratiofit rank is closest to `rank`,
# then compute per-IMF, per-class RSDs of the normalized intensities
calc_rank_rsd = function(lc_median_intensity, rank = 0.5, label = "n50"){
  rank_norm = lc_median_intensity %>%
    dplyr::mutate(diff_rank = abs(ratiofit_rank - {{rank}})) %>%
    dplyr::group_by(Sample) %>%
    dplyr::arrange(diff_rank) %>%
    dplyr::slice(1) %>%
    dplyr::select(Height, Sample) %>%
    dplyr::mutate(Value = Height)
  norm_rsd = dplyr::left_join(lc_median_intensity, rank_norm[, c("Sample", "Value")], by = "Sample") %>%
    dplyr::mutate(HeightNorm = Height / Value) %>%
    dplyr::group_by(imf, type) %>%
    dplyr::summarise(mean = mean(HeightNorm),
                     sd = sd(HeightNorm),
                     rsd = sd / mean,
                     which = {{label}})
  norm_rsd
}

df_norm = data.frame(rank = c(0.1, 0.25, 0.33, 0.5, 0.7, 0.9),
                     label = c("n10", "n25", "n33", "n50", "n70", "n90"),
                     stringsAsFactors = FALSE)

nrank_rsd = purrr::map_df(seq(1, nrow(df_norm)), function(in_row){
  calc_rank_rsd(lc_median_intensity, df_norm[in_row, "rank"], df_norm[in_row, "label"])
})
n_imf = lc_median_intensity %>%
  dplyr::group_by(imf) %>%
  dplyr::summarise(n_sample = dplyr::n())

use_imf = dplyr::filter(n_imf, n_sample >= 150)
lc_use_imf = dplyr::left_join(use_imf, lc_median_intensity, by = "imf") %>%
  dplyr::mutate(diff_50 = abs(ratiofit_rank - 0.5),
                diff_70 = abs(ratiofit_rank - 0.7))
most_close50 = lc_use_imf %>%
  dplyr::group_by(imf) %>%
  dplyr::summarise(sum_50 = sum(diff_50)) %>%
  dplyr::arrange(sum_50) %>%
  dplyr::slice(1)
norm_close50 = dplyr::left_join(most_close50, lc_median_intensity, by = "imf") %>%
  dplyr::select(Height, Sample) %>%
  dplyr::mutate(Value = Height)
norm_close50_rsd = dplyr::left_join(norm_close50[, c("Sample", "Value")], lc_median_intensity, by = "Sample") %>%
  dplyr::mutate(Height_close50 = Height / Value) %>%
  dplyr::group_by(imf, type) %>%
  dplyr::summarise(mean = mean(Height_close50),
                   sd = sd(Height_close50),
                   rsd = sd / mean,
                   which = "close50")

most_close70 = lc_use_imf %>%
  dplyr::group_by(imf) %>%
  dplyr::summarise(sum_70 = sum(diff_70)) %>%
  dplyr::arrange(sum_70) %>%
  dplyr::slice(1)
norm_close70 = dplyr::left_join(most_close70, lc_median_intensity, by = "imf") %>%
  dplyr::select(Height, Sample) %>%
  dplyr::mutate(Value = Height)
norm_close70_rsd = dplyr::left_join(norm_close70[, c("Sample", "Value")], lc_median_intensity, by = "Sample") %>%
  dplyr::mutate(Height_close70 = Height / Value) %>%
  dplyr::group_by(imf, type) %>%
  dplyr::summarise(mean = mean(Height_close70),
                   sd = sd(Height_close70),
                   rsd = sd / mean,
                   which = "close70")

Compare RSDs

# value at which the density estimate of `values` peaks (the mode)
mode_value = function(values){
  values = values[!is.na(values)]
  density_estimate = stats::density(values)
  density_estimate$x[which.max(density_estimate$y)]
}
# height of the density estimate at the mode
mode_peak = function(values){
  values = values[!is.na(values)]
  density_estimate = stats::density(values)
  max(density_estimate$y)
}
# fraction of values at or below the mode
mode_fraction = function(values){
  values = values[!is.na(values)]
  density_estimate = stats::density(values)
  mode_value = density_estimate$x[which.max(density_estimate$y)]
  sum(values <= mode_value) / length(values)
}
all_rsd = rbind(nrank_rsd, mediannorm_rsd, raw_rsd, norm_close70_rsd, norm_close50_rsd)
all_rsd$which_type = paste0(all_rsd$type, "_", all_rsd$which)

ggplot(dplyr::filter(all_rsd, type %in% "non-cancer"), aes(x = rsd, y = which_type)) + geom_density_ridges() +
  coord_cartesian(xlim = c(0, 1.5)) + geom_vline(xintercept = 0.33, color = "red") +
  labs(caption = "Density RSDs using each type of normalization, with a red line at 0.33, \nthe mode for non-cancer median, n50 and n70")
knitr::kable(dplyr::group_by(all_rsd, which_type) %>%
  dplyr::summarise(median_rsd = median(rsd, na.rm = TRUE)), digits = 3)

knitr::kable(dplyr::group_by(all_rsd, which_type) %>%
  dplyr::summarise(mode = mode_value(rsd),
                   height = mode_peak(rsd),
                   fraction = mode_fraction(rsd)), digits = 2)

Calculate & Compare P-Values

# t-test between the two classes; if either class has fewer than 3 values,
# fall back to a p-value of 1
calc_ttest = function(data, sample_class){
  split_data = split(data, sample_class)
  n_split = purrr::map_int(split_data, length)
  if (sum(n_split >= 3) == 2){
    t_res = t.test(split_data[[1]], split_data[[2]])
    p_value = t_res$p.value
  } else {
    p_value = 1
  }
  p_value
}
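
A quick illustration (my own toy data) of the fallback: with fewer than 3 values in a class, we get a p-value of 1 back.

set.seed(42)
toy_values = c(rnorm(5, mean = 1), rnorm(5, mean = 2))
toy_class = rep(c("a", "b"), each = 5)
calc_ttest(toy_values, toy_class)           # a real t-test p-value
calc_ttest(toy_values[1:7], toy_class[1:7]) # only 2 "b" values, returns 1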

calculate_pvalues = function(lc_median_intensity, rank = 0.5, label = "n50"){
  rank_norm = lc_median_intensity %>%
    dplyr::mutate(diff_rank = abs(ratiofit_rank - {{rank}})) %>%
    dplyr::group_by(Sample) %>%
    dplyr::arrange(diff_rank) %>%
    dplyr::slice(1) %>%
    dplyr::select(Height, Sample) %>%
    dplyr::mutate(Value = Height)
  norm_intensity = dplyr::left_join(lc_median_intensity, rank_norm[, c("Sample", "Value")], by = "Sample") %>%
    dplyr::mutate(HeightNorm = Height / Value)

  imf_pvalues = norm_intensity %>%
    dplyr::group_by(imf) %>%
    dplyr::summarise(p_value = -1 * log10(calc_ttest(HeightNorm, type)),
                     label = label)

  imf_pvalues
}
mediannorm_pvalues = dplyr::left_join(lc_median_intensity, lung_norm, by = "Sample") %>%
  dplyr::mutate(HeightNorm = Height / Value) %>%
  dplyr::group_by(imf) %>%
  dplyr::summarise(p_value = -1 * log10(calc_ttest(HeightNorm, type)),
                   label = "median")

othernorm_pvalues = purrr::map(seq(1, nrow(df_norm)), function(in_row){
  calculate_pvalues(lc_median_intensity, df_norm[in_row, "rank"], df_norm[in_row, "label"])
})

compared_pvalues = purrr::map_df(othernorm_pvalues, function(in_pvalue){
  compare_table = dplyr::left_join(mediannorm_pvalues, in_pvalue, by = "imf")
  compare_table = dplyr::filter(compare_table, !((p_value.x == 0) & (p_value.y == 0)))
  compare_table = compare_table %>%
    dplyr::mutate(diff = p_value.y - p_value.x,
                  label = paste0(in_pvalue$label[1], "_", mediannorm_pvalues$label[1]))
  compare_table
})

ggplot(compared_pvalues, aes(x = diff)) + geom_histogram(bins = 60) + coord_cartesian(xlim = c(-0.5, 0.5)) +
  facet_wrap(~ label, nrow = 1) +
  labs(caption = "Positive differences mean the rank normalization gave lower (better) p-values than median")

compared_pvalues %>%
  dplyr::group_by(label) %>%
  dplyr::summarise(n_same = sum(dplyr::between(diff, -0.1, 0.1)),
                   n_better = sum(diff > 0.1),
                   n_worse = sum(diff < -0.1))

So this doesn't seem like we are making substantive changes: however many peaks we make better, we always make more of them worse.

Different Normalization: Up To 90th

We have one more suggestion from Hunter: find the peak at the 0.9 rank cutoff, sum its intensity together with the 199 peaks below it (200 peaks in total), and use that sum for normalization.

# sum up to n_peak intensities at and below the rank_cutoff, for use as a
# normalization value; assumes values and ranks arrive sorted by rank
calc_ranknorm = function(values, ranks, sample, rank_cutoff = 0.9, n_peak = 200){
  #message(sample[1])
  # index of the highest-ranked peak at or below the cutoff
  match_rank = max(ranks[ranks <= rank_cutoff])
  rank_index = which(ranks == match_rank)
  if (rank_index < n_peak) {
    n_peak = rank_index
  }
  use_values = seq(rank_index, rank_index - n_peak + 1, -1)
  sum(values[use_values])
}
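
A small worked example (mine) with 10 peaks already sorted by rank and a smaller n_peak, so the windowing is visible:

toy_height = seq(10, 100, by = 10)
toy_rank = (1:10) / 10
calc_ranknorm(toy_height, toy_rank, "toy", rank_cutoff = 0.9, n_peak = 4)
# sums the 4 peaks at ranks 0.6 - 0.9: 60 + 70 + 80 + 90 = 300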

sum_org90 = lc_median_intensity %>%
  dplyr::group_by(Sample) %>%
  dplyr::arrange(fractional_rank) %>%
  dplyr::summarise(Value = calc_ranknorm(Height, fractional_rank, Sample)) %>%
  dplyr::left_join(., lc_median_intensity, by = "Sample") %>%
  dplyr::mutate(HeightNorm = Height / Value) %>%
  dplyr::group_by(imf, type) %>%
  dplyr::summarise(mean = mean(HeightNorm),
                   sd = sd(HeightNorm),
                   rsd = sd / mean,
                   which = "org_under90")
sum_data90 = lc_median_intensity %>%
  dplyr::group_by(Sample) %>%
  dplyr::arrange(ratiofit_rank) %>%
  dplyr::summarise(Value = calc_ranknorm(Height, ratiofit_rank, Sample)) %>%
  dplyr::left_join(., lc_median_intensity, by = "Sample") %>%
  dplyr::mutate(HeightNorm = Height / Value) %>%
  dplyr::group_by(imf, type) %>%
  dplyr::summarise(mean = mean(HeightNorm),
                   sd = sd(HeightNorm),
                   rsd = sd / mean,
                   which = "fit_under90")

more_rsd = rbind(nrank_rsd, mediannorm_rsd, raw_rsd, norm_close70_rsd, norm_close50_rsd,
                sum_data90, sum_org90)
more_rsd$which_type = paste0(more_rsd$type, "_", more_rsd$which)
knitr::kable(dplyr::group_by(more_rsd, which_type) %>%
  dplyr::summarise(mode = mode_value(rsd),
                   height = mode_peak(rsd),
                   fraction = mode_fraction(rsd)), digits = 2)
ggplot(dplyr::filter(more_rsd, type %in% "non-cancer"), aes(x = rsd, y = which_type)) +
  geom_density_ridges() + coord_cartesian(xlim = c(0, 2))

Now we are also going to try the analytical solution to correcting the ranks.

norm_lc90 = lc_median_intensity %>%
  dplyr::group_by(Sample) %>%
  dplyr::mutate(diff_90 = abs(lc_rank - 0.9)) %>%
  dplyr::arrange(diff_90) %>%
  dplyr::slice(1) %>%
  dplyr::mutate(Value = Height)
anal_data90 = dplyr::left_join(norm_lc90[, c("Sample", "Value")], lc_median_intensity, by = "Sample") %>%
  dplyr::mutate(HeightNorm = Height / Value) %>%
  dplyr::group_by(imf, type) %>%
  dplyr::summarise(mean = mean(HeightNorm),
                   sd = sd(HeightNorm),
                   rsd = sd / mean,
                   which = "anal90")
new_rsd = rbind(anal_data90, mediannorm_rsd, dplyr::filter(nrank_rsd, which %in% "n90"),
                sum_data90)
new_rsd$which_type = paste0(new_rsd$type, "_", new_rsd$which)

ggplot(dplyr::filter(new_rsd, type %in% "non-cancer"), aes(x = rsd, y = which_type)) +
  geom_density_ridges() + coord_cartesian(xlim = c(0, 2))

knitr::kable(dplyr::group_by(new_rsd, which_type) %>%
  dplyr::summarise(mode = mode_value(rsd),
                   height = mode_peak(rsd),
                   fraction = mode_fraction(rsd)), digits = 2)

Conclusions

Well, we can slightly improve things over just using the median value, and that can be done with the analytical solution, which is nice because we don't need the crazy fit using all of the data. However, it's not much of an improvement, so it seems like an awful lot of work for a little bit of gain. Finally, the idea of using the sum of peaks under the 90th percentile rank actually made the RSD worse, and given that it didn't matter which ranks we used, that implies it is not actually a good idea.


