In bihealth/metaquac: Metabolomics Targeted Quality Control

# Keep only columns (variable) of interest from the biocrates dataframe
selected_columns <- names(df) %in% c(
  "Compound", "Sample.Name", "Sample.Identification", "Sample.Type",
  "Analyte Intensity [cps]", "Internal Std. Intensity [cps]",
  LOWCON_CONDITIONS
)

df.filtered <- df[selected_columns]

# Renaming column for consistent naming accross datasets 
df.filtered <- df.filtered %>%
  rename(
    Analyte.Intensity = `Analyte Intensity [cps]`,
    Internal.Std.Intensity = `Internal Std. Intensity [cps]`,
  )

# Extract PBS (zero samples) first from the dataframe
df.zero_samples <- df.filtered[ df.filtered$Sample.Type == "Zero Sample", ]
# Extract QC 2 samples
df.qc_samples <- df.filtered[ df.filtered$Sample.Type == "QC Level 2", ]
# Filter the dataframe to keep only biological samples
# As we don't have any annotation file, we get our sample list directly from the dataframe
df.filtered <- df.filtered[ df.filtered$Sample.Type %in% c("Sample", "Pooled QC"), ]

# Problem in annotation, need to remove some files
# df.filtered <- df.filtered[ which(!df.filtered$Sample.Identification %in% c('44','45','47','49','50')),]

Metabolites RSD in Biocrates QC samples

No filtering is done at this stage and this table is given as infromation only

# Extra step to check whether some metabolites should be removed because they have a large RSD in the QC samples

df.qc_samples.summary <- df.qc_samples %>% 
  mutate(Analyte.Peak.Intensity.Normalised = Analyte.Intensity/Internal.Std.Intensity) %>%
  group_by(Compound) %>%
  mutate(relative_standard_deviation = rsd(na.omit(Analyte.Peak.Intensity.Normalised)))

df.qc_samples.summary %>%
  distinct(Compound, .keep_all = TRUE)  %>%
  select(Compound,relative_standard_deviation) %>%
  easy_datatable()

df.zero_samples <- df.zero_samples %>%
  group_by(Compound) %>%
  mutate(mean_zero_peak_intensity = mean(Analyte.Intensity, na.rm = T), median_zero_peak_intensity = median(Analyte.Intensity, na.rm = T), highest_zero_peak_intensity = max(Analyte.Intensity, na.rm = TRUE))
# mutate(mean_peak_area = mean(Analyte.Peak.Area..area., na.rm = T), median_peak_area = median(Analyte.Peak.Area..area., na.rm = T), highest_peak_area = max(Analyte.Peak.Area..area., na.rm = TRUE), mean_concentration = mean(Concentration..??M., na.rm = T), median_concentration = median(Concentration..??M., na.rm = T), highest_concentration = max(Concentration..??M., na.rm = TRUE))

# Replace "NaN" values by NA
df.zero_samples[sapply(df.zero_samples, is.nan)] <- NA
# Replace "-Inf" values created by max() by NA
df.zero_samples[df.zero_samples=="-Inf"] <- NA

df.zero_sample_summary <- df.zero_samples %>%
  group_by(Compound) %>%
  select(Compound, mean_zero_peak_intensity,median_zero_peak_intensity) %>%
  distinct(Compound, .keep_all = T)

df.filtered <- left_join(df.filtered,df.zero_sample_summary[,c("Compound","mean_zero_peak_intensity", "median_zero_peak_intensity")], by="Compound")

Number of samples above zero samples 3*median peak intensity threshold

df.filtered <- df.filtered %>%
  # mutate(above_3concentration = ifelse(Concentration....M. >= mean_concentration*3, TRUE, FALSE), above_3peak_area = ifelse(Analyte.Peak.Area..area. >= mean_peak_area*3, TRUE, FALSE))
  mutate(above_3mean_peak_intensity = ifelse(Analyte.Intensity >= mean_zero_peak_intensity*3, TRUE, FALSE), above_3median_peak_intensity = ifelse(Analyte.Intensity >= median_zero_peak_intensity*3, TRUE, FALSE))

df.filtered %>%
  group_by(Compound) %>%
  summarise(Number_sample_above = sum(above_3median_peak_intensity == TRUE, na.rm = T), Number_sample_below = sum(above_3median_peak_intensity == FALSE, na.rm = T), Number_NA = sum(is.na(above_3median_peak_intensity))) %>% 
  easy_datatable()

Number of metabolites per condition kept for analysis (after intensity and zero samples filters)

# First we set to NA all values below the threshold to discard them
df.filtered$Analyte.Intensity[ which(df.filtered$above_3median_peak_intensity == FALSE)] = NA

df.filtered <- df.filtered %>%
  mutate(Above.LOD = ifelse(Analyte.Intensity > LOWCON_MINIMUM_INTENSITY, TRUE, FALSE))

df.filtered$Analyte.Intensity[ which(df.filtered$Above.LOD == FALSE)] = NA

df.filtered <- df.filtered %>%
  mutate(Analyte.Intensity.Normalised = Analyte.Intensity/Internal.Std.Intensity)

condition_summary <- df.filtered %>%
  group_by_at(vars(Compound, one_of(LOWCON_CONDITIONS))) %>%
  summarise(
    Number_values = sum(!is.na(Analyte.Intensity)),
    Number_NA = sum(is.na(Analyte.Intensity))
  ) %>%
  mutate(Keep = ifelse(Number_values >= 1, TRUE, FALSE))

condition_summary %>%
  group_by_at(vars(one_of(LOWCON_CONDITIONS))) %>%
  summarise(Number_of_metabolite = sum(Keep == TRUE)) %>%
  easy_datatable()

Total peak intensity per sample

df.tot_area_HOS <- df.filtered %>%
  group_by(Sample.Name) %>%
  # filter(Cell.line == "HOS") %>%
  mutate(
    total_intensity = sum(Analyte.Intensity, na.rm = T),
    total_intensity_normalised = sum(Analyte.Intensity.Normalised, na.rm = T)
  ) %>%
  distinct(Sample.Name, .keep_all = TRUE) %>%
  ungroup()

df.tot_area_HOS$condition <- df.tot_area_HOS %>%
  select(one_of(LOWCON_CONDITIONS)) %>%
  purrr::imap(.f = ~ paste(.y, .x, sep=" ")) %>%
  as_tibble %>%
  tidyr::unite(condition, LOWCON_CONDITIONS, sep=" | ") %>%
  pull(condition)

df.tot_area_HOS$condition <- as.factor(df.tot_area_HOS$condition)

ggplot(data=df.tot_area_HOS, aes(x=reorder(Sample.Identification, as.numeric(condition)), y=total_intensity, fill=condition)) +
  geom_bar(stat="identity")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Sample name", y = " Total peak intensity") +
  theme(legend.position = "bottom")

Total peak intensity per sample after normalisation to internal standards

ggplot(data=df.tot_area_HOS, aes(x=reorder(Sample.Identification, as.numeric(condition)), y=total_intensity_normalised, fill=condition)) +
  geom_bar(stat="identity")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Sample name", y = " Total peak intensity") +
  theme(legend.position = "bottom")

# knitr::knit_exit()

Outlier detection on normalised total peak intensity using a 1.5 standard deviation cutoff

Remove SD outliers (i.e. set values to NA): r LOWCON_SD_OUTLIER_REM (according to parameter lowcon_sd_outlier_removal)

# Calculate 2 standard deviation per group
df.tot_area_HOS <- df.tot_area_HOS %>%
  group_by(condition) %>%
  mutate(tot_intensity_2_sd_low = mean(total_intensity_normalised)-1.5*sd(total_intensity_normalised),tot_intensity_2_sd_high = mean(total_intensity_normalised)+1.5*sd(total_intensity_normalised)) 

df.tot_area_HOS <- df.tot_area_HOS %>%
  mutate(inside_range = ifelse(total_intensity_normalised>tot_intensity_2_sd_low & total_intensity_normalised<tot_intensity_2_sd_high, TRUE, FALSE))

ggplot(data=df.tot_area_HOS, aes(x=reorder(Sample.Identification, as.numeric(condition)), y=total_intensity_normalised, fill=condition)) +
  geom_bar(stat="identity", aes(colour=inside_range))+
  # scale_fill_manual(inside_range = c("FALSE" = "red", "TRUE" = "white"), legend = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Sample name", y = "Total peak intensity") +
  theme(legend.position = "bottom")

# Remove outlier samples from the data
if (LOWCON_SD_OUTLIER_REM){
  df.tot_area_HOS$total_intensity_normalised[ which(df.tot_area_HOS$inside_range == FALSE)] = NA
}

Mean peak intensity per condition (with standard deviation)

df.summary_HOS <- df.tot_area_HOS %>%
  group_by(condition) %>%
  mutate(mean_peak_intensity = mean(total_intensity_normalised, na.rm = T), standard_deviation = sd(total_intensity_normalised, na.rm = T)) %>%
  summarise(mean_peak_intensity = unique(mean_peak_intensity), standard_deviation = unique(standard_deviation))

ggplot(df.summary_HOS, aes(x=condition, y=mean_peak_intensity, fill=condition)) +
  geom_bar(position=position_dodge(), stat="identity") +
  geom_errorbar(aes(ymin=mean_peak_intensity-standard_deviation, ymax=mean_peak_intensity+standard_deviation),
                width=.2,                    # Width of the error bars
                position=position_dodge(.9)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Condition", y = "Mean peak intensity") +
  theme(legend.position = "bottom")

Summary RSD of total intensity area

summary_rsd_HOS <- df.tot_area_HOS %>%
  group_by(condition) %>%
  mutate(mean_peak_intensity = mean(total_intensity_normalised, na.rm = T), rsd = rsd(na.omit(total_intensity_normalised))) %>%
  summarise(mean_peak_intensity = unique(mean_peak_intensity), relative_standard_deviation = unique(rsd))

colnames(summary_rsd_HOS) <- c("Condition", "Mean peak intensity", "Relative Standard deviation (%)")
summary_rsd_HOS %>%
  easy_datatable()

Metabolite RSD per condition

A minimum number of 2 values is required for the RSD to be reported in the table.

# Bring condition names and inside range value to the main dataframe
df.filtered.condition.HOS <- left_join(df.filtered,df.tot_area_HOS[,c("Sample.Name","condition","inside_range")], by="Sample.Name")

# Remove outlier samples found to be outside 1.5 standard deviation
if (LOWCON_SD_OUTLIER_REM){
  df.filtered.condition.HOS$Analyte.Intensity.Normalised[ which(df.filtered.condition.HOS$inside_range == FALSE)] = NA
}

# Keeping data for showing table later
df.filtered.condition.HOS.table <- df.filtered.condition.HOS

df.filtered.condition.HOS <- df.filtered.condition.HOS %>%
  # filter(Cell.line == "HOS") %>%
  group_by(condition, Compound) %>%
  mutate(
    relative_standard_deviation = rsd(na.omit(Analyte.Intensity.Normalised)),
    metabolite_mean_peak_area = mean(Analyte.Intensity.Normalised, na.rm = T)
  ) %>%
  distinct(Compound, condition, .keep_all = TRUE)

df.metabolite_rsd_HOS <- df.filtered.condition.HOS %>%
  select(Compound, condition, relative_standard_deviation) %>%
  spread(Compound, relative_standard_deviation)

df.metabolite_rsd_HOS %>%
  easy_datatable()

Full normalized intensity dataset

# Table that shows samples with normalized intensity (long)
df.filtered %>%
  select(Compound, Sample.Identification, Sample.Type,
         all_of(LOWCON_CONDITIONS), Analyte.Intensity.Normalised) %>%
  easy_datatable(
    caption = "Long table of normalized intensities",
    export_csv = params$data_export_long,
    export_path =  paste0(params$data_export_prefix, "_lowcon_fia_long.csv")
  )

# Table that shows samples with normalized intensity (wide)
df.filtered %>%
  wide_conc_table_compounds_x_samples(value = "Analyte.Intensity.Normalised") %>%
  easy_datatable(
    caption = paste(
      "Wide table of normalized intensities",
      "(compounds x samples, with some metadata)"
    ),
    export_csv = params$data_export_wide,
    export_path =  paste0(params$data_export_prefix, "_lowcon_fia_wide.csv")
  )

cat("\n## SD-filtered normalized intensity dataset\n\n")

# Table that shows samples with normalized intensity (long)
df.filtered.condition.HOS.table %>%
  ungroup() %>%
  select(Compound, Sample.Identification, Sample.Type,
         all_of(LOWCON_CONDITIONS), Analyte.Intensity.Normalised) %>%
  easy_datatable(
    caption = "Long table of normalized intensities"
  )

# Table that shows samples with normalized intensity (wide)
df.filtered.condition.HOS.table %>%
  ungroup() %>%
  wide_conc_table_compounds_x_samples(value = "Analyte.Intensity.Normalised") %>%
  easy_datatable(caption = paste(
    "Wide table of normalized intensities",
    "(compounds x samples, with some metadata)"
  ))