# Select dataset for QC
biocrates <- datasets$discarded

Compound variability {.tabset}

Overview

The plot below shows the standard deviation of each individual metabolite against their average concentration, separately for QC and biological samples (i.e. technical vs biological variability).

A linear regression analysis is used to attempt to separate the technical and biological variability. The outputs of the models are used to score and assess this separation, and can be used as indicator for data quality.

if (ENOUGH_REFERENCE_QC){
  compound_var_stats <- compound_variability_scoring(biocrates)
  cat("Please see the details and the plot below for more information.")
} else {
  message("Error: Not enough QC replicates for comparative analysis.")
  compound_var_stats = list(
    r_squared = NA,
    qc_slope = NA,
    delta_slope = NA,
    delta_slope_pvalue = NA,
    delta_intercept = NA,
    delta_intercept_pvalue = NA,
    # wilcox_pvalue = NA,
    k = NA
  )
}
sd_mean_comp <- plot_compound_sd_vs_mean(
  data = biocrates,
  target = ENV$CONCENTRATION,
  sample_types = c(SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC),
  return_with_data = TRUE)

sd_mean_comp$scatterplot

Details

An integrative linear model is used to assess differences in technical and biological variability: log(SD) ~ Group + log(Mean)/Group with Group being a factor indicating QC or biological samples.

Model results are combined to identify four important main cases of differences in technical and biological variability:

1) k >= 7: A clear biological variability on top of the technical one. 2) k = 2:6: Inconclusive differences in biological and technical variability. 3) k = 0:1: A strong technical variability which hides the biological one. 4) k < 0: Something clearly wrong (when variability in QC decreases with increasing amount of metabolite concentration).

with final score k being computed to allow the classification of the four main cases as follows:

k = 0
k = k + (r_squared > 0.8)
k = k + 2*(delta_intercept > 0 & delta_intercept_pvalue < 0.05)
k = k + 4*(delta_slope > 0)
k = k - 8*(qc_slope < 0)

The results are as follows:

# Compare SDs vs Means over different study variable groups
sd_plots_fun <- function(var, data, info, parent){
  # Plot compound SDs vs Means of ENV$CONCENTRATION
  plot_sd <- plot_compound_sd_vs_mean(data = data, shape = BATCH,
                                   target = ENV$CONCENTRATION,
                                   sample_types = c(SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC),
                                   study_class = var)

  plot_rsd <- plot_compound_rsd_vs_mean(data = data, shape = BATCH,
                                    target = ENV$CONCENTRATION,
                                    sample_types = c(SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC),
                                    study_class = var)

  return(list(
    Header = paste0("\n#### ", info, var, "\n"),
    Plot_sd = plot_sd,
    Plot_rsd = plot_rsd))
}

if (length(STUDY_VARIABLES) > 0) {
  cat("\n### Per study variable {.tabset}\n")
  cat("\nFor visual examination, SD as well as %RSDs are plotted against the mean per compound not
only separated by sample type but further separated by groups in study variables of the biological
samples.\n")
  var_plots <- recursive_execution(vars = STUDY_VARIABLES,
                                   end_fun = sd_plots_fun,
                                   data = biocrates,
                                   keep_sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC))
  for (var_plot in var_plots) {
    cat(var_plot$Header)
    print(var_plot$Plot_sd)
    print(var_plot$Plot_rsd)
  }
}
# Remove "biocrates" dataset to ensure following sections select the dataset they need
rm(biocrates)


bihealth/metaquac documentation built on Aug. 7, 2021, 8:40 a.m.