The following multivariate analyses are based on the preprocessed dataset 4, unless explicitly stated otherwise.

# Select dataset
biocrates <- datasets$filter_samples_by_compound_mv_kept

# Get number of Sample, Pooled and Reference QCs
num_samples <- biocrates %>%
  select(Sample.Name, Sample.Type) %>% unique() %>%
  filter(Sample.Type == SAMPLE_TYPE_BIOLOGICAL) %>% nrow()
num_poolqc <- biocrates %>%
  select(Sample.Name, Sample.Type) %>% unique() %>%
  filter(Sample.Type == SAMPLE_TYPE_POOLED_QC) %>% nrow()
num_refqc <- biocrates %>%
  select(Sample.Name, Sample.Type) %>% unique() %>%
  filter(Sample.Type == ENV$SAMPLE_TYPE_REFERENCE_QC) %>% nrow()
num_cal <- biocrates %>%
  select(Sample.Name, Sample.Type) %>% unique() %>%
  filter(startsWith(Sample.Type, "Standard L")) %>% nrow()

# Get number of Compounds
num_compounds <- biocrates %>%
  select(Compound) %>% unique() %>% nrow()

Clustered heatmaps {.tabset}

Heatmaps of centered and scaled log10 target values (i.e. concentration, intensity or area) by compounds and samples. Sample labels may be colored by additional variables (e.g. Batch or Sample.Type). Compounds as well as samples are hierarchically clustered with Pearson correlation as distance metric or euclidean distance metric (aka L2 norm).

# Heatmap figure sizes

# Calculate width for clustered heatmaps

# Width of left layout col (key and dendogram) + right margin in heatmap
fig_width_heatmap_left_col <- 7/5
fig_width_heatmap_border <- fig_width_heatmap_left_col + 0.2

# Figure width for different sample sets
fig_width_heatmap_all <-
  max(7, 0.15 * (num_samples + num_poolqc + num_refqc) + fig_width_heatmap_border)
fig_width_heatmap_qcrep <-
  max(7, 0.15 * (num_poolqc + num_refqc) + fig_width_heatmap_border)
fig_width_heatmap_samples <-
  max(7, 0.15 * num_samples + fig_width_heatmap_border)

# Calculate height for clustered heatmaps
# fig_height_heatmap_compounds <- 0.2 * (num_compounds) + 3
fig_height_heatmap_compounds <- 7

Pearson {.tabset}

All sample types {.tabset}

if (!is.null(BATCH)){
  cat("##### Batch-colored labels\n")
  # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC
  plot_sample_heatmap(data = biocrates,
                      target = ENV$CONCENTRATION,
                      sample_types = c(SAMPLE_TYPE_BIOLOGICAL,
                                       ENV$SAMPLE_TYPE_REFERENCE_QC,
                                       SAMPLE_TYPE_POOLED_QC),
                      sample_color_by = BATCH,
                      lwid = c(
                        fig_width_heatmap_left_col/fig_width_heatmap_all,
                        1-fig_width_heatmap_left_col/fig_width_heatmap_all
                      ))
}
Type-colored labels
# Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC
plot_sample_heatmap(data = biocrates,
                    target = ENV$CONCENTRATION,
                    sample_types = c(SAMPLE_TYPE_BIOLOGICAL,
                                     ENV$SAMPLE_TYPE_REFERENCE_QC,
                                     SAMPLE_TYPE_POOLED_QC),
                    sample_color_by = "Sample.Type",
                    lwid = c(
                      fig_width_heatmap_left_col/fig_width_heatmap_all,
                      1-fig_width_heatmap_left_col/fig_width_heatmap_all
                    ))
cat(paste0("\n#### QC samples {.tabset}\n"))
if (!is.null(BATCH)){
  cat("##### Batch-colored labels\n")
  # Heatmap for concentration in samples of type ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC
  plot_sample_heatmap(data = biocrates,
                      target = ENV$CONCENTRATION,
                      sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC,
                                       SAMPLE_TYPE_POOLED_QC),
                      sample_color_by = BATCH,
                      lwid = c(
                        fig_width_heatmap_left_col/fig_width_heatmap_qcrep,
                        1-fig_width_heatmap_left_col/fig_width_heatmap_qcrep
                      ))
}
cat(paste0("\n##### Type-colored labels\n"))
# Heatmap for concentration in samples of type ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC
plot_sample_heatmap(data = biocrates,
                    target = ENV$CONCENTRATION,
                    sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC,
                                     SAMPLE_TYPE_POOLED_QC),
                    sample_color_by = "Sample.Type",
                    lwid = c(
                      fig_width_heatmap_left_col/fig_width_heatmap_qcrep,
                      1-fig_width_heatmap_left_col/fig_width_heatmap_qcrep
                    ))
if (!is.null(BATCH) || length(STUDY_VARIABLES) > 0){
  cat("#### Biological samples {.tabset}\n")
}
if (!is.null(BATCH)){
  cat("##### Batch-colored labels\n")
  # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL
  plot_sample_heatmap(data = biocrates,
                      target = ENV$CONCENTRATION,
                      sample_types = c(SAMPLE_TYPE_BIOLOGICAL),
                      sample_color_by = BATCH,
                      lwid = c(
                        fig_width_heatmap_left_col/fig_width_heatmap_samples,
                        1-fig_width_heatmap_left_col/fig_width_heatmap_samples
                      ))
}
# Compare Sample heatmaps over different study variable groups
heat_plots_fun <- function(var, data, info, parent){
  cat(paste0("\n##### ", info, var, "-colored labels\n"))
  # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL
  plot_sample_heatmap(data = data,
                      target = ENV$CONCENTRATION,
                      sample_types = c(SAMPLE_TYPE_BIOLOGICAL),
                      sample_color_by = var,
                      lwid = c(
                        fig_width_heatmap_left_col/7,
                        1-fig_width_heatmap_left_col/7
                      ))
}

if (length(STUDY_VARIABLES) > 0){
  tmp <- recursive_execution(vars = STUDY_VARIABLES,
                             end_fun = heat_plots_fun,
                             data = biocrates,
                             keep_sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC))
}

Euclidean {.tabset}

All sample types {.tabset}

if (!is.null(BATCH)){
  cat("##### Batch-colored labels\n")
  # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC
  plot_sample_heatmap(data = biocrates,
                      target = ENV$CONCENTRATION,
                      sample_types = c(SAMPLE_TYPE_BIOLOGICAL,
                                       ENV$SAMPLE_TYPE_REFERENCE_QC,
                                       SAMPLE_TYPE_POOLED_QC),
                      sample_color_by = BATCH,
                      dist_fun = stats::dist)
}
Type-colored labels
# Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC
plot_sample_heatmap(data = biocrates,
                    target = ENV$CONCENTRATION,
                    sample_types = c(SAMPLE_TYPE_BIOLOGICAL,
                                     ENV$SAMPLE_TYPE_REFERENCE_QC,
                                     SAMPLE_TYPE_POOLED_QC),
                    sample_color_by = "Sample.Type",
                    dist_fun = stats::dist)
cat(paste0("\n#### QC samples {.tabset}\n"))
if (!is.null(BATCH)){
  cat("##### Batch-colored labels\n")
  # Heatmap for concentration in samples of type ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC
  plot_sample_heatmap(data = biocrates,
                      target = ENV$CONCENTRATION,
                      sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC,
                                       SAMPLE_TYPE_POOLED_QC),
                      sample_color_by = BATCH,
                      dist_fun = stats::dist)
}
cat(paste0("\n##### Type-colored labels\n"))
# Heatmap for concentration in samples of type ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC
plot_sample_heatmap(data = biocrates,
                    target = ENV$CONCENTRATION,
                    sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC,
                                     SAMPLE_TYPE_POOLED_QC),
                    sample_color_by = "Sample.Type",
                    dist_fun = stats::dist)
if (!is.null(BATCH) || length(STUDY_VARIABLES) > 0){
  cat("#### Biological samples {.tabset}\n")
}
if (!is.null(BATCH)){
  cat("##### Batch-colored labels\n")
  # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL
  plot_sample_heatmap(data = biocrates,
                      target = ENV$CONCENTRATION,
                      sample_types = c(SAMPLE_TYPE_BIOLOGICAL),
                      sample_color_by = BATCH,
                      dist_fun = stats::dist)
}
# Compare Sample heatmaps over different study variable groups
heat_plots_fun <- function(var, data, info, parent){
  cat(paste0("\n##### ", info, var, "-colored labels\n"))
  # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL
  plot_sample_heatmap(data = data,
                      target = ENV$CONCENTRATION,
                      sample_types = c(SAMPLE_TYPE_BIOLOGICAL),
                      sample_color_by = var,
                      dist_fun = stats::dist)
}

if (length(STUDY_VARIABLES) > 0){
  tmp <- recursive_execution(vars = STUDY_VARIABLES,
                             end_fun = heat_plots_fun,
                             data = biocrates,
                             keep_sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC))
}

Missing value imputation

Median imputation is applied for the remaining visualizations on biological, reference and pooled QC samples (separately), as the methods are not suited to handle missing values. Samples which still contain missing values after imputation are completely removed.

# Imputate missing values
biocrates_imputed <- impute_median(
  data = biocrates,
  target = ENV$CONCENTRATION,
  sample_types = c(SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC))

# Removing samples which couldn't be imputed
filter_res <- remove_samples_na(
  data = biocrates_imputed,
  target = ENV$CONCENTRATION,
  max_ratio = 0.0,
  max_mode = "inclusive")
biocrates_imputed <- filter_res$data
easy_datatable(filter_res$removed, show_type = "statistics",
               caption = "Removed samples with MVs after imputation")

easy_datatable(
  biocrates_imputed,
  caption = "Remaining median imputed dataset 4",
  export_csv = params$data_export_long,
  export_path =  paste0(params$data_export_prefix, "_imputed_long.csv")
)

biocrates_imputed %>%
  wide_conc_table_compounds_x_samples() %>%
  easy_datatable(
    caption = paste0(
      "Remaining median imputed dataset 4",
      " (wide: compounds x samples, with some metadata)"
    ),
    export_csv = params$data_export_wide,
    export_path =  paste0(params$data_export_prefix, "_imputed_wide.csv")
  )

Sample correlation {.tabset}

Scatter plot matrices illustrate the correlation between samples based on compound concentration.

In the case of high sample numbers (> 9), several matrices will be generated based on sample groups of size up to 9 with one overlapping sample between each group (i.e. the last sample in a matrix is the first sample in the next matrix).

This analysis is based on the imputed dataset 4.

# Matrix for Reference QCs
cat(paste0("\n### Reference QCs {.tabset}\n"))
plot_sample_scatter_matrix(biocrates_imputed,
                           sample_types = ENV$SAMPLE_TYPE_REFERENCE_QC,
                           cat_header = "#### ")
# Matrix for Pooled QCs
cat(paste0("\n### ", SAMPLE_TYPE_POOLED_QC, "s {.tabset}\n"))
plot_sample_scatter_matrix(biocrates_imputed,
                           sample_types = SAMPLE_TYPE_POOLED_QC,
                           cat_header = "#### ")

All samples {.tabset}

# Matrix for all samples
plot_sample_scatter_matrix(biocrates_imputed,
                           cat_header = "#### ")

PCA {.tabset}

Principal component analysis (PCA) is performed to emphasize compound concentrations-based sample type, batch and potential study group relationships.

This analysis is based on the imputed dataset 4.

All samples

# All sample PCA over concentration
plot_pca(data = biocrates_imputed, target = ENV$CONCENTRATION,
         colour = "Sample.Type", label = ENV$PLOT_SAMPLE_LABEL, shape = BATCH)

Biological vs QC samples

biocrates_imputed_relevant <- subset(
  biocrates_imputed, subset = Sample.Type %in%  c(SAMPLE_TYPE_BIOLOGICAL, SAMPLE_TYPE_POOLED_QC, ENV$SAMPLE_TYPE_REFERENCE_QC))

# Sample, Pooled and Reference QCs PCA over concentration
plot_pca(data = biocrates_imputed_relevant, target = ENV$CONCENTRATION,
         colour = "Sample.Type", label = ENV$PLOT_SAMPLE_LABEL, shape = BATCH)
biocrates_imputed_sample <- subset(biocrates_imputed,
                                   subset = Sample.Type ==  SAMPLE_TYPE_BIOLOGICAL)

# Samples PCAs colored by different study variables
pca_fun <- function(var, data, info, parent){
  infos <- c()
  no_var <- data %>%
    group_by(Compound) %>%
    summarize(zero_variance = var(!!sym(ENV$CONCENTRATION), na.rm = TRUE)) %>%
    filter(zero_variance == 0) %>%
    pull(Compound)
  if(length(no_var) > 0){
    infos <- c(
      infos,
      paste0("Removed compounds with zero variance: ",
             paste0(no_var, collapse = ", "))
    )
    data <- data %>%
      filter(!Compound %in% no_var)
  }
  return(list(
    Header = paste0("\n#### ", info, var, "\n"),
    Infos = infos,
    Plot = plot_pca(data = data, target = ENV$CONCENTRATION,
                    colour = var, label = ENV$PLOT_SAMPLE_LABEL, shape = BATCH)))
}

if (length(STUDY_VARIABLES) > 0){
  cat("\n### Biological samples (colored groups) {.tabset}\n")
  pcas <- recursive_execution(vars = STUDY_VARIABLES,
                              end_fun = pca_fun,
                              data = biocrates_imputed_sample)
  for (pca in pcas){
    cat(pca$Header)
    for(info in pca$Infos) {cat(paste0(info, "\n"))}
    print(pca$Plot)
  }
}
# Remove "biocrates" datasets to ensure following sections select the dataset they need
rm(biocrates, biocrates_imputed, biocrates_imputed_relevant, biocrates_imputed_sample)


bihealth/metaquac documentation built on Aug. 7, 2021, 8:40 a.m.