The following multivariate analyses are based on the preprocessed dataset 4, unless explicitly stated otherwise.
# Select dataset biocrates <- datasets$filter_samples_by_compound_mv_kept # Get number of Sample, Pooled and Reference QCs num_samples <- biocrates %>% select(Sample.Name, Sample.Type) %>% unique() %>% filter(Sample.Type == SAMPLE_TYPE_BIOLOGICAL) %>% nrow() num_poolqc <- biocrates %>% select(Sample.Name, Sample.Type) %>% unique() %>% filter(Sample.Type == SAMPLE_TYPE_POOLED_QC) %>% nrow() num_refqc <- biocrates %>% select(Sample.Name, Sample.Type) %>% unique() %>% filter(Sample.Type == ENV$SAMPLE_TYPE_REFERENCE_QC) %>% nrow() num_cal <- biocrates %>% select(Sample.Name, Sample.Type) %>% unique() %>% filter(startsWith(Sample.Type, "Standard L")) %>% nrow() # Get number of Compounds num_compounds <- biocrates %>% select(Compound) %>% unique() %>% nrow()
Heatmaps of centered and scaled log10 target values (i.e. concentration, intensity or area) by compounds and samples. Sample labels may be colored by additional variables (e.g. Batch or Sample.Type). Compounds as well as samples are hierarchically clustered with Pearson correlation as distance metric or euclidean distance metric (aka L2 norm).
# Heatmap figure sizes # Calculate width for clustered heatmaps # Width of left layout col (key and dendogram) + right margin in heatmap fig_width_heatmap_left_col <- 7/5 fig_width_heatmap_border <- fig_width_heatmap_left_col + 0.2 # Figure width for different sample sets fig_width_heatmap_all <- max(7, 0.15 * (num_samples + num_poolqc + num_refqc) + fig_width_heatmap_border) fig_width_heatmap_qcrep <- max(7, 0.15 * (num_poolqc + num_refqc) + fig_width_heatmap_border) fig_width_heatmap_samples <- max(7, 0.15 * num_samples + fig_width_heatmap_border) # Calculate height for clustered heatmaps # fig_height_heatmap_compounds <- 0.2 * (num_compounds) + 3 fig_height_heatmap_compounds <- 7
if (!is.null(BATCH)){ cat("##### Batch-colored labels\n") # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC plot_sample_heatmap(data = biocrates, target = ENV$CONCENTRATION, sample_types = c(SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC), sample_color_by = BATCH, lwid = c( fig_width_heatmap_left_col/fig_width_heatmap_all, 1-fig_width_heatmap_left_col/fig_width_heatmap_all )) }
# Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC plot_sample_heatmap(data = biocrates, target = ENV$CONCENTRATION, sample_types = c(SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC), sample_color_by = "Sample.Type", lwid = c( fig_width_heatmap_left_col/fig_width_heatmap_all, 1-fig_width_heatmap_left_col/fig_width_heatmap_all ))
cat(paste0("\n#### QC samples {.tabset}\n")) if (!is.null(BATCH)){ cat("##### Batch-colored labels\n") # Heatmap for concentration in samples of type ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC plot_sample_heatmap(data = biocrates, target = ENV$CONCENTRATION, sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC), sample_color_by = BATCH, lwid = c( fig_width_heatmap_left_col/fig_width_heatmap_qcrep, 1-fig_width_heatmap_left_col/fig_width_heatmap_qcrep )) }
cat(paste0("\n##### Type-colored labels\n")) # Heatmap for concentration in samples of type ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC plot_sample_heatmap(data = biocrates, target = ENV$CONCENTRATION, sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC), sample_color_by = "Sample.Type", lwid = c( fig_width_heatmap_left_col/fig_width_heatmap_qcrep, 1-fig_width_heatmap_left_col/fig_width_heatmap_qcrep ))
if (!is.null(BATCH) || length(STUDY_VARIABLES) > 0){ cat("#### Biological samples {.tabset}\n") }
if (!is.null(BATCH)){ cat("##### Batch-colored labels\n") # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL plot_sample_heatmap(data = biocrates, target = ENV$CONCENTRATION, sample_types = c(SAMPLE_TYPE_BIOLOGICAL), sample_color_by = BATCH, lwid = c( fig_width_heatmap_left_col/fig_width_heatmap_samples, 1-fig_width_heatmap_left_col/fig_width_heatmap_samples )) }
# Compare Sample heatmaps over different study variable groups heat_plots_fun <- function(var, data, info, parent){ cat(paste0("\n##### ", info, var, "-colored labels\n")) # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL plot_sample_heatmap(data = data, target = ENV$CONCENTRATION, sample_types = c(SAMPLE_TYPE_BIOLOGICAL), sample_color_by = var, lwid = c( fig_width_heatmap_left_col/7, 1-fig_width_heatmap_left_col/7 )) } if (length(STUDY_VARIABLES) > 0){ tmp <- recursive_execution(vars = STUDY_VARIABLES, end_fun = heat_plots_fun, data = biocrates, keep_sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC)) }
if (!is.null(BATCH)){ cat("##### Batch-colored labels\n") # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC plot_sample_heatmap(data = biocrates, target = ENV$CONCENTRATION, sample_types = c(SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC), sample_color_by = BATCH, dist_fun = stats::dist) }
# Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC plot_sample_heatmap(data = biocrates, target = ENV$CONCENTRATION, sample_types = c(SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC), sample_color_by = "Sample.Type", dist_fun = stats::dist)
cat(paste0("\n#### QC samples {.tabset}\n")) if (!is.null(BATCH)){ cat("##### Batch-colored labels\n") # Heatmap for concentration in samples of type ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC plot_sample_heatmap(data = biocrates, target = ENV$CONCENTRATION, sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC), sample_color_by = BATCH, dist_fun = stats::dist) }
cat(paste0("\n##### Type-colored labels\n")) # Heatmap for concentration in samples of type ENV$SAMPLE_TYPE_REFERENCE_QC and SAMPLE_TYPE_POOLED_QC plot_sample_heatmap(data = biocrates, target = ENV$CONCENTRATION, sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC), sample_color_by = "Sample.Type", dist_fun = stats::dist)
if (!is.null(BATCH) || length(STUDY_VARIABLES) > 0){ cat("#### Biological samples {.tabset}\n") }
if (!is.null(BATCH)){ cat("##### Batch-colored labels\n") # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL plot_sample_heatmap(data = biocrates, target = ENV$CONCENTRATION, sample_types = c(SAMPLE_TYPE_BIOLOGICAL), sample_color_by = BATCH, dist_fun = stats::dist) }
# Compare Sample heatmaps over different study variable groups heat_plots_fun <- function(var, data, info, parent){ cat(paste0("\n##### ", info, var, "-colored labels\n")) # Heatmap for concentration in samples of type SAMPLE_TYPE_BIOLOGICAL plot_sample_heatmap(data = data, target = ENV$CONCENTRATION, sample_types = c(SAMPLE_TYPE_BIOLOGICAL), sample_color_by = var, dist_fun = stats::dist) } if (length(STUDY_VARIABLES) > 0){ tmp <- recursive_execution(vars = STUDY_VARIABLES, end_fun = heat_plots_fun, data = biocrates, keep_sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC)) }
Median imputation is applied for the remaining visualizations on biological, reference and pooled QC samples (separately), as the methods are not suited to handle missing values. Samples which still contain missing values after imputation are completely removed.
# Imputate missing values biocrates_imputed <- impute_median( data = biocrates, target = ENV$CONCENTRATION, sample_types = c(SAMPLE_TYPE_BIOLOGICAL, ENV$SAMPLE_TYPE_REFERENCE_QC, SAMPLE_TYPE_POOLED_QC)) # Removing samples which couldn't be imputed filter_res <- remove_samples_na( data = biocrates_imputed, target = ENV$CONCENTRATION, max_ratio = 0.0, max_mode = "inclusive") biocrates_imputed <- filter_res$data easy_datatable(filter_res$removed, show_type = "statistics", caption = "Removed samples with MVs after imputation") easy_datatable( biocrates_imputed, caption = "Remaining median imputed dataset 4", export_csv = params$data_export_long, export_path = paste0(params$data_export_prefix, "_imputed_long.csv") ) biocrates_imputed %>% wide_conc_table_compounds_x_samples() %>% easy_datatable( caption = paste0( "Remaining median imputed dataset 4", " (wide: compounds x samples, with some metadata)" ), export_csv = params$data_export_wide, export_path = paste0(params$data_export_prefix, "_imputed_wide.csv") )
Scatter plot matrices illustrate the correlation between samples based on compound concentration.
In the case of high sample numbers (> 9), several matrices will be generated based on sample groups of size up to 9 with one overlapping sample between each group (i.e. the last sample in a matrix is the first sample in the next matrix).
This analysis is based on the imputed dataset 4.
# Matrix for Reference QCs cat(paste0("\n### Reference QCs {.tabset}\n")) plot_sample_scatter_matrix(biocrates_imputed, sample_types = ENV$SAMPLE_TYPE_REFERENCE_QC, cat_header = "#### ")
# Matrix for Pooled QCs cat(paste0("\n### ", SAMPLE_TYPE_POOLED_QC, "s {.tabset}\n")) plot_sample_scatter_matrix(biocrates_imputed, sample_types = SAMPLE_TYPE_POOLED_QC, cat_header = "#### ")
# Matrix for all samples plot_sample_scatter_matrix(biocrates_imputed, cat_header = "#### ")
Principal component analysis (PCA) is performed to emphasize compound concentrations-based sample type, batch and potential study group relationships.
This analysis is based on the imputed dataset 4.
# All sample PCA over concentration plot_pca(data = biocrates_imputed, target = ENV$CONCENTRATION, colour = "Sample.Type", label = ENV$PLOT_SAMPLE_LABEL, shape = BATCH)
biocrates_imputed_relevant <- subset( biocrates_imputed, subset = Sample.Type %in% c(SAMPLE_TYPE_BIOLOGICAL, SAMPLE_TYPE_POOLED_QC, ENV$SAMPLE_TYPE_REFERENCE_QC)) # Sample, Pooled and Reference QCs PCA over concentration plot_pca(data = biocrates_imputed_relevant, target = ENV$CONCENTRATION, colour = "Sample.Type", label = ENV$PLOT_SAMPLE_LABEL, shape = BATCH)
biocrates_imputed_sample <- subset(biocrates_imputed, subset = Sample.Type == SAMPLE_TYPE_BIOLOGICAL) # Samples PCAs colored by different study variables pca_fun <- function(var, data, info, parent){ infos <- c() no_var <- data %>% group_by(Compound) %>% summarize(zero_variance = var(!!sym(ENV$CONCENTRATION), na.rm = TRUE)) %>% filter(zero_variance == 0) %>% pull(Compound) if(length(no_var) > 0){ infos <- c( infos, paste0("Removed compounds with zero variance: ", paste0(no_var, collapse = ", ")) ) data <- data %>% filter(!Compound %in% no_var) } return(list( Header = paste0("\n#### ", info, var, "\n"), Infos = infos, Plot = plot_pca(data = data, target = ENV$CONCENTRATION, colour = var, label = ENV$PLOT_SAMPLE_LABEL, shape = BATCH))) } if (length(STUDY_VARIABLES) > 0){ cat("\n### Biological samples (colored groups) {.tabset}\n") pcas <- recursive_execution(vars = STUDY_VARIABLES, end_fun = pca_fun, data = biocrates_imputed_sample) for (pca in pcas){ cat(pca$Header) for(info in pca$Infos) {cat(paste0(info, "\n"))} print(pca$Plot) } }
# Remove "biocrates" datasets to ensure following sections select the dataset they need rm(biocrates, biocrates_imputed, biocrates_imputed_relevant, biocrates_imputed_sample)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.