# Preprocessing parameters
#
# Collects everything execute_preprocessing() is handed in this report: the
# statuses to keep, the Q500 urine limits constant and one threshold per
# filter step (2a-2d, 3a-3c, 4). Each threshold is read from the report
# parameter of the same purpose; threshold_3a is fixed at 1.
ppparams <- list(
  statuses_to_keep    = params[["preproc_keep_status"]],
  statuses_q500_urine = PREPROC_Q500_URINE_LIMITS,
  # 2. compound filters based on QC samples
  threshold_2a = params[["filter_compound_qc_ref_max_mv_ratio"]],
  threshold_2b = params[["filter_compound_qc_ref_max_rsd"]],
  threshold_2c = params[["filter_compound_qc_pool_max_mv_ratio"]],
  threshold_2d = params[["filter_compound_qc_pool_max_rsd"]],
  # 3. compound filters based on biological samples
  threshold_3a = 1,
  threshold_3b = params[["filter_compound_bs_max_mv_ratio"]],
  threshold_3c = params[["filter_compound_bs_min_rsd"]],
  # 4. sample filter
  threshold_4 = params[["filter_sample_max_mv_ratio"]]
)
The following preprocessing procedures are applied on the data in this order:
r ppparams$threshold_2a * 100
%)
b. Based on %RSD of Reference QCs (>=r ppparams$threshold_2b
%)
c. Based on missing values ratio in Pooled QCs (>=r ppparams$threshold_2c * 100
%)
d. Based on %RSD of Pooled QCs (>=r ppparams$threshold_2d
%)r ppparams$threshold_3a * 100
%)
b. Based on missing values ratio in biological samples (>=r ppparams$threshold_3b * 100
%)
c. Based on %RSD in biological samples (<r ppparams$threshold_3c
%)r ppparams$threshold_4 * 100
%)

The preprocessing statuses and filter thresholds can be modified with the following parameters of the `create_report` function (refer to the readme or function description for details):
preproc_keep_status
filter_compound_qc_ref_max_mv_ratio
filter_compound_qc_ref_max_rsd
filter_compound_qc_pool_max_mv_ratio
filter_compound_qc_pool_max_rsd
filter_compound_bs_max_mv_ratio
filter_compound_bs_min_rsd
filter_sample_max_mv_ratio
The result of each preprocessing or filter step can be found in the tab with the corresponding enumerator. Please note, as each tab contains a table with the complete dataset in the corresponding filtered form, loading the tab for the first time might take a couple of seconds.
# Tell readers how the preprocessed datasets are labelled in the sections
# that follow. The label list names every preprocessing tab produced in this
# section (1, 2a-2d, 3a-3c, 4); the earlier wording omitted 2c, 2d and 3c.
# message() concatenates its arguments without a separator.
message(
  "**Note**: The beginning of each following section will indicate on which ",
  "dataset calculations and visualizations are based. Thereby, the ",
  "preprocessed datasets will be referenced as **1**, **2a**, **2b**, ",
  "**2c**, **2d**, **3a**, **3b**, **3c** and **4**, resp."
)

# Execute preprocessing: `datasets` holds the original data plus the
# kept/removed result of each step, accessed by name in the tabs below.
datasets <- execute_preprocessing(data = biocrates, ppparams = ppparams)

# Remove "biocrates" dataset to ensure following sections select the
# preprocessed dataset they need
rm(biocrates)
The status preprocessing discards measurements based on quality statuses.
By default, only "Valid" and "Semi Quant." measurements are considered reliable and are kept for further analysis. All other statuses, including "< LLOQ" and "> ULOQ" (out of quantification range / extrapolated calibration), "STD/QC < Limit" and "STD/QC > Limit" (insufficient accuracy in STDs and QCs) and "ISTD Out of Range" (internal standard too high or low), are usually considered unreliable, and the corresponding measurements are converted to missing values (set to NA).
# Note printed to standard output: status names can differ for custom
# generic data.
cat(
  "Note: Status names may differ in custom generic data.\n",
  sep = ""
)

# Note emitted as a message: "< LLOQ" / "> ULOQ" statuses were updated
# according to the `preproc_q500_urine_limits` parameter.
message(
  "Note: Statuses \"< LLOQ\" and \"> ULOQ\" have been updated if data is within range of cal 0.25 and cal 9, resp. (i.e. set to \"Valid\", see parameter `preproc_q500_urine_limits`).\n",
  appendLF = TRUE
)
Based on the parameter `preproc_keep_status`, measurements with the following statuses will be retained:
r paste0(ppparams$statuses_to_keep, collapse = "**, **")
Missing Concentration values before status preprocessing:
r sum(is.na(datasets$original[[ENV$CONCENTRATION]]))
of r nrow(datasets$original)
(r round(sum(is.na(datasets$original[[ENV$CONCENTRATION]])) / nrow(datasets$original) * 100)
%)
Missing Concentration values after status preprocessing:
r sum(is.na(datasets$discarded[[ENV$CONCENTRATION]]))
of r nrow(datasets$discarded)
(r round(sum(is.na(datasets$discarded[[ENV$CONCENTRATION]])) / nrow(datasets$discarded) * 100)
%)
# Tab 1: tables for the data after status preprocessing
# (datasets$discarded).
# NOTE(review): the argument name `round_colums` is kept exactly as the
# original call spells it; presumably it matches easy_datatable()'s
# signature - confirm before renaming.

# Missing values summarised per sample.
datasets$discarded %>%
  summarize_missing_values_per_sample(plot = FALSE) %>%
  easy_datatable(
    caption = "Missing values per sample after status preprocessing",
    show_type = "statistics",
    round_colums = "% Missing Values"
  )

# Missing values summarised per compound.
datasets$discarded %>%
  summarize_missing_values_per_compound(plot = FALSE) %>%
  easy_datatable(
    caption = "Missing values per compound after status preprocessing",
    show_type = "statistics",
    round_colums = "% Missing Values"
  )

# Full long-format table; CSV export is controlled by
# params$data_export_long.
datasets$discarded %>%
  easy_datatable(
    caption = "Full data after status preprocessing",
    export_path = paste0(params$data_export_prefix, "_filter1_long.csv"),
    export_csv = params$data_export_long
  )

# Full wide-format table (compounds x samples); CSV export is controlled by
# params$data_export_wide.
easy_datatable(
  wide_conc_table_compounds_x_samples(datasets$discarded),
  caption = "Full data after status preprocessing (wide: compounds x samples, with some metadata)",
  export_path = paste0(params$data_export_prefix, "_filter1_wide.csv"),
  export_csv = params$data_export_wide
)
# Table of measurements whose status differs between the original data
# (datasets$original) and datasets$original_urine.
#
# Both datasets are reduced to sample name, compound and status, joined on
# sample name + compound, and only rows with differing statuses are shown.
# dplyr::filter() also drops rows where the comparison is NA, i.e. where
# either status is missing (including rows without a join partner).
#
# Fix: the caption previously read "changed according due Q500 ...".
datasets$original %>%
  select(all_of(COLUMN_SAMPLE_NAME), Compound, MetIDQ_Status) %>%
  rename(Status_before = MetIDQ_Status) %>%
  left_join(
    datasets$original_urine %>%
      select(all_of(COLUMN_SAMPLE_NAME), Compound, MetIDQ_Status) %>%
      rename(Status_after = MetIDQ_Status),
    by = c(COLUMN_SAMPLE_NAME, "Compound")
  ) %>%
  filter(Status_before != Status_after) %>%
  easy_datatable(
    show_type = "statistics",
    caption = "Status values changed due to Q500 urine cal limits"
  )
r ppparams$threshold_2a * 100
%)To alter the threshold use parameter filter_compound_qc_ref_max_mv_ratio
.
Number of compounds before: r length(unique(datasets$discarded$Compound))
Number of compounds left: r length(unique(datasets$filter_compounds_by_qc_mv_kept$Compound))
# 2. Remove unreliable compounds
#    a. Based on missing values ratio in QCs
#       (samples of type ENV$SAMPLE_TYPE_REFERENCE_QC)
#       Valid ratio of NAs is max 0.2 by default
#
# Captions show the threshold as a percentage (threshold_2a * 100).

# Compounds removed by this step.
datasets$filter_compounds_by_qc_mv_removed %>%
  easy_datatable(
    caption = paste0(
      "Removed compounds due to ", ppparams$threshold_2a * 100, "% QC MVs"
    ),
    show_type = "statistics"
  )

# Data kept after this step, long format.
datasets$filter_compounds_by_qc_mv_kept %>%
  easy_datatable(
    caption = paste0(
      "Full data kept after ", ppparams$threshold_2a * 100,
      "% QC MVs filtering"
    ),
    export_path = paste0(params$data_export_prefix, "_filter2a_long.csv"),
    export_csv = params$data_export_long
  )

# Data kept after this step, wide format (compounds x samples).
easy_datatable(
  wide_conc_table_compounds_x_samples(datasets$filter_compounds_by_qc_mv_kept),
  caption = paste0(
    "Full data kept after ", ppparams$threshold_2a * 100,
    "% QC MVs filtering",
    " (wide: compounds x samples, with some metadata)"
  ),
  export_path = paste0(params$data_export_prefix, "_filter2a_wide.csv"),
  export_csv = params$data_export_wide
)
r ppparams$threshold_2b
%)To alter the threshold use parameter filter_compound_qc_ref_max_rsd
.
Number of compounds before: r length(unique(datasets$filter_compounds_by_qc_mv_kept$Compound))
Number of compounds left: r length(unique(datasets$filter_compounds_by_qc_rsd_kept$Compound))
# Tab 2b: tables for the %RSD filter on QCs. Captions show threshold_2b as
# given (no percentage conversion).

# Compounds removed by this step.
datasets$filter_compounds_by_qc_rsd_removed %>%
  easy_datatable(
    caption = paste0(
      "Removed compounds due to ", ppparams$threshold_2b, " %RSD in QCs"
    ),
    show_type = "statistics"
  )

# Data kept after this step, long format.
datasets$filter_compounds_by_qc_rsd_kept %>%
  easy_datatable(
    caption = paste0(
      "Full data kept after ", ppparams$threshold_2b, "%RSD QC filtering"
    ),
    export_path = paste0(params$data_export_prefix, "_filter2b_long.csv"),
    export_csv = params$data_export_long
  )

# Data kept after this step, wide format (compounds x samples).
easy_datatable(
  wide_conc_table_compounds_x_samples(datasets$filter_compounds_by_qc_rsd_kept),
  caption = paste0(
    "Full data kept after ", ppparams$threshold_2b, "%RSD QC filtering",
    " (wide: compounds x samples, with some metadata)"
  ),
  export_path = paste0(params$data_export_prefix, "_filter2b_wide.csv"),
  export_csv = params$data_export_wide
)
r ppparams$threshold_2c * 100
%)To alter the threshold use parameter filter_compound_qc_pool_max_mv_ratio
.
Number of compounds before: r length(unique(datasets$filter_compounds_by_qc_rsd_kept$Compound))
Number of compounds left: r length(unique(datasets$filter_compounds_by_qc_pool_mv_kept$Compound))
# 2. Remove unreliable compounds
#    c. Based on missing values ratio in Pooled QCs
#
# Captions show the threshold as a percentage (threshold_2c * 100).

# Compounds removed by this step.
datasets$filter_compounds_by_qc_pool_mv_removed %>%
  easy_datatable(
    caption = paste0(
      "Removed compounds due to ", ppparams$threshold_2c * 100,
      "% Pooled QC MVs"
    ),
    show_type = "statistics"
  )

# Data kept after this step, long format.
datasets$filter_compounds_by_qc_pool_mv_kept %>%
  easy_datatable(
    caption = paste0(
      "Full data kept after ", ppparams$threshold_2c * 100,
      "% Pooled QC MVs filtering"
    ),
    export_path = paste0(params$data_export_prefix, "_filter2c_long.csv"),
    export_csv = params$data_export_long
  )

# Data kept after this step, wide format (compounds x samples).
easy_datatable(
  wide_conc_table_compounds_x_samples(
    datasets$filter_compounds_by_qc_pool_mv_kept
  ),
  caption = paste0(
    "Full data kept after ", ppparams$threshold_2c * 100,
    "% Pooled QC MVs filtering",
    " (wide: compounds x samples, with some metadata)"
  ),
  export_path = paste0(params$data_export_prefix, "_filter2c_wide.csv"),
  export_csv = params$data_export_wide
)
r ppparams$threshold_2d
%)To alter the threshold use parameter filter_compound_qc_pool_max_rsd
.
Number of compounds before: r length(unique(datasets$filter_compounds_by_qc_pool_mv_kept$Compound))
Number of compounds left: r length(unique(datasets$filter_compounds_by_qc_pool_rsd_kept$Compound))
# Tab 2d: tables for the %RSD filter on Pooled QCs. Captions show
# threshold_2d as given (no percentage conversion).

# Compounds removed by this step.
datasets$filter_compounds_by_qc_pool_rsd_removed %>%
  easy_datatable(
    caption = paste0(
      "Removed compounds due to ", ppparams$threshold_2d,
      " %RSD in Pooled QCs"
    ),
    show_type = "statistics"
  )

# Data kept after this step, long format.
datasets$filter_compounds_by_qc_pool_rsd_kept %>%
  easy_datatable(
    caption = paste0(
      "Full data kept after ", ppparams$threshold_2d,
      "%RSD Pooled QC filtering"
    ),
    export_path = paste0(params$data_export_prefix, "_filter2d_long.csv"),
    export_csv = params$data_export_long
  )

# Data kept after this step, wide format (compounds x samples).
easy_datatable(
  wide_conc_table_compounds_x_samples(
    datasets$filter_compounds_by_qc_pool_rsd_kept
  ),
  caption = paste0(
    "Full data kept after ", ppparams$threshold_2d,
    "%RSD Pooled QC filtering",
    " (wide: compounds x samples, with some metadata)"
  ),
  export_path = paste0(params$data_export_prefix, "_filter2d_wide.csv"),
  export_csv = params$data_export_wide
)
r ppparams$threshold_3a * 100
%)Number of compounds before: r length(unique(datasets$filter_compounds_by_qc_pool_rsd_kept$Compound))
Number of compounds left: r length(unique(datasets$filter_compounds_by_sample_all_mv_kept$Compound))
# Tab 3a: tables for the sample missing-value filter that uses the fixed
# threshold_3a. Captions show the threshold as a percentage
# (threshold_3a * 100).

# Compounds removed by this step.
datasets$filter_compounds_by_sample_all_mv_removed %>%
  easy_datatable(
    caption = paste0(
      "Removed compounds due to ", ppparams$threshold_3a * 100,
      "% Sample MVs"
    ),
    show_type = "statistics"
  )

# Data kept after this step, long format.
datasets$filter_compounds_by_sample_all_mv_kept %>%
  easy_datatable(
    caption = paste0(
      "Full data kept after ", ppparams$threshold_3a * 100,
      "% Sample MVs filtering"
    ),
    export_path = paste0(params$data_export_prefix, "_filter3a_long.csv"),
    export_csv = params$data_export_long
  )

# Data kept after this step, wide format (compounds x samples).
easy_datatable(
  wide_conc_table_compounds_x_samples(
    datasets$filter_compounds_by_sample_all_mv_kept
  ),
  caption = paste0(
    "Full data kept after ", ppparams$threshold_3a * 100,
    "% Sample MVs filtering",
    " (wide: compounds x samples, with some metadata)"
  ),
  export_path = paste0(params$data_export_prefix, "_filter3a_wide.csv"),
  export_csv = params$data_export_wide
)
r ppparams$threshold_3b * 100
%)To alter the threshold use parameter filter_compound_bs_max_mv_ratio
.
Number of compounds before: r length(unique(datasets$filter_compounds_by_sample_all_mv_kept$Compound))
Number of compounds left: r length(unique(datasets$filter_compounds_by_sample_mv_kept$Compound))
# Tab 3b: tables for the sample missing-value filter controlled by
# threshold_3b. Captions show the threshold as a percentage
# (threshold_3b * 100).

# Compounds removed by this step.
datasets$filter_compounds_by_sample_mv_removed %>%
  easy_datatable(
    caption = paste0(
      "Removed compounds due to ", ppparams$threshold_3b * 100,
      "% Sample MVs"
    ),
    show_type = "statistics"
  )

# Data kept after this step, long format.
datasets$filter_compounds_by_sample_mv_kept %>%
  easy_datatable(
    caption = paste0(
      "Full data kept after ", ppparams$threshold_3b * 100,
      "% Sample MVs filtering"
    ),
    export_path = paste0(params$data_export_prefix, "_filter3b_long.csv"),
    export_csv = params$data_export_long
  )

# Data kept after this step, wide format (compounds x samples).
easy_datatable(
  wide_conc_table_compounds_x_samples(
    datasets$filter_compounds_by_sample_mv_kept
  ),
  caption = paste0(
    "Full data kept after ", ppparams$threshold_3b * 100,
    "% Sample MVs filtering",
    " (wide: compounds x samples, with some metadata)"
  ),
  export_path = paste0(params$data_export_prefix, "_filter3b_wide.csv"),
  export_csv = params$data_export_wide
)
r ppparams$threshold_3c
%)To alter the threshold use parameter filter_compound_bs_min_rsd
.
Number of compounds before: r length(unique(datasets$filter_compounds_by_sample_mv_kept$Compound))
Number of compounds left: r length(unique(datasets$filter_compounds_by_sample_rsd_kept$Compound))
# Tab 3c: tables for the %RSD filter in biological samples. Captions show
# threshold_3c as given (no percentage conversion).

# Compounds removed by this step.
datasets$filter_compounds_by_sample_rsd_removed %>%
  easy_datatable(
    caption = paste0(
      "Removed compounds due to ", ppparams$threshold_3c,
      " %RSD in biological samples"
    ),
    show_type = "statistics"
  )

# Data kept after this step, long format.
datasets$filter_compounds_by_sample_rsd_kept %>%
  easy_datatable(
    caption = paste0(
      "Full data kept after ", ppparams$threshold_3c,
      " %RSD filtering in biological samples"
    ),
    export_path = paste0(params$data_export_prefix, "_filter3c_long.csv"),
    export_csv = params$data_export_long
  )

# Data kept after this step, wide format (compounds x samples).
easy_datatable(
  wide_conc_table_compounds_x_samples(
    datasets$filter_compounds_by_sample_rsd_kept
  ),
  caption = paste0(
    "Full data kept after ", ppparams$threshold_3c,
    " %RSD filtering in biological samples",
    " (wide: compounds x samples, with some metadata)"
  ),
  export_path = paste0(params$data_export_prefix, "_filter3c_wide.csv"),
  export_csv = params$data_export_wide
)
r ppparams$threshold_4 * 100
%)To alter the threshold use parameter filter_sample_max_mv_ratio
.
Number of all samples before: r length(unique(datasets$filter_compounds_by_sample_rsd_kept$Sample.Name))
Number of all samples left: r length(unique(datasets$filter_samples_by_compound_mv_kept$Sample.Name))
Number of biological samples before:
r datasets$filter_compounds_by_sample_rsd_kept %>% filter(Sample.Type == SAMPLE_TYPE_BIOLOGICAL) %>% distinct(Sample.Name) %>% nrow()
Number of biological samples left:
r datasets$filter_samples_by_compound_mv_kept %>% filter(Sample.Type == SAMPLE_TYPE_BIOLOGICAL) %>% distinct(Sample.Name) %>% nrow()
# Tab 4: tables for the sample filter based on compound missing values.
# Captions show the threshold as a percentage (threshold_4 * 100).

# Samples removed by this step.
datasets$filter_samples_by_compound_mv_removed %>%
  easy_datatable(
    caption = paste0(
      "Removed samples due to ", ppparams$threshold_4 * 100,
      "% Compound MVs"
    ),
    show_type = "statistics"
  )

# Data kept after this step, long format.
datasets$filter_samples_by_compound_mv_kept %>%
  easy_datatable(
    caption = paste0(
      "Full data kept after ", ppparams$threshold_4 * 100,
      "% Compound MVs filtering"
    ),
    export_path = paste0(params$data_export_prefix, "_filter4_long.csv"),
    export_csv = params$data_export_long
  )

# Data kept after this step, wide format (compounds x samples).
easy_datatable(
  wide_conc_table_compounds_x_samples(
    datasets$filter_samples_by_compound_mv_kept
  ),
  caption = paste0(
    "Full data kept after ", ppparams$threshold_4 * 100,
    "% Compound MVs filtering",
    " (wide: compounds x samples, with some metadata)"
  ),
  export_path = paste0(params$data_export_prefix, "_filter4_wide.csv"),
  export_csv = params$data_export_wide
)
Missing value counts, by sample and by compound, based on the unprocessed data (i.e. no values have been turned into missing values based on status and nothing has been filtered), as well as the full unprocessed dataset, are available in the tables below.
# Tables for the unprocessed data (datasets$original).

# Missing values summarised per sample.
datasets$original %>%
  summarize_missing_values_per_sample(plot = FALSE) %>%
  easy_datatable(
    show_type = "statistics",
    caption = "Missing values per sample before status preprocessing",
    round_colums = "% Missing Values"
  )

# Missing values summarised per compound.
datasets$original %>%
  summarize_missing_values_per_compound(plot = FALSE) %>%
  easy_datatable(
    show_type = "statistics",
    caption = "Missing values per compound before status preprocessing",
    round_colums = "% Missing Values"
  )

# Full long-format table; CSV export is controlled by
# params$data_export_long.
datasets$original %>%
  easy_datatable(
    caption = "Full data before preprocessing",
    export_path = paste0(params$data_export_prefix, "_original_long.csv"),
    export_csv = params$data_export_long
  )

# Full wide-format table (compounds x samples); CSV export is controlled by
# params$data_export_wide.
easy_datatable(
  wide_conc_table_compounds_x_samples(datasets$original),
  caption = paste0(
    "Full data before preprocessing",
    " (wide: compounds x samples, with some metadata)"
  ),
  export_path = paste0(params$data_export_prefix, "_original_wide.csv"),
  export_csv = params$data_export_wide
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.