# Preprocessing
# Author: Mathias Kuhring
# Discarding unreliable measurements
# Based on Biocratis measurement status
#
# Normal MetIDQ data export seems to contain
# for LC:
# * 0 for measurements below LOD (MetIDQ_Status "< LOD")
# * NA for invalid calibrated measurements (MetIDQ_Status "No Intercept")
# * Invalid for measurements exluded by user in MetIDQ or Xcalibur (MetIDQ_Status "Invalid")
# for FIA additionally:
# * Empty fields for measurements below LOD (MetIDQ_Status "< LOD")
# (difference between 0 and empty fields not known, yet)
#
# 0, NA as well as empty fields are concidered missing values and set to NA!
#
# In addition, only "Valid" measurements will be kept, i.e. all other statuses
# including "< LLOQ" and "> ULOQ" (out of quantification range / extrapolated
# calibration), "STD/QC < Limit" and "STD/QC > Limit" (insufficient accuracy in
# STDs and QCs), "ISTD Out of Range" (internal standard too high or low) and
# other statuses are considered as unreliable and are set to NA!
#
# Currently, there is no differentiation between stati and thus missing values.
discard_unreliable_measurements <- function(data,
keep = c("Valid"),
targets = ENV$DATA_TYPES,
sample_types = NULL){
# Remove names from target vector (otherwise dplyr::across fails)
names(targets) <- NULL
assertthat::assert_that(all(targets %in% names(data)))
# If not stated otherwise, discard from all samples types
if (is.null(sample_types)){
sample_types <- unique(data$Sample.Type)
}
assertthat::assert_that(all(sample_types %in% data$Sample.Type))
# Identify and discard (i.e. set NA) every measurement but the once to keep
data_out <- data %>%
dplyr::mutate(
Discard = Sample.Type %in% sample_types & !(MetIDQ_Status %in% keep),
dplyr::across(
.cols = dplyr::all_of(targets),
.fns = ~ifelse(Discard, NA, .)
)
) %>%
dplyr::select(-Discard)
return(data_out)
}
# Summary stats on missing values per compound
summarize_missing_values_per_compound <- function(
data, target = ENV$CONCENTRATION, plot = FALSE){
# Count missing values
mv_compounds <- data %>%
group_by(Compound, Class) %>%
summarize(`# Missing Values` = sum(is.na(UQ(sym(target)))),
`% Missing Values` = sum(is.na(UQ(sym(target)))) / n() * 100,
`# Total` = n()) %>%
arrange(desc(`% Missing Values`)) %>%
ungroup()
# Factorize compounds ordered by compound class and name
compound_levels <- mv_compounds %>%
arrange(Class, Compound) %>%
select(Compound) %>%
distinct() %>%
pull(Compound)
mv_compounds$Compound <- factor(mv_compounds$Compound,
levels = rev(compound_levels))
# Plot number of missing values
if (plot){
g <- ggplot(data = mv_compounds,
mapping = aes(x = Compound,
y = `# Missing Values`,
color = Class,
fill = Class)) +
geom_bar(stat = "identity", alpha = 0.75) +
coord_flip() +
theme(legend.position = "bottom")
print(g)
}
# Return table of missing values
return(mv_compounds)
}
# Summary stats on missing values per sample
summarize_missing_values_per_sample <- function(
data, target = ENV$CONCENTRATION, plot = FALSE){
# Check for available sample location variables
pos_cols <- c(
COLUMN_WELL_POSITION,
COLUMN_SEQUENCE_POSITION,
COLUMN_WELL_COORDINATES
)
best_pos_col <- which(pos_cols %in% names(data))[1]
# Count missing values
mv_samples <- data %>%
group_by(Sample.Name, Sample.Type)
if (!is.na(best_pos_col)) {
mv_samples <- mv_samples %>%
group_by_at(vars(one_of(pos_cols[best_pos_col])), .add = TRUE)
}
mv_samples <- mv_samples %>%
summarize(`# Missing Values` = sum(is.na(UQ(sym(target)))),
`% Missing Values` = sum(is.na(UQ(sym(target)))) / n() * 100,
`# Total` = n()) %>%
arrange(desc(`% Missing Values`)) %>%
ungroup()
# Factorize samples ordered by sample type and name
sample_levels <- mv_samples %>%
arrange(Sample.Type, Sample.Name) %>%
select(Sample.Name) %>%
distinct() %>%
pull(Sample.Name)
mv_samples$Sample.Name <- factor(mv_samples$Sample.Name,
levels = rev(sample_levels))
# Plot number of missing values
if (plot){
g <- ggplot(data = mv_samples,
mapping = aes(x = Sample.Name,
y = `# Missing Values`,
color = Sample.Type,
fill = Sample.Type)) +
geom_bar(stat = "identity", alpha = 0.75) +
coord_flip() +
theme(legend.position = "bottom")
print(g)
}
# Return table of missing values
return(mv_samples)
}
# Run all filter preprocessing steps
execute_preprocessing <- function(data, ppparams){
datasets <- list()
datasets$original <- data
# Special case: correct cal limits in Q500 urine data
datasets$original_urine <- datasets$original
if (ppparams$statuses_q500_urine) {
if (grepl(pattern = "µM|uM", x = ENV$CONCENTRATION)){
# Correct
datasets$original_urine <- datasets$original %>%
left_join(Q500_URINE_LIMITS, by = "Compound") %>%
mutate(MetIDQ_Status = if_else(
condition = MetIDQ_Status %in% c("< LLOQ", "> ULOQ") &
!!sym(ENV$CONCENTRATION) >= LLOQ_uM &
!!sym(ENV$CONCENTRATION) <= ULOQ_uM,
true = "Valid",
false = MetIDQ_Status
)) %>%
select(-LLOQ_uM, -ULOQ_uM)
} else {
message(paste(
"Error: Can't replace Q500 urine cal limits due to non-matching unit",
"(needs µM or uM):", ENV$CONCENTRATION
))
}
}
# 1. Discard unreliable measurements
datasets$discarded <- discard_unreliable_measurements(
data = datasets$original_urine,
keep = ppparams$statuses_to_keep,
targets = ENV$DATA_TYPES
)
# 2. Remove unreliable compounds
# a. Based on missing values ratio in QCs (samples of type ENV$SAMPLE_TYPE_REFERENCE_QC)
filter_res <- remove_compounds_na(
data = datasets$discarded,
target = ENV$CONCENTRATION,
sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC),
max_ratio = ppparams$threshold_2a)
datasets$filter_compounds_by_qc_mv_kept <- filter_res$data
datasets$filter_compounds_by_qc_mv_removed <- filter_res$removed
# b. Based on %RSD of QCs (default >15%)
filter_res <- remove_compounds_rsd(
data = datasets$filter_compounds_by_qc_mv_kept,
target = ENV$CONCENTRATION,
sample_types = c(ENV$SAMPLE_TYPE_REFERENCE_QC),
max_rsd = ppparams$threshold_2b)
datasets$filter_compounds_by_qc_rsd_kept <- filter_res$data
datasets$filter_compounds_by_qc_rsd_removed <- filter_res$removed
# c. Based on missing values ratio in Pooled QCs (samples of type SAMPLE_TYPE_POOLED_QC)
filter_res <- remove_compounds_na(
data = datasets$filter_compounds_by_qc_rsd_kept,
target = ENV$CONCENTRATION,
sample_types = c(SAMPLE_TYPE_POOLED_QC),
max_ratio = ppparams$threshold_2c)
datasets$filter_compounds_by_qc_pool_mv_kept <- filter_res$data
datasets$filter_compounds_by_qc_pool_mv_removed <- filter_res$removed
# d. Based on %RSD of Pooled QCs (default >15%)
filter_res <- remove_compounds_rsd(
data = datasets$filter_compounds_by_qc_pool_mv_kept,
target = ENV$CONCENTRATION,
sample_types = c(SAMPLE_TYPE_POOLED_QC),
max_rsd = ppparams$threshold_2d)
datasets$filter_compounds_by_qc_pool_rsd_kept <- filter_res$data
datasets$filter_compounds_by_qc_pool_rsd_removed <- filter_res$removed
# 3. Remove underrepresented compounds
# a. Based on missing value ratio over all (!) study samples (>20%)
filter_res <- remove_compounds_na(
data = datasets$filter_compounds_by_qc_pool_rsd_kept,
target = ENV$CONCENTRATION,
sample_types = c(SAMPLE_TYPE_BIOLOGICAL),
max_ratio = ppparams$threshold_3a)
datasets$filter_compounds_by_sample_all_mv_kept <- filter_res$data
datasets$filter_compounds_by_sample_all_mv_removed <- filter_res$removed
# b. Based on missing value ratio over all (!) study samples (>25%)
filter_res <- remove_compounds_na(
data = datasets$filter_compounds_by_sample_all_mv_kept,
target = ENV$CONCENTRATION,
sample_types = c(SAMPLE_TYPE_BIOLOGICAL),
max_ratio = ppparams$threshold_3b)
datasets$filter_compounds_by_sample_mv_kept <- filter_res$data
datasets$filter_compounds_by_sample_mv_removed <- filter_res$removed
# c. Based on %RSD of biological samples (default <15%)
filter_res <- remove_compounds_rsd_low(
data = datasets$filter_compounds_by_sample_mv_kept,
target = ENV$CONCENTRATION,
sample_type = SAMPLE_TYPE_BIOLOGICAL,
min_rsd = ppparams$threshold_3c)
datasets$filter_compounds_by_sample_rsd_kept <- filter_res$data
datasets$filter_compounds_by_sample_rsd_removed <- filter_res$removed
# 4. Remove underrepresented samples
# a. Based on missing value ratio over compounds
filter_res <- remove_samples_na(
data = datasets$filter_compounds_by_sample_rsd_kept,
target = ENV$CONCENTRATION,
max_ratio = ppparams$threshold_4)
datasets$filter_samples_by_compound_mv_kept <- filter_res$data
datasets$filter_samples_by_compound_mv_removed <- filter_res$removed
return(datasets)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.