# Keep only columns (variable) of interest from the biocrates dataframe selected_columns <- names(df) %in% c( "Compound", "Sample.Name", "Sample.Identification", "Sample.Type", "Analyte Peak Area [area]", ENV$CONCENTRATION, "Internal Std. Peak Area [area]", "Analyte Intensity [cps]", LOWCON_CONDITIONS ) df.filtered <- df[selected_columns] # Renaming column for consistent naming accross datasets df.filtered <- df.filtered %>% rename( Analyte.Peak.Area = `Analyte Peak Area [area]`, Concentration = !!sym(ENV$CONCENTRATION), Analyte.Intensity = `Analyte Intensity [cps]`, Internal.Std.Peak.Area = `Internal Std. Peak Area [area]` ) # Here we separate the samples in 3 dataframes: # zero_samples: Used for calculating the lower limit to remove background/contaminants from the biological samples # qc_samples: Used to check RSD in QC to see if we reject some metabolites that are too variable # filtered: This is the main dataframe where the data for biological samples is kept # Extract PBS (zero samples) first from the dataframe df.zero_samples <- df.filtered[ df.filtered$Sample.Type == "Zero Sample", ] # Extract QC 2 samples df.qc_samples <- df.filtered[ df.filtered$Sample.Type == "QC Level 2", ] # Filter the dataframe to keep only biological samples # Since we don't always have an annotation file, we get our sample list directly from the dataframe df.filtered <- df.filtered[ df.filtered$Sample.Type %in% c("Sample", "Pooled QC"), ]
No filtering is done at this stage and this table is given as infromation only
# QC SAMPLES SECTION # Extra step to check whether some metabolites should be removed because they have a large RSD in the QC samples # Normalise the metabolites in QC samples using internal standards and compute their RSD df.qc_samples.summary <- df.qc_samples %>% mutate(Analyte.Peak.Area.Normalised = Analyte.Peak.Area/Internal.Std.Peak.Area) %>% group_by(Compound) %>% mutate(relative_standard_deviation = rsd(na.omit(Analyte.Peak.Area.Normalised))) # Display the table of RSD values for every metabolites df.qc_samples.summary %>% distinct(Compound, .keep_all = TRUE) %>% select(Compound,relative_standard_deviation) %>% easy_datatable()
# ZERO SAMPLES SECTION # Compute the mean, median, highest peak area for every metabolites in the zero samples (we do the same for concentration) # Please note that the raw peak area are used and they are not normalised to the internal standard at this stage # Only the median peak area will actually be used later on, the rest is only calculated for information purposes df.zero_samples <- df.zero_samples %>% group_by(Compound) %>% mutate(mean_peak_area = mean(Analyte.Peak.Area, na.rm = T), median_peak_area = median(Analyte.Peak.Area, na.rm = T), highest_peak_area = max(Analyte.Peak.Area, na.rm = TRUE), mean_concentration = mean(Concentration, na.rm = T), median_concentration = median(Concentration, na.rm = T), highest_concentration = max(Concentration, na.rm = TRUE)) # Replace "NaN" values by NA df.zero_samples[sapply(df.zero_samples, is.nan)] <- NA # Replace "-Inf" values created by max() by NA df.zero_samples[df.zero_samples=="-Inf"] <- NA # Remove all the duplicates to the every compound appearing once with the required values (mean and median) df.zero_sample_summary <- df.zero_samples %>% group_by(Compound) %>% select(Compound, mean_concentration,mean_peak_area,median_peak_area) %>% distinct(Compound, .keep_all = T) # Add this information to the main dataframe so we can perform some filtering using the zero samples values df.filtered <- left_join(df.filtered,df.zero_sample_summary[,c("Compound","mean_concentration","mean_peak_area", "median_peak_area")], by="Compound")
# Create new columns in the main dataframe that contain whether a metabolite peak area (or concentration) is above 3 times the zero sample value # Please note that only the median peak area is used later on for filtering (The rest is ignored and calculated for tests only) df.filtered <- df.filtered %>% mutate(above_3concentration = ifelse(Concentration >= mean_concentration*3, TRUE, FALSE), above_3mean_peak_area = ifelse(Analyte.Peak.Area >= mean_peak_area*3, TRUE, FALSE), above_3median_peak_area = ifelse(Analyte.Peak.Area >= median_peak_area*3, TRUE, FALSE))
# Create a summary table showing how many samples have the metabolites above or below the 3 median zero sample threshold # Please note that if the zero sample value is NA, NA will be reported in the table df.filtered %>% group_by(Compound) %>% summarise(Number_sample_above = sum(above_3median_peak_area == TRUE, na.rm = T), Number_sample_below = sum(above_3median_peak_area == FALSE, na.rm = T), Number_NA = sum(is.na(above_3median_peak_area))) %>% easy_datatable()
# First we set to NA all values below the threshold to discard them df.filtered$Analyte.Peak.Area[ which(df.filtered$above_3median_peak_area == FALSE)] = NA # THIS THE 20000 intensity (not peak area) filter, this should be switch with LOD values that we will calculate for every metabolites # Since we don't have this information yet we use this threshold (Ela has generated the data to compute the values) df.filtered <- df.filtered %>% mutate(Above.LOD = ifelse(Analyte.Intensity > LOWCON_MINIMUM_INTENSITY, TRUE, FALSE)) # Set all metabolites under this threshold to NA to discard them from further analysis df.filtered$Analyte.Peak.Area[ which(df.filtered$Above.LOD == FALSE)] = NA # Normalise all remaning metabolites using their internal standard # Here we only perform a simple scaling (other approach may be considered but this works fine) df.filtered <- df.filtered %>% mutate(Analyte.Peak.Area.Normalised = Analyte.Peak.Area/Internal.Std.Peak.Area) # Compute the number of replicate left for each metabolite PER BIOLOGICAL CONDITION (IMPORTANT) condition_summary <- df.filtered %>% group_by_at(vars(Compound, one_of(LOWCON_CONDITIONS))) %>% summarise( Number_values = sum(!is.na(Analyte.Peak.Area)), Number_NA = sum(is.na(Analyte.Peak.Area)) ) %>% mutate(Keep = ifelse(Number_values >= 1, TRUE, FALSE)) # Create a table that shows how many metabolites are kept PER BIOLOGICAL CONDITION condition_summary %>% group_by_at(vars(one_of(LOWCON_CONDITIONS))) %>% summarise(Number_of_metabolite = sum(Keep == TRUE)) %>% easy_datatable()
# Compute the total peak area per sample using the only the metabolites that remain (that passed all the filtering steps) df.tot_area_HOS <- df.filtered %>% group_by(Sample.Name) %>% mutate( total_area = sum(Analyte.Peak.Area, na.rm = T), total_area_normalised = sum(Analyte.Peak.Area.Normalised, na.rm = T) ) %>% distinct(Sample.Name, .keep_all = TRUE) %>% ungroup() # We also create a column that contain the "biological condition" so we don't have to change the factors everytime to group the samples by condition df.tot_area_HOS$condition <- df.tot_area_HOS %>% select(one_of(LOWCON_CONDITIONS)) %>% purrr::imap(.f = ~ paste(.y, .x, sep=" ")) %>% as_tibble %>% tidyr::unite(condition, LOWCON_CONDITIONS, sep=" | ") %>% pull(condition) # Set the conditions as factor df.tot_area_HOS$condition <- factor(df.tot_area_HOS$condition) # Simply plot the total peak area per samples (BEFORE normalisation to internal standards) and group them by biological condition (color) ggplot(data=df.tot_area_HOS, aes(x=reorder(Sample.Identification, as.numeric(condition)), y=total_area, fill=condition)) + geom_bar(stat="identity")+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(x = "Sample name", y = " Total peak area") + theme(legend.position = "bottom")
# Simply plot the total peak area per samples (AFTER normalisation to internal standards) and group them by biological condition (color) ggplot(data=df.tot_area_HOS, aes(x=reorder(Sample.Identification, as.numeric(condition)), y=total_area_normalised, fill=condition)) + geom_bar(stat="identity")+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(x = "Sample name", y = " Total peak area") + theme(legend.position = "bottom")
Remove SD outliers (i.e. set values to NA): r LOWCON_SD_OUTLIER_REM
(according
to parameter lowcon_sd_outlier_removal)
# Calculate 1.5 standard deviation per BIOLOGICAL CONDITION (This can be changed to 2, but usually even 1.5 does not reject any sample) df.tot_area_HOS <- df.tot_area_HOS %>% group_by(condition) %>% mutate(tot_area_2_sd_low = mean(total_area_normalised)-1.5*sd(total_area_normalised),tot_area_2_sd_high = mean(total_area_normalised)+1.5*sd(total_area_normalised)) # Check if the samples are within the 1.5 std range df.tot_area_HOS <- df.tot_area_HOS %>% mutate(inside_range = ifelse(total_area_normalised>tot_area_2_sd_low & total_area_normalised<tot_area_2_sd_high, TRUE, FALSE)) # Plot the same as before with the addition of whether a sample is inside or outside the 1.5 std range ggplot(data=df.tot_area_HOS, aes(x=reorder(Sample.Identification, as.numeric(condition)), y=total_area_normalised, fill=condition)) + geom_bar(stat="identity", aes(colour=inside_range))+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(x = "Sample name", y = " Total peak area") + theme(legend.position = "bottom")
# Remove outlier samples from the data (We just reject every sample that are outside the range) if (LOWCON_SD_OUTLIER_REM){ df.tot_area_HOS$total_area_normalised[ which(df.tot_area_HOS$inside_range == FALSE)] = NA }
# Here we group the samples by biological condition and compute the standard deviation for the total peak area (normalised) df.summary_HOS <- df.tot_area_HOS %>% group_by(condition) %>% mutate(mean_peak_area = mean(total_area_normalised, na.rm = T), standard_deviation = sd(total_area_normalised, na.rm = T)) %>% summarise(mean_peak_area = unique(mean_peak_area), standard_deviation = unique(standard_deviation)) # And we plot it # Please note that samples that have been rejected are not considered in the SD calculation ggplot(df.summary_HOS, aes(x=condition, y=mean_peak_area, fill=condition)) + geom_bar(position=position_dodge(), stat="identity") + geom_errorbar(aes(ymin=mean_peak_area-standard_deviation, ymax=mean_peak_area+standard_deviation), width=.2, # Width of the error bars position=position_dodge(.9)) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(x = "condition", y = "Mean peak area") + theme(legend.position = "bottom")
# Compute an RSD of the total peak area per biological condition summary_rsd_HOS <- df.tot_area_HOS %>% group_by(condition) %>% mutate(mean_peak_area = mean(total_area_normalised, na.rm = T), rsd = rsd(na.omit(total_area_normalised))) %>% summarise(mean_peak_area = unique(mean_peak_area), relative_standard_deviation = unique(rsd)) # Show it in a table colnames(summary_rsd_HOS) <- c("condition", "Mean peak area", "Relative Standard deviation (%)") summary_rsd_HOS %>% easy_datatable()
A minimum number of 2 values is required for the RSD to be reported in the table.
# We go back on using a prior main dataframe so we need to bring some data were computed in between (condition and inside the 1.5 SD range) # Bring condition names and inside range value to the main dataframe df.filtered.condition.HOS <- left_join(df.filtered,df.tot_area_HOS[,c("Sample.Name","condition","inside_range")], by="Sample.Name") # Remove outlier samples found to be outside 1.5 standard deviation if (LOWCON_SD_OUTLIER_REM){ df.filtered.condition.HOS$Analyte.Peak.Area.Normalised[ which(df.filtered.condition.HOS$inside_range == FALSE)] = NA } # Keeping data for showing table later df.filtered.condition.HOS.table <- df.filtered.condition.HOS # Compute the RSD for every single compound per BIOLOGICAL CONDITION # A MINIMUM OF 2 REPLICATES IS OBVIOUSLY NECESSARY df.filtered.condition.HOS <- df.filtered.condition.HOS %>% group_by(condition, Compound) %>% mutate( relative_standard_deviation = rsd(na.omit(Analyte.Peak.Area.Normalised)), metabolite_mean_peak_area = mean(Analyte.Peak.Area.Normalised, na.rm = T) ) %>% distinct(Compound, condition, .keep_all = TRUE) # Restructure the dataframe so we can display the RSD per metabolite per condtion in a table df.metabolite_rsd_HOS <- df.filtered.condition.HOS %>% select(Compound, condition, relative_standard_deviation) %>% spread(Compound, relative_standard_deviation) df.metabolite_rsd_HOS %>% easy_datatable()
# Table that shows samples with normalized areas (long) df.filtered %>% select(Compound, Sample.Identification, Sample.Type, all_of(LOWCON_CONDITIONS), Analyte.Peak.Area.Normalised) %>% easy_datatable( caption = "Long table of normalized areas", export_csv = params$data_export_long, export_path = paste0(params$data_export_prefix, "_lowcon_lc_long.csv") ) # Table that shows samples with normalized areas (wide) df.filtered %>% wide_conc_table_compounds_x_samples(value = "Analyte.Peak.Area.Normalised") %>% easy_datatable( caption = paste( "Wide table of normalized areas", "(compounds x samples, with some metadata)" ), export_csv = params$data_export_wide, export_path = paste0(params$data_export_prefix, "_lowcon_lc_wide.csv") )
cat("\n## SD-filtered normalized area dataset\n\n") # Table that shows samples with normalized areas (long) df.filtered.condition.HOS.table %>% ungroup() %>% select(Compound, Sample.Identification, Sample.Type, all_of(LOWCON_CONDITIONS), Analyte.Peak.Area.Normalised) %>% easy_datatable(caption = "Long table of normalized areas") # Table that shows samples with normalized areas (wide) df.filtered.condition.HOS.table %>% ungroup() %>% wide_conc_table_compounds_x_samples(value = "Analyte.Peak.Area.Normalised") %>% easy_datatable(caption = paste( "Wide table of normalized areas", "(compounds x samples, with some metadata)" ))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.