# global knitting options for code rendering knitr::opts_chunk$set( collapse = TRUE, comment = "#>") # change to TRUE to enable global knitting options for automatic saving of all plots as .png and .pdf if (FALSE) { knitr::opts_chunk$set( dev = c("png", "pdf"), fig.keep = "all", dev.args = list(pdf = list(encoding = "WinAnsi", useDingbats = FALSE)), fig.path = file.path("fig_output", paste0(gsub("\\.[Rr]md", "", knitr::current_input()), "_")) ) }
This is an example of a data processing pipeline for compound-specific Gas Chromatography Isotope Ratio Mass Spectrometry (GC-IRMS) carbon isotope measurements. It can be downloaded as a template (or just to see the plain-text code) by following the Source
link above. Knitting for stand-alone data analysis works best to HTML
rather than the website rendering you see here. To make this formatting change simply delete line #6 in the template file (the line that says rmarkdown::html_vignette:
).
Note that all code chunks that contain a critical step towards the final data (i.e. do more than visualization or a data summary) are marked with (*)
in the header to make it easier to follow all key steps during interactive use.
This example was run using isoreader version r packageVersion("isoreader")
and isoprocessor version r packageVersion("isoprocessor")
. If you want to reproduce the example, please make sure that you have these or newer versions of both packages installed:
# restart your R session (this command only works in RStudio) .rs.restartR() # installs the development tools package if not yet installed if(!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") # installs the newest version of isoreader and isoprocessor devtools::install_github("isoverse/isoreader") devtools::install_github("isoverse/isoprocessor")
library(tidyverse) # general data wrangling and plotting library(isoreader) # reading the raw data files library(isoprocessor) # processing the data
This analysis was run using isoreader version r packageVersion("isoreader")
and isoprocessor version r packageVersion("isoprocessor")
.
For use as a data processing template, please follow the Source
link above, download the raw file and adapt as needed. Knitting for stand-alone data analysis works best to HTML
rather than the in-package default html_vignette
.
All code chunks that contain a critical step towards the final data (i.e. do more than visualization or a data summary) are marked with (*)
in the header to make it easier to follow all key steps during interactive use.
# set file path(s) to data files, folders or rds collections # can be multiple folders or mix of folders and files, using example data set here # (this is a reduced data set of mostly standards with just a few example samples) data_path <- iso_get_processor_example("gc_irms_example_carbon.cf.rds") # read files iso_files_raw <- # path to data files data_path %>% # read data files in parallel for fast read iso_read_continuous_flow(parallel = TRUE) %>% # filter out files with read errors (e.g. from aborted analysis) iso_filter_files_with_problems()
# process iso_files <- iso_files_raw %>% # set peak table from vendor data table iso_set_peak_table_automatically_from_vendor_data_table() %>% # convert units from mV to V for amplitudes and area iso_convert_peak_table_units(V = mV, Vs = mVs) %>% # rename key file info columns iso_rename_file_info(id1 = `Identifier 1`, id2 = `Identifier 2`) %>% # parse text info into numbers iso_parse_file_info(number = Analysis) %>% # process other file information that is specific to the naming conventions # of this particular sequence iso_mutate_file_info( # what is the type of each analysis? type = case_when( str_detect(id1, "[Zz]ero") ~ "on_off", str_detect(id1, "[Ll]inearity") ~ "lin", str_detect(id1, "A5") ~ "std", TRUE ~ "sample" ), # was there seed oxidation? seed_oxidation = ifelse(`Seed Oxidation` == "1", "yes", "no"), # what was the injection volume based on the AS method name? injection_volume = str_extract(`AS Method`, "AS PTV [0-9.]+") %>% parse_number() %>% iso_double_with_units("uL"), # what was the concentration? (assuming Preparation = concentration or volume) concentration = str_extract(Preparation, "[0-9.]+ ?ng( per |/)uL") %>% parse_number() %>% iso_double_with_units("ng/uL"), # or the volume? volume = str_extract(Preparation, "[0-9.]+ ?uL") %>% parse_number() %>% iso_double_with_units("uL"), # what folder are the data files in? (assuming folder = sequence) folder = basename(dirname(file_path)) ) %>% # add in additional sample metadata (could be any info) # note: this would typically be stored in / read from a csv or excel file iso_add_file_info( tibble::tribble( # column names ~id1, ~sample, ~fraction, # metadata (row-by-row) "OG268_CZO-SJER_F1", "SJER", "F1", "OG271_CZO-CTNA_F1", "CTNA", "F1", "OG281_CZO-Niwot_Tundra2_F1", "Niwot_Tundra2", "F1", "OG282_CZO-Niwot_Tundra1_F1", "Niwot_Tundra1", "F1" ), join_by = "id1" ) %>% # focus only on the relevant file info, discarding the rest iso_select_file_info( folder, Analysis, file_datetime, id1, id2, type, sample, seed_oxidation, injection_volume, concentration, volume )
# display file information iso_files %>% iso_get_file_info() %>% select(-file_id, -folder) %>% iso_make_units_explicit() %>% knitr::kable()
# plot the chromatograms iso_plot_continuous_flow_data( # select a few analyses (these #s must exist!) iso_filter_files(iso_files, Analysis %in% c(2046, 2106, 2107)), # select data and aesthetics data = c(44), color = id1, panel = Analysis, # zoom in on time interval time_interval = c(1000, 3200), # peak labels for all peaks > 2V peak_label = iso_format(rt, d13C, signif = 3), peak_label_options = list(size = 3), peak_label_filter = amp44 > 2000 ) + # customize resulting ggplot theme(legend.position = "bottom")
# find files with zero in the Identifier 1 field on_offs <- iso_files %>% iso_filter_files(type == "on_off") # visualize the on/offs iso_plot_continuous_flow_data(on_offs, data = 44, color = id2)
# calculate on/off summary on_off_summary <- on_offs %>% # retrieve peak table iso_get_peak_table( select = c(amp44, area44, d13C), include_file_info = c(file_datetime, id2) ) %>% # summarize information group_by(file_datetime, id2) %>% iso_summarize_data_table() # table on_off_summary %>% iso_make_units_explicit() %>% knitr::kable(digits = 3) # plot on_off_summary %>% # use generic data plot function iso_plot_data( x = file_datetime, y = `d13C sd`, size = `amp44 mean`, color = id2, points = TRUE, date_labels = "%b %d - %H:%M" ) + # customize resulting ggplot expand_limits(y = 0)
# find files with linearity in the Identifier 1 field lins <- iso_files %>% iso_filter_files(type == "lin") # visualize the linearity runs iso_plot_continuous_flow_data(lins, data = 44, color = format(file_datetime))
# calculate linearity summary linearity_summary <- lins %>% # retrieve data table iso_get_peak_table( select = c(amp44, area44, d13C), include_file_info = c(file_datetime) ) %>% # calculate linearity regression lines nest(data = -file_datetime) %>% mutate( fit = map(data, ~lm(d13C ~ amp44, data = iso_strip_units(.x))), coefs = map(fit, broom::tidy) ) # table linearity_summary %>% unnest(coefs) %>% select(file_datetime, term, estimate, std.error) %>% filter(term == "amp44") %>% mutate( term = "linearity [permil/V]", estimate = signif(estimate, 2), std.error = signif(std.error, 1) ) %>% knitr::kable(digits = 4) # plot linearity_summary %>% unnest(data) %>% # use generic data plot function iso_plot_data( x = amp44, y = d13C, color = format(file_datetime), points = TRUE, panel = file_datetime, geom_smooth(method = "lm") )
# this information is often maintained in a csv or Excel file instead # but generated here from scratch for demonstration purposes peak_maps <- tibble::tribble( # column names -> defining 2 peak maps, 'std' and 'sample' here # but could define an unlimited number of different maps ~compound, ~ref_nr, ~`rt:std`, ~`rt:sample`, # peak map data (row-by-row) "CO2", 1, 79, 79, "CO2", 2, 178, 178, "CO2", 3, 278, 278, "CO2", 4, 377, 377, "CO2", 5, 477, 477, "CO2", 6, 576, 576, "CO2", 7, 676, 676, "CO2", 8, 775, 775, "CO2", 9, 875, 875, "nC16", NA, 1156, 1156, "nC17", NA, 1280, 1280, "nC18", NA, 1409, 1409, "nC19", NA, 1540, 1540, "nC20", NA, 1670, 1670, "nC21", NA, 1797, 1797, "nC22", NA, 1921, 1921, "nC23", NA, 2040, 2040, "nC24", NA, 2156, 2156, "nC25", NA, 2267, 2267, "nC26", NA, 2375, 2375, "nC27", NA, 2479, 2479, "nC28", NA, 2580, 2580, "nC29", NA, 2674, 2678, "nC30", NA, 2770, 2773, "nC31", NA, NA, 2863, "CO2", 10, 3552, 3552, "CO2", 11, 3592, 3592 ) peak_maps %>% knitr::kable(digits = 0)
# identify peaks peak_table_w_ids <- iso_files %>% # focus on standards and samples only iso_filter_files(type %in% c("std", "sample")) %>% # filter out analyses that ## a) did not inject iso_filter_files(!Analysis %in% c(2031, 2040, 2069, 2098, 2116, 2138)) %>% ## b) had double or rogue peaks from mis-injection iso_filter_files(!Analysis %in% c(2044, 2132, 2134)) %>% # map peaks iso_map_peaks(peak_maps, map_id = type) %>% # peak table iso_get_peak_table(include_file_info = everything())
# use same plotting as earlier, except now with peak features added in iso_files %>% iso_filter_files(Analysis %in% c(2046, 2106, 2107)) %>% iso_plot_continuous_flow_data( # select data and aesthetics data = 44, color = id1, panel = Analysis, # zoom in on time interval time_interval = c(1000, 3200), # provide our peak table with ids peak_table = peak_table_w_ids, # define peak labels, this can be any valid expression peak_label = iso_format(id = peak_info, d13C), # specify which labels to show (removing the !is_identified or putting a # restriction by retention time can help a lot with busy chromatograms) peak_label_filter = is_identified | !is_identified | is_missing ) + # customize resulting ggplot theme(legend.position = "bottom")
# look at missing and unidentified peaks summary (not run during knitting) # -> check if any of the missing peaks are unexpected (should be there but aren't) # -> check if any unidentified peaks should be added to the peak maps # -> check if any ambiguous peaks need clearer peak map retention times # when in doubt, look at the chromatograms of the problematic files! # as needed: iterate on peak map update, then peak mapping inspection peak_table_w_ids %>% iso_get_problematic_peak_mappings() %>% iso_summarize_peak_mappings()
# examine variation in the reference peaks peak_table_w_ids %>% # focus on reference peaks only filter(!is.na(ref_nr)) %>% # mark ref peak used for raw delta values (assuming the same in all anlysis) mutate( ref_info = paste0(ref_nr, ifelse(is_ref == 1, "*", "")) %>% factor() %>% fct_inorder() ) %>% # visualize iso_plot_ref_peaks( # specify the ratios to visualize x = Analysis, ratio = `r45/44`, fill = ref_info, panel_scales = "fixed" ) + labs(x = "Analysis", fill = "Reference\npeak")
# focus on analyte peaks and calculate useful summary statistics peak_table_analytes <- peak_table_w_ids %>% # omit reference peaks for further processing (i.e. analyte peaks only) filter(is.na(ref_nr)) %>% # for each analysis, calculate a few useful summary statistics iso_mutate_peak_table( group_by = file_id, rel_area = area44 / sum(area44, na.rm = TRUE), # calculate mean area for all peaks mean_area = mean(area44, na.rm = TRUE), # calculate mean area and amplitude for identified peaks mean_area_identified = mean(area44[!is.na(compound)], na.rm = TRUE), mean_amp_identified = mean(amp44[!is.na(compound)], na.rm = TRUE) )
peak_table_analytes %>% # focus on identified peaks (comment out this line to see ALL peaks) filter(!is.na(compound)) %>% # visualize with convenience function iso_plot_data iso_plot_data( # choose x and y (multiple y possible) x = area44, y = c(amp44, d13C), # choose aesthetics color = compound, shape = type, label = compound, size = 3, # decide what geoms to include points = TRUE ) + # further customize ggplot with a log scale x axis scale_x_log10()
# optinally, use an interactive plot to explore your data # - make sure you install the plotly library --> install.packages("plotly") # - switch to eval=TRUE in the options of this chunk to include in knit # - this should work for all plots in this example processing file library(plotly) ggplotly(dynamicTicks = TRUE)
# evaluate yield # (only makes sense if both injection volume and concentration are available) peak_table_analytes %>% # focus on standards filter(type == "std") %>% # calculate injection amount mutate(injection = iso_double_with_units( as.numeric(injection_volume) * as.numeric(concentration), units = "ng")) %>% # visualize iso_plot_data( # x and y x = injection, y = c(mean_area_identified, mean_amp_identified), # aesthetics color = factor(injection_volume), shape = factor(concentration), size = file_datetime, points = TRUE ) + # modify plot with trendline and axis limits geom_smooth(method = "lm", mapping = aes(color = NULL, shape = NULL)) + expand_limits(y = 0, x = 0)
# this information is often maintained in a csv or Excel file instead # but generated here from scratch for demonstration purposes standards <- tibble::tribble( ~compound, ~true_d13C, "nC16", -26.15, "nC17", -31.88, "nC18", -32.70, "nC19", -31.99, "nC20", -33.97, "nC21", -28.83, "nC22", -33.77, "nC23", -33.37, "nC24", -32.13, "nC25", -28.46, "nC26", -32.94, "nC27", -30.49, "nC28", -33.20, "nC29", -29.10, "nC30", -29.84 ) %>% mutate( type = "std", true_d13C = iso_double_with_units(true_d13C, "permil") ) standards %>% knitr::kable(digits = 2)
peak_table_w_stds <- peak_table_analytes %>% iso_add_standards(stds = standards, match_by = c("type", "compound"))
Determine calibrations fits for all individual standard analyses.
stds_w_calibs <- peak_table_w_stds %>% # focus on standards filter(type == "std") %>% # remove unassigned peaks iso_remove_problematic_peak_mappings() %>% # prepare for calibration by defining the grouping column(s) iso_prepare_for_calibration(group_by = Analysis) %>% # run calibration iso_generate_calibration(model = lm(d13C ~ true_d13C)) %>% # unnest some useful file information for visualization iso_get_calibration_data( select = c(file_datetime, injection_volume, seed_oxidation, mean_area_identified)) # check for problematic calibrations stds_w_calibs %>% iso_get_problematic_calibrations() -> problematic.calibs # View(problematic.calibs) # move forward only with good calibrations stds_w_calibs <- stds_w_calibs %>% iso_remove_problematic_calibrations()
# look at coefficients and summary stds_w_calibs %>% # unnest calibration parameters iso_get_calibration_parameters( select_from_coefs = c(term, estimate, SE = std.error, signif), select_from_summary = c(fit_R2 = adj.r.squared, fit_RSD = sigma, residual_df = df.residual)) %>% arrange(term) %>% knitr::kable(digits = 4) # visualize the calibration coefficients stds_w_calibs %>% # plot calibration parameters iso_plot_calibration_parameters( # x-axis, could also be e.g. Analysis or file_datetime (also use date_breaks!) x = mean_area_identified, # aesthetics color = seed_oxidation, size = mean_area_identified, # highlight RSD threshold geom_hline(data = tibble(term = factor("RSD"), threshold = 0.5), mapping = aes(yintercept = threshold), linetype = 2) )
stds_w_calibs %>% # pull out all peak data to including residuals iso_get_calibration_data() %>% # focus on standard peaks filter(is_std_peak) %>% # calculate area deviation from mean iso_mutate_peak_table( group_by = Analysis, area_dev = iso_double_with_units((area44/mean(area44) - 1) * 100, "%") ) %>% # visualize iso_plot_data( # x and y x = compound, y = c(residual = resid, `area dev. from mean` = area_dev), # grouping and aesthetics group = paste(Analysis, calib), color = seed_oxidation, linetype = calib, # geoms lines = TRUE )
# define a global calibration across all standards global_calibs <- peak_table_w_stds %>% # remove (most) problematic peaks to speed up calibration calculations # note: could additionally remove unidentified peaks if they are of no interest iso_remove_problematic_peak_mappings(remove_unidentified = FALSE) %>% # prepare for calibration (no grouping to go across all analyses) iso_prepare_for_calibration() %>% # run different calibrations iso_generate_calibration( model = c( linear = lm(d13C ~ true_d13C), with_area = lm(d13C ~ true_d13C + area44), with_area_cross = lm(d13C ~ true_d13C * area44) ), # specify which peaks to include in the calibration, here: # - all std_peaks (this filter should always be included!) # - standard peaks between 5 and 100 Vs # - only analyses with seed oxidation (same as all the samples) use_in_calib = is_std_peak & area44 > iso_double_with_units(5, "Vs") & area44 < iso_double_with_units(100, "Vs") & seed_oxidation == "yes" ) %>% iso_remove_problematic_calibrations()
# look at coefficients and summary global_calibs %>% # unnest calibration parameters iso_get_calibration_parameters( select_from_coefs = c(term, estimate, SE = std.error, signif), select_from_summary = c(fit_R2 = adj.r.squared, fit_RSD = sigma, residual_df = df.residual)) %>% arrange(term) %>% knitr::kable(digits = 4) # visualize coefficients for the different global calibrations global_calibs %>% iso_plot_calibration_parameters()
global_calibs %>% iso_plot_residuals( x = compound, group = paste(Analysis, calib), points = FALSE, lines = TRUE, trendlines = FALSE, value_ranges = FALSE )
# note that depending on the number of data points, this may take a while # for faster calculations, chose calculate_error = FALSE global_calibs_applied <- global_calibs %>% # which calibration to use? can include multiple if desired to see the result # in this case, the area conscious calibrations are not necessary filter(calib == "linear") %>% # apply calibration indication what should be calcculated iso_apply_calibration(true_d13C, calculate_error = TRUE) # calibration ranges global_calibs_with_ranges <- global_calibs_applied %>% # evaluate calibration range for the measured area.Vs and predicted d13C iso_evaluate_calibration_range(area44, true_d13C_pred) # show calibration ranges global_calibs_with_ranges %>% iso_get_calibration_range() %>% iso_remove_list_columns() %>% knitr::kable(d = 2) # create calibrated peak table peak_table_calibrated <- global_calibs_with_ranges %>% iso_get_calibration_data()
# replicate earlier overview plot but now with the calibrated delta values # and with a higlight of the calibration ranges and which points are in range peak_table_calibrated %>% # focus on identified peaks (comment out this line to see ALL peaks) filter(!is.na(compound)) %>% # visualize with convenience function iso_plot_data iso_plot_data( # choose x and y (multiple y possible) x = area44, y = true_d13C_pred, # choose aesthetics color = in_range, shape = type, label = compound, size = 3, # decide what geoms to include points = TRUE ) %>% # highlight calibration range iso_mark_calibration_range() + # further customize ggplot with a log scale x axis scale_x_log10() + # legend theme(legend.position = "bottom", legend.direction = "vertical")
# visualize how standard line up after calibration peak_table_calibrated %>% # focus on peaks in calibration filter(in_calib) %>% # visualize iso_plot_data( # x and y x = true_d13C, y = true_d13C_pred, y_error = true_d13C_pred_se, # aesthetics color = compound, size = area44, # geoms points = TRUE, # add 1:1 trendline geom_abline(slope = 1, intercept = 0) )
# Warning: data outside the calibration range MUST be taken with a # grain of salt, especially at the low signal area/amplitude end peak_table_calibrated %>% # focus on samples filter(type == "sample") %>% # focus on identified peaks (comment out this line to see ALL peaks) # if there's a lot of data, might need to hone in on a few at a time filter(!is.na(compound)) %>% # visualize iso_plot_data( # x and y x = sample, y = true_d13C_pred, y_error = true_d13C_pred_se, # aesthetics color = in_range, size = rel_area, # paneling panel = compound, panel_scales = "fixed", # geoms points = TRUE ) %>% # mark calibration range (include optionally) iso_mark_calibration_range() + # color palette (here example of manual: www.google.com/search?q=color+picker) scale_color_manual( values = c("#984EA3", "#E41A1C", "#E41A1C", "#377EB8", "#4DAF4A") ) + # clarify size scale scale_size_continuous(breaks = c(0.01, 0.05, 0.1, 0.2, 0.3), labels = function(x) paste0(100*x, "%")) + # plot labels labs(x = NULL)
# visualize relative abundances peak_table_calibrated %>% # focus on samples filter(type == "sample") %>% # focus on identified peaks (comment out this line to see ALL peaks) #filter(!is.na(compound)) %>% # visualize iso_plot_data( # x and y x = sample, y = rel_area, # aesthetics fill = compound, # barchart geom_bar(stat = "identity") ) + # scales scale_y_continuous(labels = function(x) paste0(100*x, "%"), expand = c(0, 0)) + # plot labels labs(x = NULL, y = "Relative abundance (by area)")
# generate data summary peak_data <- peak_table_calibrated %>% # focus on identified peaks in the samples filter(type == "sample", !is.na(compound)) %>% select(sample, compound, Analysis, area44, true_d13C_pred, true_d13C_pred_se, in_range) %>% arrange(sample, compound, Analysis) # summarize replicates # this example data set does not contain any replicates but typically all # analyses should in which case a statistical data summary can be useful peak_data_summary <- peak_data %>% # here: only peaks within the area calibration range are included # you have to explicitly decide which peaks you trust if they are out of range filter( in_range %in% c("in range", "<'true_d13C_pred' range", ">'true_d13C_pred' range") ) %>% # summarize for each sample and compound group_by(sample, compound) %>% iso_summarize_data_table(area44, true_d13C_pred) peak_data_summary %>% iso_make_units_explicit() %>% knitr::kable(d = 2)
Final data processing and visualization usually depends on the type of data and the metadata available for contextualization (e.g. core depth, source organism, age, etc.). The relevant metadata can be added easily with iso_add_file_info()
during the initial data load / file info procesing. Alternatively, just call iso_add_file_info()
again at this later point or use dplyr's left_join
directly.
# @user: add final data processing and plot(s)
# export the global calibration with all its information and data to Excel peak_table_calibrated %>% iso_export_calibration_to_excel( filepath = format(Sys.Date(), "%Y%m%d_gc_irms_example_carbon_export.xlsx"), # include data summary as an additional useful tab `data summary` = peak_data_summary )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.