This manual should be read in its entirety before a user tries to use the package, but can also be used as a quick reference to individual functions or example use cases. It is not intended to be a comprehensive resource describing all possible flags or outputs for each function; rather, this vignette seeks to provide a brief overview of the functionality of the package and normal use cases. Please refer to each individual function's help page using the ?[function name] command in R after installation. For clarity, function names are bolded and function arguments or flags are italicized.
This package is designed to allow users to align 2D-GCMS metabolite peaks common to multiple samples from raw Chromatof files and identify metabolites based on a standard reference library. Metabolomics data processing can be roughly binned into three parts:
# Figure 1: schematic overview of the metabolomics data processing pipeline
# (three colored stages with example chromatogram peaks under each).
#Add colored rectangles
par(mai=c(0.1,0.1,0.1,0.1))
plot.new()
rect(0,0.6,1/3,0.867, col="indianred1")
rect(1/3,0.6,2/3,0.867, col="steelblue2")
rect(2/3,0.6,3/3,0.867, col="springgreen3")
#Label boxes
text(0.5,0.9, "Metabolomics Data Processing", cex=1.5)
text(1/6,0.85, "Raw Signal Filtering\nand Peak Calling\n(Chromatof)", pos=1,cex=0.8)
text(3/6,0.85, "Multi-sample Peak\nAlignment and\nMetabolite Identification\n(R2DGC)", cex=0.8, pos=1)
text(5/6,0.85, "Data Normalization\nand Statistical\nInference\n(Metaboanalyst)", pos=1,cex=0.8)
arrows(0.17,0.63,0.85,0.63,length = 0.1 ,lwd=3)
#Add box1 peaks (overlapping and resolved peak pairs drawn as line traces)
points(seq(0,0.2,(0.2-0)/8),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)*0.5)+0.45,type="l", lwd=3)
points(seq(0.1,0.3,(0.3-0.1)/8),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)*0.5)+0.45,type="l", lwd=3)
points(seq(0,0.15,(0.15-0)/8),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)*0.5)+0.22,type="l", lwd=3)
points(seq(0.16,0.3,(0.3-0.16)/8),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)*0.5)+0.22,type="l", lwd=3)
#Add box2 peaks (aligned peak pairs stacked across samples)
points(seq(0,0.13,(0.13-0)/8)+(0.36),(c(0.0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.26,type="l", lwd=3)
points(seq(0.17,0.3,(0.3-0.17)/8)+(0.34),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.26,type="l", lwd=3)
points(seq(0,0.13,(0.13-0)/8)+(0.36),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.32,type="l", lwd=3)
points(seq(0.17,0.3,(0.3-0.17)/8)+(0.34),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.32,type="l", lwd=3)
points(seq(0,0.13,(0.13-0)/8)+(0.36),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.38,type="l", lwd=3)
points(seq(0.17,0.3,(0.3-0.17)/8)+(0.34),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.38,type="l", lwd=3)
points(seq(0,0.13,(0.13-0)/8)+(0.36),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.44,type="l", lwd=3)
points(seq(0.17,0.3,(0.3-0.17)/8)+(0.34),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.44,type="l", lwd=3)
#Add box3 peaks (peaks split into two statistical groups)
points(seq(0,0.13,(0.13-0)/8)+(0.7),(c(0.0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.2,type="l", lwd=3)
points(seq(0.17,0.3,(0.3-0.17)/8)+(0.69),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.2,type="l", lwd=3)
points(seq(0,0.13,(0.13-0)/8)+(0.7),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.26,type="l", lwd=3)
points(seq(0.17,0.3,(0.3-0.17)/8)+(0.69),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.26,type="l", lwd=3)
points(seq(0,0.13,(0.13-0)/8)+(0.7),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.42,type="l", lwd=3)
points(seq(0.17,0.3,(0.3-0.17)/8)+(0.69),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.42,type="l", lwd=3)
points(seq(0,0.13,(0.13-0)/8)+(0.7),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.48,type="l", lwd=3)
points(seq(0.17,0.3,(0.3-0.17)/8)+(0.69),(c(0,0.08,0.12,0.15,0.16,0.15,0.13,0.09,0)/3)+0.48,type="l", lwd=3)
# Boxes and labels for the two groups in the rightmost stage
rect(xleft = 0.69,ybottom = 0.19,xright = 1,0.325, lwd = 3)
rect(xleft = 0.69,ybottom = 0.41,xright = 1,0.546, lwd = 3)
text(0.85, 0.57, "Group A")
text(0.85, 0.35, "Group B")
#Add dashed lines between peaks
arrows(1/3,0.2,1/3,0.6,code = 0,lty=2)
arrows(2/3,0.2,2/3,0.6,code = 0,lty=2)
arrows(1/6.5,0.28,1/6.5,0.45,code = 1, length=0.1,lwd=2)
text(0.5, 0, "Figure 1. Overview of metabolomics data processing",cex=0.8)
The basic workflow for metabolite alignment consists of two optional pre-processing functions and a function for performing the final alignment.
# Figure 2: basic R2DGC workflow diagram — two optional pre-processing
# steps feeding ConsensusAlign, with the optional standard library branch
# on the right and the three-element output list at the bottom.
par(mai=c(0.1,0.1,0,0.1))
plot.new()
text(0.5,1,"Input Sample\nChromatof File Paths", cex=0.8)
arrows(0.5,0.96,0.5,0.905,length=0.1, lwd=2)
rect(0.3,0.75,0.7,0.9, col="springgreen3")
text(0.5,0.83,"Optional: Ion Filtering\n(FindProblemIons)", cex=0.8)
arrows(0.5,0.745,0.5,0.705,length=0.1, lwd=2)
rect(0.3,0.55,0.7,0.7, col="steelblue2")
text(0.5,0.63,"Optional: Intra-Sample\nPeak Compression\n(PrecompressFiles)", cex=0.8)
arrows(0.5,0.545,0.5,0.505,length=0.1, lwd=2)
rect(0.3,0.3,0.7,0.5, col="indianred1")
text(0.5,0.4,"Multi-Sample Peak\nAlignment and\nMetabolite Identification\n(ConsensusAlign)", cex=0.8)
# Standard library branch
rect(0.8,0.5,1,0.65, col="plum3")
text(0.9,0.58,"Standard Library\nCreation\n(MakeReference)", cex=0.69)
text(0.9,0.77,"Input Metabolite\nStandard\nChromatof File Paths", cex=0.8)
arrows(0.9,0.71,0.9,0.655,length=0.1, lwd=2)
arrows(0.9,0.495,0.705,0.4,length=0.1, lwd=2)
arrows(0.5,0.295,0.5,0.255,length=0.1, lwd=2)
# Output list panel
rect(-0.02,0.03,1.02,0.25,col="snow2")
text(0.5,0.225,"Output List")
rect(0,0.05,0.3,0.2, col="wheat1")
text(0.15,0.13,"Peak Alignment Table", cex=0.8)
rect(0.35,0.05,0.65,0.2, col="khaki2")
text(0.5,0.13,"Peak Info Table", cex=0.8)
rect(0.7,0.05,1,0.2, col="lightgoldenrod")
text(0.85,0.13,"Incongruent Quant\nMass List", cex=0.8)
text(0.5, 0, "Figure 2. Basic Workflow. Functions shown in parenthesis",cex=0.8)
There are multiple ways to install this package. The first involves downloading the binary source file from my github page (https://github.com/rramaker/R2DGC) and installing the package manually:
# Install the package manually from a locally downloaded archive
install.packages("/PathToDownloadedFile/R2DGC_0.1.0.tgz")
The package can be directly installed from github with the devtools package
# Install the development version directly from GitHub
library(devtools)
install_github("rramaker/R2DGC")
Alternatively the package can be downloaded from the CRAN repository. ##Coming Soon!
# Install the released version from CRAN
install.packages("R2DGC")
Once installed the functions from R2DGC can be made available in your current working environment with the library() command:
# Attach the package so its functions are available in the current session
library(R2DGC)
Ensuring input files are formatted properly is critical for success with this package. To facilitate proper formatting, two example sample input files (SampleA and SampleB) have been provided with this package. The paths to these files can be found easily:
# Paths to the example input files bundled with the installed package
system.file("extdata", "SampleA.txt", package="R2DGC")
system.file("extdata", "SampleB.txt", package="R2DGC")
These are tab separated files with a header describing each column as is standard output from Chromatof software. I've constructed a table representing the first three lines of the SampleA file to demonstrate each of the 5 required columns in the input file:
As shown above the five required columns (the order shown is required) are the peak name, retention time (outputted by Chromatof as first retention time, second retention time), peak area, quant mass, and mass spectra (cutoff after the first two masses to save space). All of these columns can be specified as output in the data processing method used by the Chromatof software and do not need to be altered post-output. The fourth column is the only column that might change depending on the data processing method used. Chromatof allows for three different methods for quantifying peak areas. The simplest is the total ion chromatogram (TIC), which uses all ions to compute the peak area. This method has no unique quant mass, thus Chromatof outputs "T" for the quant mass as shown above. The second is the unique mass method, which uses an ion relatively unique to a peak to compute the area for a peak in an effort to improve resolution of overlapping peaks. Chromatof will output the unique mass used to compute peak area in the quant mass column if this method is selected as shown below:
The final method, apexing masses, is a compromise of the first two. This approach uses several ions that apex at similar retention times to compute the peak area. If this method is used, "Apexing Masses" should be output as the fifth column instead of "Quant Mass". Chromatof will output each apexing mass separated by a colon as shown below:
Sample input files for all R2DGC functions should look identical to the SampleA.txt or SampleB.txt example files provided with the exception of the quant method used as described above. Chromatof allows for several other output columns that are not required for R2DGC. It's recommended to not include these in the input files to reduce memory usage during alignment. The example files have also been reduced to just 13 amino acids and 9 FAME (Fatty Acid Methyl Ester) standards to simplify examples; however, peak files with several hundred peaks are acceptable. Notice a consistent naming scheme was used for each of the FAME standards. This is required if the user wishes to use these peaks for retention time indexing in downstream processing. Other than retention time index standards, the peak names are irrelevant and not used in downstream processing.
This is a Cooper lab specific function that allows a user to quickly annotate the FAME standards in their sample so they can be used for downstream retention time indexing. It only requires a vector of file paths that the user wishes to have the FAMEs annotated in be supplied to the inputFileList flag. I'll use sample files A and B for the example below even though they already have the FAME standards annotated correctly:
# Locate the example sample files and annotate their FAME standards in
# place. NOTE: the package name is "R2DGC" (as used everywhere else in
# this vignette); the original chunk referenced a stale "CooperR2DGC"
# name, which makes system.file() return "" and the example fail.
SampleA<-system.file("extdata", "SampleA.txt", package="R2DGC")
SampleB<-system.file("extdata", "SampleB.txt", package="R2DGC")
Find_FAME_Standards(inputFileList = c(SampleA,SampleB))
If the function can't find a match to a FAME standard it will write out "Potential Problem Match:" and the name of the FAME standard missing. If this happens it is strongly recommended that the user double check the annotation to make sure it actually found the correct peak for that FAME standard.
This is an optional step allowing the user to identify ions across all peak spectra to exclude in downstream processing steps. There are at least two potential reasons to exclude ions. The first is that several ions may be absent or present at such a low level in all peaks that removing them causes no difference in peak alignment quality, but improves alignment efficiency. By default Chromatof includes ion information for each mass ranging from 70 to 600, and we've found that at least half of these are present at such low levels that they can be excluded from downstream processing without a noticeable effect. The second reason to exclude an ion is because it may be so ubiquitous that it limits our ability to distinguish different peak spectra. We have found this is common for masses that are derived from the derivatization process used in sample preparation. The FindProblemIons function will identify ions that fall in both of these categories. Typically a representative input file is provided for this function and the output is used in downstream processing for all files. Here's an example using this function on one of the example files:
# Scan a representative sample for ions to exclude downstream;
# possibleIons is restricted to 70:78 here only to keep the example fast
ProblemIons<-FindProblemIons(inputFile=system.file("extdata", "SampleA.txt", package="R2DGC"), possibleIons = 70:78)
If we assume $A$ and $B$ are two peaks with spectra that can be represented as vectors $A = [a_{70},a_{71},..., a_{600}]$ and $B = [b_{70},b_{71},..., b_{600}]$, with each element representing an ion intensity, then the similarity of the two spectra, $S$, can be assessed as the dot product of the two vectors divided by the product of the length of each vector to normalize comparisons of all peaks such that:
Thus, ions that show a low "Sum50" should likely be excluded from downstream processing because our ability to resolve peaks within each sample or the degree to which each peak has a unique spectrum is inhibited by including these ions. This function plots a z-scored Sum50 score for each ion specified by possibleIons. We have found that ions with Sum50 scores greater than 2 standard deviations below the mean should be excluded from downstream processing. This threshold can be changed using the commonIonThreshold flag. Any ion with a Sum50 score below the specified threshold will be labeled in red on the plot.
This function also outputs a data frame containing information on each ion that should be excluded and whether it was classified as an "absent" or "common" ion:
# Preview the ions flagged for exclusion
head(ProblemIons)
The first column of this output is the ion and the second is the reason each ion was outputted. By default absent ions are those that represent less than 1% of the total ion intensity in each peak within the sample. This threshold can also be modified with the absentIonThreshold flag. This function defaults to using 1 core for processing, but I recommend increasing this by setting the numCores flag to as many cores as available. Increasing the number of cores used substantially decreases processing time. This goes for all functions in the R2DGC package except for the MakeReference function, which is less computationally intensive. The number of cores available can be determined using the parallel package's detectCores function:
# Count the cores available on this machine
library(parallel)
detectCores()
This is another optional function that allows users to input a list of sample peak files and combine peaks that were unnecessarily split by Chromatof processing and likely represent the same analyte. We've found that a small number of peaks in each sample get unnecessarily split due to a variety of reasons including over-saturation in large peaks. This causes problems in downstream alignments because peaks will semi-randomly be assigned to each of the split peaks causing gaps in the final alignment table. This can easily be avoided by scanning each sample file for peaks that should be combined prior to alignment. An example of this command is shown below with a sample file that has had its Alanine peak split:
#Read in file containing split peak
SampleC<-system.file("extdata", "SampleC.txt", package="R2DGC")
# Find intra-sample peak pairs that should be combined before alignment
CompressionInfo<-PrecompressFiles(inputFileList=SampleC)
# Truncate long character fields for compact display
str(CompressionInfo, nchar.max=10)
As shown above this function returns a 15 column data frame (5 original columns + 2 parsed retention times from each peak and the sample file path) listing all peak pairs that were combined for each sample.
This function uses peak retention times and the dot product of the peak spectra to assess peak similarity. If we assume $A$ and $B$ are two peaks with spectra that can be represented as vectors $A = [a_{70},a_{71},..., a_{600}]$ and $B = [b_{70},b_{71},..., b_{600}]$, with each element representing an ion intensity, then the similarity of the two spectra, $S$, can be assessed as the dot product of the two vectors divided by the product of the length of each vector to normalize comparisons of all peaks such that:
# Re-run compression ignoring the problem ions found earlier
# (column 1 of the FindProblemIons output holds the ion masses)
CompressionInfo<-PrecompressFiles(inputFileList=SampleC, commonIons = ProblemIons[,1])
If the outputFiles flag is set to TRUE, this function will perform a putative summation of peaks to be combined and write new sample files to the input file path with "_Processed.txt" appended to the end of the file path. It is almost always preferable to go back and combine peaks with the Chromatof software to ensure proper peak area integration; however, these putatively combined peaks can be used for preliminary analysis.
One important thing to note is that the quantMethod flag should be set to the quant method used in the Chromatof data processing used to generate the sample files. If TIC or apexing masses are used (quantMethod = "T" or "A") then peaks that are nominated for combining will simply be summed together. However, if the unique mass method is used (quantMethod="U") and peaks nominated have different unique masses, proportional conversion will be performed to convert the peak areas to what they would be if they had the same unique mass. In other words, if a peak area, $A$, is computed with a unique mass of 73 and a second peak area, $B$, is computed with a unique mass of 74, $B$ is converted to roughly what the area would have been if its unique mass had been 73 by multiplying it by the ratio of its ion intensities $b_{73}/b_{74}$. Thus, in this scenario with differing unique masses the final combined peak area, $C$, would be computed as follows:
We have found this proportional conversion to be serviceable when the unique mass ion intensities are relatively comparable; however, this relationship does break down the more disparate the intensities become. This should be a relatively rare occurrence because peaks with vastly differing unique masses are unlikely to meet the peak similarity threshold for compression or alignment.
This, the third and final optional function in the package, is used to generate metabolite standard libraries for reference in the ConsensusAlign function for identifying metabolites from which peaks are observed. A list of file paths to Chromatof files derived from metabolite standards are used as inputs and the output is a dataframe that can be used as input for the standardLibrary flag in the ConsensusAlign function. I've provided two example metabolite standard Chromatof files so users can be sure their formatting is correct. File paths to these example files can easily be found using the following command:
# Paths to the example metabolite standard files bundled with the package
system.file("extdata", "Alanine_150226_1.txt", package="R2DGC")
system.file("extdata", "Serine_022715_1.txt", package="R2DGC")
These are tab separated files generated by running standards in house with a series of FAME standards to use as retention indices. There is a header describing each column as is standard output from Chromatof software. Notice these files only require 3 columns and do not need the peak area or quant mass/apexing mass column used for sample files. I've constructed a table representing three lines of the first standard file to demonstrate each of the 3 required columns in the input file:
# Locate the example metabolite standard files
Standard1<-system.file("extdata", "Alanine_150226_1.txt", package="R2DGC")
Standard2<-system.file("extdata", "Serine_022715_1.txt", package="R2DGC")
# Build a standard library, indexing RT1 against FAME_8 ... FAME_24
StandardLibrary<-MakeReference(inputFileList = c(Standard1, Standard2), RT1_Standards=paste0("FAME_", seq(8,24,2)))
Notice, in addition to providing the file paths to the standards I provided a vector with the names of each of the FAME standards in the RT1_Standards flag. A separate set of standards can be provided for use indexing the second retention time with the RT2_Standards flag as well. It is important to note that retention time standards are only helpful if they span a large portion of possible retention times. I happen to know that the FAME standards are well spread across the first retention time, but all have relatively the same second retention time, so I've only used them to index the first retention time. Let's inspect the structure of the standard library output:
# Inspect the library structure; truncate long strings for display
str(StandardLibrary, nchar.max=20)
The output is a data frame with two rows representing the two metabolite standards we ran the function on. The first three columns are the original peak name, retention time, and spectra columns from the input files. The following columns represent the two retention times followed by the first retention time relative to each FAME standard to allow for retention time indexing while performing alignments with the ConsensusAlign function (see section 2.5.3 below for more details). Though recommended, retention time standards are not required for alignment or standard library construction. If retention time standards are not desired the user should just submit a metabolite standard Chromatof file formatted identically to the examples above except with only one row describing the metabolite standard of interest.
We have provided a pre-formatted standard library with ~300 peaks with retention indices calculated with 9 FAME standards (C:8, C:10, C:12, C:14, C:16, C:18, C:20, C:22, C:24). We've used the naming convention described above (Section 2.2), FAME_[number of carbons]. Any user can use this library regardless of whether they have FAME standards spiked into their samples if they perform a retention time standard free alignment (see section 2.6 below). Users who have spiked these FAME standards into their samples and named the FAME peaks according to the convention described can use the library directly. If only a subset of the FAME standards have been used, this library can still be used if the columns corresponding to the missing FAME peaks are deleted from the standard library data frame. The standard library can be accessed easily with the data() command:
# Load the pre-built standard library shipped with the package
data("StandardLibrary_030117")
str(StandardLibrary_030117, nchar.max=10)
This creates a new variable named "StandardLibrary_030117" in your R environment, which can be used as input for the standardLibrary flag in the ConsensusAlign function (section 2.6).
The R2DGC package uses a very simple approach to incorporating retention time standards for universal retention time indexing. The maximum amount of retention time the user's retention time standards cover is computed as a total RT length $L$. Next, the difference between a peak's retention time and each retention time standard is computed as a vector of differences, $D=[d_1,d_2,...,d_n]$, where $d$ represents the difference in retention time between a peak and a given retention time standard and $n$ is the total number of retention time standards used. Next each element in $D$ is divided by $L$ resulting in a retention index vector, $R$, with each retention time standard voting on the location of a peak relative to the total retention time covered by all retention standards:
# Figure 4: schematic showing how retention index values are computed.
plot.new()
# Standards (black) and analyte (red) placed along the retention time axis
points(c(.1,.35,.5,.8),rep(0.8,4), pch=16, col=c("black","red","black","black"))
text(c(.1,.35,.5,.8),rep(0.85,5), labels = c("Std1","Analyte","Std2","Std3"))
# Retention time axis with tick marks 0..10
arrows(x0 = 0, y0 = 0.7,x1 = 1,y1 = 0.7, code=0, lwd=3)
arrows(x0 = c(0,0.2,0.4,0.6,0.8,1), y0 = rep(0.7,6),x1 = c(0,0.2,0.4,0.6,0.8,1),y1 = rep(0.65,6), code=0, lwd=3)
text(c(0,0.2,0.4,0.6,0.8,1),rep(0.62,6), labels = c(seq(0,10,2)))
text(0.5,0.73, labels = "Retention Time",cex=0.75)
# Total standard span T
arrows(x0 = .1, y0 = 0.5,x1 = .8,y1 = 0.5, code=0, lwd=2)
text(0.45,0.52, labels = substitute(paste(italic("T"),"=7")))
arrows(x0 = c(.1,.8), y0 = rep(0.49,2),x1 = c(.1,.8),y1 = rep(0.51,2), code=0, lwd=2)
# Per-standard differences d1, d2, d3 between analyte and each standard
arrows(x0 = .1, y0 = 0.4,x1 = .35,y1 = 0.4, code=0, lwd=2)
text(.225,0.43, labels = expression(italic("d"[1])*"= -2.5"))
arrows(x0 = c(.1,.35), y0 = rep(0.39,2),x1 = c(.1,.35),y1 = rep(0.41,2), code=0, lwd=2)
arrows(x0 = .5, y0 = 0.3,x1 = .35,y1 = 0.3, code=0, lwd=2)
text(.425,0.33, labels = expression(italic("d"[2])*"= 1.5"))
arrows(x0 = c(.5,.35), y0 = rep(0.29,2),x1 = c(.5,.35),y1 = rep(0.31,2), code=0, lwd=2)
arrows(x0 = .8, y0 = 0.2,x1 = .35,y1 = 0.2, code=0, lwd=2)
text(.575,0.23, labels = expression(italic("d"[3])*"= 4.5"))
arrows(x0 = c(.8,.35), y0 = rep(0.19,2),x1 = c(.8,.35),y1 = rep(0.21,2), code=0, lwd=2)
text(.5,0, labels = "Figure 4. Computing retention index values", cex=0.75)
# Figure 5: three hypothetical columns used to test retention index values.
plot.new()
# Column 1: analyte (red) with three standards
points(c(.1,.35,.3,.5),rep(0.8,4), pch=16, col=c("black","red","black","black"))
text(c(.1,.35,.3,.5),c(0.85,.75,.85,0.85), labels = c("Std1","","Std2","Std3"), cex=0.8)
rect(xleft = 0.05,ybottom = 0.75,xright = 0.55,ytop = 0.88)
text(0.3,0.9,"Column1")
# Column 2: standards uniformly shifted
points(c(.2,.4,.6),rep(0.6,3), pch=16, col=c("black","black","black"))
text(c(.2,.4,.6),rep(0.65,5), labels = c("Std1","Std2","Std3"),cex=0.8)
rect(xleft = 0.15,ybottom = 0.55,xright = 0.65,ytop = 0.68)
text(0.4,0.7,"Column2")
# Column 3: standards with a non-linear shift
points(c(.1,.5,1),rep(0.4,3), pch=16, col=c("black","black","black"))
text(c(.1,.5,1),rep(0.45,5), labels = c("Std1","Std2","Std3"),cex=0.8)
rect(xleft = 0.05,ybottom = 0.35,xright = 1.03,ytop = 0.48)
text(0.55,0.5,"Column3")
# Shared retention time axis with tick marks 0..10
arrows(x0 = 0, y0 = 0.3,x1 = 1,y1 = 0.3, code=0, lwd=3)
arrows(x0 = c(0,0.2,0.4,0.6,0.8,1), y0 = rep(0.3,6),x1 = c(0,0.2,0.4,0.6,0.8,1),y1 = rep(0.25,6), code=0, lwd=3)
text(c(0,0.2,0.4,0.6,0.8,1),rep(0.22,6), labels = c(seq(0,10,2)))
text(0.5,0.32, labels = "Retention Time",cex=0.75)
text(.5,.05, labels = "Figure 5. Testing retention index values", cex=0.75)
An analyte (red dot) is run on column 1. Column 2 has a uniform retention time shift of one second from column 1 and column 3 has a non-linear retention time shift that worsens throughout the run. In this scenario we'll want to run a second analyte and compare its retention time to the first. Below I'll compute the retention index difference $W$ for all possible analyte 2 retention times to show $W$ behaves as expected on all three columns.
First lets assume analyte 2 was run on column 1 with retention times behaving identically to that observed for analyte 1:
# Compute the retention index for analyte 1 against three FAME standards,
# then the retention index difference (W) for every candidate analyte 2
# retention time on the same column. Variable names FAME_Standards,
# Total_Standard_Length, Analyte1_RT, Analyte1_RetentionIndex, and
# IndexDifferences are kept for later chunks that reuse them.
FAME_Standards <- c(1, 3, 5)
Total_Standard_Length <- max(FAME_Standards) - min(FAME_Standards)
Analyte1_RT <- 3.5
Analyte1_RetentionIndex <- (FAME_Standards - Analyte1_RT) / Total_Standard_Length
# Evaluate W across a grid of possible analyte 2 retention times
candidate_RTs <- seq(0, 10, 0.1)
IndexDifferences <- vapply(
  candidate_RTs,
  function(rt) {
    rt_index <- (FAME_Standards - rt) / Total_Standard_Length
    sum(abs(rt_index - Analyte1_RetentionIndex))
  },
  numeric(1)
)
names(IndexDifferences) <- as.character(candidate_RTs)
plot(candidate_RTs, IndexDifferences, pch = 16,
     xlab = "Analyte 2 Retention Time",
     ylab = "Retention Index Difference (W)")
# Retention index difference (W) for analyte 2 candidates on column 1
# (same computation as the previous chunk). Shared variable names are
# preserved so downstream chunks still find them.
FAME_Standards <- c(1, 3, 5)
Total_Standard_Length <- max(FAME_Standards) - min(FAME_Standards)
Analyte1_RT <- 3.5
Analyte1_RetentionIndex <- (FAME_Standards - Analyte1_RT) / Total_Standard_Length
# Preallocate a named result vector, then fill it by index
rt_grid <- seq(0, 10, 0.1)
IndexDifferences <- setNames(numeric(length(rt_grid)), as.character(rt_grid))
for (i in seq_along(rt_grid)) {
  rt_index <- (FAME_Standards - rt_grid[i]) / Total_Standard_Length
  IndexDifferences[i] <- sum(abs(rt_index - Analyte1_RetentionIndex))
}
plot(rt_grid, IndexDifferences, pch = 16,
     xlab = "Analyte 2 Retention Time",
     ylab = "Retention Index Difference (W)")
#Compute retention index for analyte 1
FAME_Standards1<-c(1,3,5)
Total_Standard_Length1<-(max(FAME_Standards1)-min(FAME_Standards1))
Analyte1_RT<-3.5
# BUG FIX: the original divided by Total_Standard_Length, a variable only
# defined in an earlier chunk; use this chunk's Total_Standard_Length1 so
# the chunk is self-contained and correct if the standard span changes.
Analyte1_RetentionIndex<- (FAME_Standards1-Analyte1_RT)/Total_Standard_Length1
#Compute retention index for analyte 2 (standards uniformly shifted +1)
FAME_Standards2<-c(2,4,6)
Total_Standard_Length2<-(max(FAME_Standards2)-min(FAME_Standards2))
IndexDifferences<-c()
for(Analyte2_RT in seq(0,10,0.1)){
  Analyte2_RetentionIndex<- (FAME_Standards2-Analyte2_RT)/Total_Standard_Length2
  IndexDifferences[as.character(Analyte2_RT)]<-sum(abs(Analyte2_RetentionIndex-Analyte1_RetentionIndex))
}
plot(seq(0,10,0.1),IndexDifferences, pch=16, xlab="Analyte 2 Retention Time", ylab="Retention Index Difference (W)")
#Compute retention index for analyte 1
FAME_Standards1<-c(1,3,5)
Total_Standard_Length1<-(max(FAME_Standards1)-min(FAME_Standards1))
Analyte1_RT<-3.5
# BUG FIX: the original divided by Total_Standard_Length, a variable only
# defined in an earlier chunk; use this chunk's Total_Standard_Length1 so
# the chunk is self-contained and correct if the standard span changes.
Analyte1_RetentionIndex<- (FAME_Standards1-Analyte1_RT)/Total_Standard_Length1
#Compute retention index for analyte 2 (standards uniformly shifted +1)
FAME_Standards2<-c(2,4,6)
Total_Standard_Length2<-(max(FAME_Standards2)-min(FAME_Standards2))
IndexDifferences<-c()
for(Analyte2_RT in seq(0,10,0.1)){
  Analyte2_RetentionIndex<- (FAME_Standards2-Analyte2_RT)/Total_Standard_Length2
  IndexDifferences[as.character(Analyte2_RT)]<-sum(abs(Analyte2_RetentionIndex-Analyte1_RetentionIndex))
}
plot(seq(0,10,0.1),IndexDifferences, pch=16, xlab="Analyte 2 Retention Time", ylab="Retention Index Difference (W)")
#Compute retention index for analyte 1
FAME_Standards1<-c(1,3,5)
Total_Standard_Length1<-(max(FAME_Standards1)-min(FAME_Standards1))
Analyte1_RT<-3.5
# BUG FIX: the original divided by Total_Standard_Length, a variable only
# defined in an earlier chunk; use this chunk's Total_Standard_Length1 so
# the chunk is self-contained and correct if the standard span changes.
Analyte1_RetentionIndex<- (FAME_Standards1-Analyte1_RT)/Total_Standard_Length1
#Compute retention index for analyte 2 (standards with a non-linear shift)
FAME_Standards2<-c(1,5,10)
Total_Standard_Length2<-(max(FAME_Standards2)-min(FAME_Standards2))
IndexDifferences<-c()
for(Analyte2_RT in seq(0,10,0.1)){
  Analyte2_RetentionIndex<- (FAME_Standards2-Analyte2_RT)/Total_Standard_Length2
  IndexDifferences[as.character(Analyte2_RT)]<-sum(abs(Analyte2_RetentionIndex-Analyte1_RetentionIndex))
}
plot(seq(0,10,0.1),IndexDifferences, pch=16, xlab="Analyte 2 Retention Time", ylab="Retention Index Difference (W)")
# Same retention-index demonstration, but with a wider, unevenly related
# second standard set (c(1, 5, 10)) to show how standard placement changes
# the index-difference curve.
# Bug fix: the original divided by the undefined `Total_Standard_Length`
# (missing the trailing "1").

# Compute retention index for analyte 1
FAME_Standards1 <- c(1, 3, 5)
Total_Standard_Length1 <- max(FAME_Standards1) - min(FAME_Standards1)
Analyte1_RT <- 3.5
Analyte1_RetentionIndex <- (FAME_Standards1 - Analyte1_RT) / Total_Standard_Length1

# Compute retention index for analyte 2 over a grid of candidate retention
# times; record the total index difference (W) from analyte 1 at each one
FAME_Standards2 <- c(1, 5, 10)
Total_Standard_Length2 <- max(FAME_Standards2) - min(FAME_Standards2)
RT_Grid <- seq(0, 10, 0.1)
IndexDifferences <- vapply(RT_Grid, function(Analyte2_RT) {
  Analyte2_RetentionIndex <- (FAME_Standards2 - Analyte2_RT) / Total_Standard_Length2
  sum(abs(Analyte2_RetentionIndex - Analyte1_RetentionIndex))
}, numeric(1))
names(IndexDifferences) <- as.character(RT_Grid)

plot(RT_Grid, IndexDifferences, pch = 16,
     xlab = "Analyte 2 Retention Time",
     ylab = "Retention Index Difference (W)")
This is the final and only mandatory function included in the package. It takes a list of input sample Chromatof file paths formatted as described in section 2.2 above, as well as an optional metabolite standard library as described in section 2.5, and will align common peaks across the samples and, if a standard library is provided, identify the metabolite from which each aligned peak is likely derived. Figure 9 diagrams the basic processes involved in the ConsensusAlign function:
# Figure 9: flow diagram of the ConsensusAlign pipeline, drawn with base
# graphics primitives on the default [0,1] x [0,1] canvas from plot.new().
# The vertical chain of boxes traces the alignment steps (read files ->
# optional retention indices -> pairwise similarity scores -> optional
# auto-tuned threshold -> best-pair matching -> optional relaxed-threshold
# missing-peak search -> optional library identification); the left-hand
# loop marks the optional multi-seed repetition, and the bottom row shows
# the three outputs (alignment table, peak info table, incongruent quant
# mass list).
# NOTE(review): all coordinates are hand-tuned magic numbers for this one
# figure; keep them as-is.
par(mai=rep(0.1,4)) plot.new() rect(xleft = 0.25,ybottom = 0.9,xright = 0.75,ytop = 1, col="springgreen3") text(0.5,0.95,"Read in sample files", cex=0.75) arrows(x0 = 0.5,y0 = 0.9,x1 = 0.5,y1 = 0.875,length=0.05, lwd=2) rect(0.25,0.775,0.75,0.875, col="wheat1") text(0.5,0.825,"Optional: Compute retention indices", cex=0.75) arrows(x0 = 0.5,y0 = 0.775,x1 = 0.5,y1 = 0.75,length=0.05, lwd=2) rect(0.25,0.65,0.75,.75, col="indianred1") text(0.5,0.7,"Compute pairwise sample-seed\npeak similarity scores", cex=0.75) arrows(x0 = 0.5,y0 = 0.65,x1 = 0.5,y1 = 0.625,length=0.05, lwd=2) rect(0.25,0.525,0.75,.625, col="khaki2") text(0.5,0.575,"Optional: compute optimal\npeak similarity threshold", cex=0.75) arrows(x0 = 0.5,y0 = 0.525,x1 = 0.5,y1 = 0.5,length=0.05, lwd=2) rect(0.25,0.4,0.75,.5, col="steelblue2") text(0.5,0.45,"Find best peak pairs above\npeak similarity threshold", cex=0.75) arrows(x0 = 0.5,y0 = 0.4,x1 = 0.5,y1 = 0.375,length=0.05, lwd=2) rect(0.25,0.275,0.75,.375, col="plum3") text(0.5,0.325,"Optional: Relaxed threshold search\nfor high likelihood missing peaks", cex=0.75) arrows(x0 = 0.5,y0 = 0.275,x1 = 0.5,y1 = 0.25,length=0.05, lwd=2) rect(0.25,0.15,0.75,.25, col="lightgoldenrod") text(0.5,0.2,"Optional: Identify aligned\npeaks with reference library", cex=0.75) arrows(x0 = 0.5,y0 = 0.15,x1 = 0.5,y1 = 0.125,length=0.05, lwd=2) arrows(x0 = 0.1,y0 = 0.325,x1 = 0.1,y1 = 0.7, lwd=2, code=0) arrows(x0 = 0.25,y0 = 0.325,x1 = 0.1,y1 = 0.325, lwd=2, code=0) arrows(x0 = 0.25,y0 = 0.7,x1 = 0.1,y1 = 0.7, lwd=2, length=0.05, code=1) rect(0,0.45,0.2,0.575, col="coral") text(0.1,0.515, "Optional: repeat\nalignment with\nmultiple seeds", cex=0.7) rect(0,0,0.3,0.09, col="cadetblue2") text(0.15,0.045,"Peak Alignment Table", cex=0.8) rect(0.35,0.0,0.65,0.09, col="navajowhite") text(0.5,0.045,"Peak Info Table", cex=0.8) rect(0.7,0.0,1,0.09, col="pink2") text(0.85,0.045,"Incongruent Quant\nMass List", cex=0.8) text(0.5,.105, "Outputs", cex=0.8) rect(0,0,1,0.09) text(0.5,-0.02, 
"Figure 9. Overview of ConsensusAlign function.", cex=0.7)
As described above, sample files are read in from the file paths provided by the inputFileList flag. If retention time standards are specified with the RT1_Standards or RT2_Standards flags, retention indices are computed for each peak in each sample file as described in section 2.5.2 above. Next, pairwise peak similarity scores are computed for each peak in each sample file and a designated seed file. The seed file defaults to the first file in the inputFileList argument; however, this can be changed with the seedFile argument. Similarity scores are computed as described in section 2.4.2 above. Next, the best peak matches (between each sample file and the seed file) that are above a similarity score threshold are added to a final alignment table. The similarity score threshold dictates the stringency of the alignment and can be manually adjusted with the similarityCutoff flag - acceptable values usually range from 0 to 95, with higher values resulting in more missing values but higher confidence alignments. By setting autoTuneMatchStringency to TRUE, the function will ignore the user-inputted similarityCutoff and attempt to compute an optimal similarity score threshold (see section 2.6.2. below).
Peaks that are present in sample Chromatof files, but not present in the seed file, will be added to the alignment table and searched for in other sample files only if they have a similarity score less than that specified by the dissimilarityCutoff flag (defaults to 90 less than the similarityCutoff) for all other peaks already present in the alignment table. It is important that the dissimilarityCutoff is significantly less than the similarityCutoff so the same peaks are not split over multiple rows in the alignment table. We've found this is a major source of missing data in other aligners.
After all alignments are completed, the alignment table will be trimmed to only peaks that were successfully found in a greater or equal fraction of the samples than that specified by the missingValueLimit flag (defaults to 0.75). If the missingPeakFinderSimilarityLax flag is less than 1 (typical range 0.75-1), the function will perform a second-pass alignment on the missingValueLimit-trimmed alignment table, looking for the missing peaks in samples at a relaxed fraction of the initial similarityCutoff. We've found that these peaks are usually not truly missing in samples; rather, they simply had a similarity score that fell just below the initial similarityCutoff threshold.
Specifying a seed file to which we compare all other sample files greatly reduces the number of pairwise comparisons necessary for alignment, allowing us to conduct threshold-free alignments instead of specifying retention time or spectral match thresholds like many other aligners. However, a disadvantage to this approach is that alignments have the potential to be biased based on the specified seed file. To address this, the user can provide multiple seed files to the seedFile flag as a vector (3 is usually sufficient, depending on the number of total sample files). If more than one seed file is provided, the alignment will be performed with each seed file and only aligned peaks consistent across greater than 50% of the alignments will be outputted. Variable sample values are assigned a median value from all alignments.
After the alignment(s) are complete, if the user provides a metabolite standard library data frame to the standardLibrary flag as described in section 2.5 above, the function will compute similarity scores between each peak in the final alignment table and each metabolite standard in the library. The top three hits in the library and their respective similarity scores are reported in the peak information table. For peaks not present in the standard library, the similarity scores will be very low (<0) and these matches can probably be ignored. From our experience, similarity scores greater than 70 warrant attention as a likely hit.
The last thing to note about this function is that it is compatible with all three Chromatof quant methods, so the user should specify the method used for data processing with the quantMethod flag. If the unique mass method is used (quantMethod="U"), any peaks that are aligned but have different quant masses than the seed file will have their peak area proportionally converted as described in section 2.4.3 above and will be outputted in the incongruent quant mass list. If the apexing mass method is used (quantMethod="A") and a sample peak has less than 50% apexing masses in common with the seed file but is aligned, the peak and sample will be outputted in the incongruent quant mass list. These occurrences should be rare in peaks that pass similarity score thresholds; however, we recommend examining the incongruent quant mass list if using the unique mass or apexing mass quant methods, manually changing the quant mass of the peaks, and reprocessing for final analyses.
Here's a brief example to examine the outputs using the sample input files described in section 2.2, the standard library generated in section 2.5, and filtering problem ions identified in section 2.3 above:
# Locate the example Chromatof files bundled with the package
SampleA <- system.file("extdata", "SampleA.txt", package = "R2DGC")
SampleB <- system.file("extdata", "SampleB.txt", package = "R2DGC")

# Align peaks across both samples, identifying metabolites against the
# standard library built in section 2.5 and filtering the problem ions
# found in section 2.3
Alignment <- ConsensusAlign(c(SampleA, SampleB),
                            standardLibrary = StandardLibrary,
                            commonIons = ProblemIons)
The output consists of a list of three objects. The first is the alignment matrix. This is a matrix that contains all aligned peak areas, with the same number of rows as aligned peaks and the same number of columns as samples.
# Preview the first three rows of the aligned peak-area matrix
head(Alignment$AlignmentMatrix, n=3)
The next output is the peak info table. This is a dataframe that contains the same number of rows as aligned peaks and at least 7 columns containing information describing each peak. If a standard library is used, as we did above, the first three columns are the top three library hits, provided as [standard name]_[similarity score]. The next 7 columns are the core columns always exported and consist of the peak name, retention time, area, spectra, quant mass, and parsed retention times as they appear in the first seed file or the first sample file a peak was observed in. The final columns are the retention indices for each peak, with the same number of columns outputted as retention time standards specified.
# Compact structural summary of the per-peak metadata table
# (nchar.max truncates long strings, e.g. spectra, to 10 characters)
str(Alignment$MetaboliteInfo,nchar.max=10)
The last object outputted is the UnmatchedQuantMasses dataframe. This is a dataframe that is only outputted if using the unique mass or apexing mass quant method and contains information on peaks that were aligned, but had incongruent quant masses as described above. It contains 12 columns, the original 5 columns from the Chromatof file for each of the two aligned peaks and the file path of each sample file. In this example we've used the default TIC quant method so we do not have an UnmatchedQuantMasses dataframe.
# Empty here: the default TIC quant method was used above, so no
# incongruent quant mass information is produced
Alignment$UnmatchedQuantMasses
As described above, the user has the option of manually providing a similarity score threshold for identifying a successful alignment via the similarityCutoff flag in the ConsensusAlign function; however, user-imposed thresholds are often arbitrary and can be difficult to optimize. Thus, we've provided the option of allowing the function to find a semi-optimized threshold by setting the autoTuneMatchStringency flag to TRUE. We've empirically developed an approach to do this that tries a range of potential similarity score thresholds to maximize the number of peaks with at least one successful match, but minimize the total number of successful matches for each peak. More specifically, this approach can be defined as:
# Locate the bundled example input files
SampleA <- system.file("extdata", "SampleA.txt", package = "R2DGC")
SampleB <- system.file("extdata", "SampleB.txt", package = "R2DGC")

# Align on four cores with no retention-time standards
# (an empty RT1_Standards vector skips retention-index computation)
Alignment <- ConsensusAlign(inputFileList = c(SampleA, SampleB),
                            RT1_Standards = c(),
                            numCores = 4)
# Locate the bundled example input files
SampleA <- system.file("extdata", "SampleA.txt", package = "R2DGC")
SampleB <- system.file("extdata", "SampleB.txt", package = "R2DGC")

# Align using the FAME retention-time standards FAME_8 .. FAME_24.
# Bug fix: paste() inserted a space ("FAME_ 8"), so the standard names
# would never match peak names; paste0() matches the naming used with
# MakeReference elsewhere in this vignette.
Alignment <- ConsensusAlign(inputFileList = c(SampleA, SampleB),
                            RT1_Standards = paste0("FAME_", seq(8, 24, 2)),
                            numCores = 4)
# Locate the bundled reference standard files
Standard1 <- system.file("extdata", "Alanine_150226_1.txt", package = "R2DGC")
Standard2 <- system.file("extdata", "Serine_022715_1.txt", package = "R2DGC")

# Build the standard library from the two reference runs
StandardLibrary <- MakeReference(inputFileList = c(Standard1, Standard2),
                                 RT1_Standards = paste0("FAME_", seq(8, 24, 2)))

# Locate the bundled example input files
SampleA <- system.file("extdata", "SampleA.txt", package = "R2DGC")
SampleB <- system.file("extdata", "SampleB.txt", package = "R2DGC")

# Align with retention indexing and library identification.
# Bug fixes: the original passed the nonexistent lowercase `standardLibrary`
# object (R is case-sensitive; the library created above is
# `StandardLibrary`), and used paste() which inserts a space into the
# standard names ("FAME_ 8") so they would never match.
Alignment <- ConsensusAlign(inputFileList = c(SampleA, SampleB),
                            RT1_Standards = paste0("FAME_", seq(8, 24, 2)),
                            standardLibrary = StandardLibrary,
                            numCores = 4)
# Identify problem ions from one representative sample
ProblemIons <- FindProblemIons(inputFile = system.file("extdata", "SampleA.txt",
                                                       package = "R2DGC"),
                               possibleIons = 70:78,
                               numCores = 4)

# Locate the bundled example input files
SampleA <- system.file("extdata", "SampleA.txt", package = "R2DGC")
SampleB <- system.file("extdata", "SampleB.txt", package = "R2DGC")
SampleC <- system.file("extdata", "SampleC.txt", package = "R2DGC")

# Compress sample peaks (files are written with a _Processed.txt extension).
# Style fix: TRUE instead of the reassignable shorthand T.
PrecompressFiles(inputFileList = c(SampleA, SampleB, SampleC),
                 outputFiles = TRUE,
                 commonIons = ProblemIons,
                 numCores = 4)

# Locate the bundled reference standard files and build the library
Standard1 <- system.file("extdata", "Alanine_150226_1.txt", package = "R2DGC")
Standard2 <- system.file("extdata", "Serine_022715_1.txt", package = "R2DGC")
StandardLibrary <- MakeReference(inputFileList = c(Standard1, Standard2),
                                 RT1_Standards = paste0("FAME_", seq(8, 24, 2)))

# Align the precompressed files.
# Bug fix: paste() inserted a space into the standard names ("FAME_ 8");
# paste0() matches the naming used with MakeReference above.
Alignment <- ConsensusAlign(inputFileList = paste0(c(SampleA, SampleB, SampleC),
                                                   "_Processed.txt"),
                            RT1_Standards = paste0("FAME_", seq(8, 24, 2)),
                            standardLibrary = StandardLibrary,
                            commonIons = ProblemIons,
                            numCores = 4)
# Locate the bundled example input files
SampleA <- system.file("extdata", "SampleA.txt", package = "R2DGC")
SampleB <- system.file("extdata", "SampleB.txt", package = "R2DGC")
SampleC <- system.file("extdata", "SampleC.txt", package = "R2DGC")

# Multi-seed alignment: use each of the three input files as a seed and
# keep only consistently aligned peaks.
# Bug fixes: the original passed only c(SampleA, SampleB) even though
# SampleC was defined and seedFile = c(1, 2, 3) indexes a third input
# file; paste() also inserted a space into the standard names ("FAME_ 8").
Alignment <- ConsensusAlign(inputFileList = c(SampleA, SampleB, SampleC),
                            RT1_Standards = paste0("FAME_", seq(8, 24, 2)),
                            numCores = 4,
                            seedFile = c(1, 2, 3))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.