#' xMSwrapper.XCMS.matchedFilter
#'
#' Wrapper function for XCMS based on matched filter alignment algorithm,
#' evaluate.Features,evaluate.Samples,merge.Results, and feat.batch.annotation
#'
#' The wrapper function includes five stages to utilize information from the
#' technical replicates to optimize the data extraction process, enhance data
#' quality, search for targeted features/metabollites, perform QA & QC
#' evaluations including batch-effect evaluation & correction. : 1) features
#' are extracted using different parameters 2) results from each parameter
#' setting are evaluated for sample quality & feature consistency, 3) filtered
#' results from individual settings are merged to obtain a combined feature
#' table; the optimization score that takes into account the number of features
#' and average CV (see Uppal 2013) is used to determine the most optimal set of
#' parameters. 4) A targeted feature table using the list of m/z in the "refMZ"
#' file is generated 5) Quality measures of each featue includes: number of
#' samples including replicates with non-missing values (NumPres.All.Samples),
#' number of biological samples for which more than 60% of technical replicates
#' have a non-missing/non-zero value (NumPres.Biol.Samples), median coefficient
#' of variation (CV) within technical replicates summarized across all samples;
#' Qscore (Quality score which is calculated using NumPres.All.Samples,
#' NumPres.Biol.Samples, median CV, and delta ppm between mz.min and mz.max;
#' Higher is better) Users have the option to filter poor quality samples and
#' features based on correlation between technical replicates and feature
#' reproducibility measures such as PID or CV, respectively.
#'
#' @param cdfloc The folder where all CDF/mzXML files to be processed are
#' located. eg: "C:/CDF/". Use "cdfloc=NA" if the XCMS results already exist
#' in XCMS.outloc.
#' @param XCMS.outloc The folder where XCMS output will be written. eg:
#' "C:/XCMSoutput/"
#' @param xMSanalyzer.outloc The folder where xMSanalyzer output will be
#' written. eg: "C:/xMSanalyzeroutput/"
#' @param step.list list containing values for the step size
#' @param mz.diff.list list containing values for the minimum difference for
#' features with retention time overlap
#' @param sn.thresh.list list containing values for signal to noise ratio
#' cutoff variable
#' @param max Value for maxnimum number of peaks per EIC variable
#' @param bw bandwidth value list. eg: c(10,30)
#' @param minfrac.val minimum fraction of samples necessary in at least the
#' sample groups for it to be a valid group
#' @param minsamp minimum number of samples necessary in at least one of the
#' sample groups for it to be a valid group
#' @param mzwid width of overlapping m/z slices to use for creating peak
#' density chromatograms and grouping peaks across samples
#' @param sleep seconds to pause between plotting successive steps of the peak
#' grouping algorithm. Peaks are plotted as points showing relative intensity.
#' identified groups are flanked by dotted vertical lines.
#' @param retcor.family Used by matchedFilter alignment method. Use "gaussian"
#' to perform fitting by least squares without outlier removal. Or "symmetric"
#' to use a redescending M estimator with Tukey's biweight function that allows
#' outlier removal.
#' @param retcor.plottype Used by both matchedFilter and centWave alignment
#' methods. eg: "deviation" or "mdevden"
#' @param groupval.method Conflict resolution method while calculating peak
#' values for each group. eg: "medret" or "maxint"
#' @param numnodes Number of computing nodes to use. eg: 1
#' @param run.order.file Name of a tab-delimited file that includes sample
#' names sorted by the order in which they were run (sample names must match
#' the CDF file names)
#' @param max.mz.diff +/- mz tolerance in ppm for feature matching
#' @param max.rt.diff retention time tolerance for feature matching
#' @param merge.eval.pvalue Threshold for defining signifcance level of the
#' paired t-test or the Pearson correlation during the merge stage in
#' xMSanalyzer. The p-value is used to determine whether two features with same
#' m/z and retention time have identical intensity profiles.
#' @param mergecorthresh Correlation threshold to be used during the merge
#' stage in xMSanalyzer to determine whether two features with same m/z and
#' retention time have identical intensity profiles.
#' @param num_replicates Number of replicates per sample
#' @param subs If not all the CDF files in the folder are to be processed, the
#' user can define a subset using this parameter.
#' @param mz.tolerance.dbmatch m/z threshold for database matching
#' @param adduct.list List of adducts for matching m/zs in KEGG. eg:
#' c("M+H","M+H-H2O")
#' @param samp.filt.thresh Threshold for filtering samples based on Pearson
#' correlation between technical replicates. eg: 0.7
#' @param feat.filt.thresh Threshold for filtering samples based on percent
#' intensity difference or coefficient of variation. eg: 50
#' @param cormethod Method for determing correlation between technical
#' replicates. eg: "pearson" or "spearman
#' @param mult.test.cor Should Bonferroni multiple testing correction method be
#' applied for comparing intensities of overlapping m/z? Default: FALSE
#' @param missingvalue How are missing values represented? eg: 0 or NA
#' @param ignore.missing Should the missing values be ignored while computing
#' pearson correlation. eg: TRUE or FALSE
#' @param deltamzminmax.tol Maximum allowed delta ppm between mz.min and
#' mz.max. Eg: 10
#' @param refMZ Full path of the file with m/z of the targeted chemicals to
#' search for. If the value is "NA", then the list of metabolites in the
#' data(example_target_list) is used. The input file should be formatted as
#' data(example_target_list): Column A: m/z of the targeted metabolite
#' (required) Column B: retention time (Optional) Column C: Name of the
#' metabolite (required)
#' @param refMZ.mz.diff The ppm tolerance to search for targeted
#' metabolites/features in xMSanalyzer. Default: 10
#' @param refMZ.time.diff The time tolerance to search for targeted
#' metabolites/features in xMSanalyzer. Default: NA
#' @param void.vol.timethresh Time threshold for void volume. The program
#' searches for the void volume cutoff within the defined time limit. Default:
#' 30
#' @param replacezeroswithNA Should 0s be treated as missing values during
#' ComBat (TRUE or FALSE). Default: TRUE
#' @param scoreweight The w parameter in the scoring function defined in the
#' xMSanalyzer manuscript. Uppal 2013, BMC Bioinformatiocs. Default: 30
#' @param sample_info_file File listing the order in which the samples were
#' run. The format of the file should be as follows: Column A:File names
#' matching the CDF/mzXML files Column B: Sample ID Column C: Batch number
#' (This column should be labeled "Batch") Column D: Additional covariates to
#' adjust for (Optional)
#' @param filepattern File format of spectral data files. eg: ".cdf", ".mzXML"
#' @param charge_type Ionization mode. "pos" or "neg" Default: "pos"
#' @param minexp.pct Minimum fraction of samples in which the signal should be
#' present. Default: "0.1"
#' @param syssleep Sleep time in between KEGG REST queries. Default: "0.5"
#' @return A list is returned. \item{XCMS.merged.res }{Merged feature table,
#' P1 U P2 where P1 and P2 are two sets of parameter settings. The "QRscore" is
#' defined as the ratio of percentage of biological samples for which more than
#' 50 percent of technical replicates have a signal and median PID or CV.
#' QRscore=Percent good samples/median PID or CV} \item{XCMS.ind.res}{List with
#' results from individual parameter settings.}
#' \item{XCMS.ind.res.filtered}{List with results from individual parameter
#' settings after filtering.} \item{annot.res}{List with annotation results
#' after merging results.} \item{feat.eval.ind}{List with feature evaluation
#' results based on CV or PID at each parameter setting.}
#' \item{sample.eval.ind}{List with sample evaluation results at each parameter
#' setting based on correlation between technical replicates.} Outputs XCMS
#' results at each parameter settings to XCMS.outloc and the following to
#' xMSanalyzer.outloc: -feature consistency results -sample evaluation results
#' -feature list after merging results from parameter settings A and B -merge
#' summary
#' @author Karan Uppal <kuppal2@@emory.edu>
#' @keywords ~XCMS ~wrapper ~matched filter
xMSwrapper.XCMS.matchedFilter <- function(cdfloc,
XCMS.outloc, xMSanalyzer.outloc, step.list = c(0.001,
0.01, 0.1), mz.diff.list = c(0.001, 0.01,
0.1), sn.thresh.list = c(10), max = 50, bw = c(10),
minfrac.val = 0.5, minsamp = 2, mzwid = 0.25,
sleep = 0, retcor.family = "symmetric", retcor.plottype = "mdevden",
groupval.method = "medret", numnodes = 2, run.order.file = NA,
max.mz.diff = 15, max.rt.diff = 300, merge.eval.pvalue = 0.2,
mergecorthresh = 0.7, deltamzminmax.tol = 10,
num_replicates = 2, subs = NA, mz.tolerance.dbmatch = 10,
adduct.list = c("M+H"), samp.filt.thresh = 0.7,
feat.filt.thresh = 50, cormethod = "pearson",
mult.test.cor = TRUE, missingvalue = 0, ignore.missing = TRUE,
sample_info_file = NA, refMZ = NA, refMZ.mz.diff = 10,
refMZ.time.diff = NA, void.vol.timethresh = 30,
replacezeroswithNA = TRUE, scoreweight = 30, filepattern = ".cdf",
charge_type = "pos", minexp.pct = 0.1, syssleep = 0.5) {
suppressWarnings(sink(file = NULL))
x <- date()
x <- strsplit(x, split = " ")
targeted_feat_raw <- {
}
x1 <- unlist(x)
# fname<-paste(x1[2:5],collapse='_')
# fname<-paste(x1[2:3],x1[5],x1[4],collapse='_')
fname <- paste(x1[2:3], collapse = "")
fname <- paste(fname, x1[5], sep = "")
x1[4] <- gsub(x1[4], pattern = ":", replacement = "_")
fname <- paste(fname, x1[4], sep = "_")
fname <- paste(xMSanalyzer.outloc, "/Log_", fname,
".txt", sep = "")
print(paste("Program is running. Please check the logfile for runtime status: ",
fname, sep = ""))
data_rpd_all = new("list")
data_rpd = new("list")
union_list = new("list")
feateval_list <- new("list")
rsqres_list <- new("list")
annot.res <- {
}
annotres_list <- new("list")
best_score <- (-1e+06)
if (is.na(sample_info_file) == FALSE) {
sampleid_mapping <- read.table(sample_info_file,
sep = "\t", header = TRUE)
if (is.na(cdfloc) == FALSE) {
l1 <- list.files(cdfloc, filepattern)
minexp <- minfrac.val * length(l1)
} else {
l1 <- rep(1, num_replicates)
}
if (length(l1) != dim(sampleid_mapping)[1] &
(is.na(cdfloc) == FALSE)) {
num_mis_files <- dim(sampleid_mapping)[1] -
length(l1)
stop(paste("ERROR: Only ", length(l1),
" spectral files were found. ", num_mis_files,
" files are missing.", sep = ""))
}
} else {
if (is.na(cdfloc) == FALSE) {
l1 <- list.files(cdfloc, filepattern)
minexp <- minfrac.val * length(l1)
} else {
l1 <- rep(1, num_replicates)
}
}
if (length(l1)%%num_replicates > 0) {
stop(paste("ERROR: Not all samples have ",
num_replicates, " replicates.", sep = ""))
}
dir.create(XCMS.outloc, showWarnings = FALSE)
dir.create(xMSanalyzer.outloc, showWarnings = FALSE)
# subdir1<-paste(xMSanalyzer.outloc,'/Quality_assessment_files',sep='')
# subdir2<-paste(xMSanalyzer.outloc,'/XCMS_filtered_data',sep='')
# subdir3<-paste(xMSanalyzer.outloc,'/XCMS_with_xMSanalyzer_merged_data',sep='')
sink(fname)
print(sessionInfo())
if (is.na(refMZ) == FALSE) {
stddata <- read.table(refMZ, sep = "\t", header = TRUE)
print(refMZ)
print(head(stddata))
} else {
if (charge_type == "pos") {
data(example_target_list_pos)
stddata <- example_target_list_pos
} else {
if (charge_type == "neg") {
data(example_target_list_neg)
stddata <- example_target_list_neg
} else {
stop("Invalid option. 'charge_type' should be 'pos' or 'neg'.")
}
}
}
############################################ 1) Align profiles using the cdf.to.ftr wrapper
############################################ function in apLCMS
if (is.na(XCMS.outloc) == TRUE) {
stop("Undefined value for parameter, XCMS.outloc. Please define the XCMS output location.")
}
if (is.na(xMSanalyzer.outloc) == TRUE) {
stop("Undefined value for parameter, xMSanalyzer.outloc. Please define the xMSanalyzer output location.")
}
if (is.na(cdfloc) == FALSE) {
setwd(cdfloc)
if (is.na(XCMS.outloc) == FALSE) {
data_rpd_all = XCMS.align.matchedFilter(cdfloc,
XCMS.outloc, step.list = step.list,
mz.diff.list = mz.diff.list, sn.thresh.list = sn.thresh.list,
max = max, bw.val = bw, minfrac.val = minfrac.val,
minsamp.val = minsamp, mzwid.val = mzwid,
sleep.val = sleep, run.order.file = run.order.file,
subs = subs, retcor.family = retcor.family,
retcor.plottype = retcor.plottype,
groupval.method = groupval.method,
target.mz.list = stddata)
# data_rpd_all=XCMS.align.matchedFilter(cdfloc,
# XCMS.outloc,step.list, mz.diff.list,
# sn.thresh.list, max, bw, minfrac.val, minsamp,
# mzwid, sleep, run.order.file,subs,
# retcor.family, retcor.plottype,groupval.method)
} else {
stop("Undefined value for parameter, XCMS.outloc. Please define the output location.")
}
}
setwd(XCMS.outloc)
alignmentresults <- list.files(XCMS.outloc, "*.txt")
print("Files found in XCMS output location:")
print(alignmentresults)
for (i in 1:length(alignmentresults)) {
data_rpd_all[[i]] <- read.table(paste(XCMS.outloc,
"/", alignmentresults[i], sep = ""), header = TRUE)
}
subdir1 <- paste(xMSanalyzer.outloc, "/Stage1",
sep = "") #QC individual parameter settings
subdir2 <- paste(xMSanalyzer.outloc, "/Stage2",
sep = "") #Data filtering
subdir3 <- paste(xMSanalyzer.outloc, "/Stage3a",
sep = "") #Data merger/parameter optimization
subdir3b <- paste(xMSanalyzer.outloc, "/Stage3b",
sep = "")
subdir4a <- paste(xMSanalyzer.outloc, "/Stage4a",
sep = "") #Raw QC: batch effect eval, TIC, etc
dir.create(subdir1, showWarnings = FALSE)
dir.create(subdir2, showWarnings = FALSE)
dir.create(subdir3, showWarnings = FALSE)
dir.create(subdir3b, showWarnings = FALSE)
dir.create(subdir4a, showWarnings = FALSE)
if (is.na(sample_info_file) == FALSE) {
subdir4b <- paste(xMSanalyzer.outloc, "/Stage4b",
sep = "") #Batch-effect corrected QC: batch effect eval, TIC, etc
dir.create(subdir4b, showWarnings = FALSE)
}
if (is.na(adduct.list) == FALSE) {
subdir5 <- paste(xMSanalyzer.outloc, "/Stage5",
sep = "") #Putative unprocessed annotations;
dir.create(subdir5, showWarnings = FALSE)
}
bestscore <- (-1e+06)
{
# stop('Undefined value for parameter, cdfloc.
# Please enter path of the folder where the CDF
# files to be processed are located.') change
# location to the output folder
setwd(XCMS.outloc)
alignmentresults <- list.files(XCMS.outloc,
"*.txt")
if (length(data_rpd_all) > 0) {
curdata_dim = {
}
if (num_replicates == 2) {
fileroot = "_PID"
} else {
if (num_replicates > 2) {
fileroot = "_CV"
} else {
fileroot = ""
stop("Need at least 2 technical replicates per sample.")
}
}
# for(i in 1:length(alignmentresults))
cat("\n")
print("*******xMSanalyzer Stage 1: QC evaluation of invidual parameters*******")
cat("\n")
for (i in 1:length(data_rpd_all)) {
print("dim of data is ")
print(dim(data_rpd_all[[i]]))
print(paste("**Evaluating XCMS results from parameter setting ",
i, "**", sep = ""))
############################################ 2)Calculate pairwise correlation coefficients
file_name = sapply(strsplit(alignmentresults[i],
".txt"), head)
# curdata=read.table(paste(XCMS.outloc,'/',alignmentresults[i],sep=''),header=TRUE)
# curdata=check.mz.in.replicates(curdata)
curdata = data_rpd_all[[i]]
############################################# 3) Calculate Percent Intensity Difference
if (num_replicates > 1) {
feat.eval.result = evaluate.Features(curdata,
numreplicates = num_replicates,
min.samp.percent = 0.6, alignment.tool = "XCMS",
impute.bool = TRUE)
cnames = colnames(feat.eval.result)
feat.eval.result <- apply(feat.eval.result,
2, as.numeric)
feat.eval.result <- as.data.frame(feat.eval.result)
feat.eval.result.mat = cbind(curdata[,
c(1:8)], feat.eval.result)
feat.eval.outfile = paste(subdir1,
"/", file_name, fileroot, "featureassessment.txt",
sep = "")
# write results
write.table(feat.eval.result.mat,
feat.eval.outfile, sep = "\t",
row.names = FALSE)
curdata <- curdata[which(as.numeric(feat.eval.result$median) <=
feat.filt.thresh), ]
feat.eval.result.mat <- feat.eval.result.mat[which(as.numeric(feat.eval.result$median) <=
feat.filt.thresh), ]
# curdata<-cbind(curdata,feat.eval.result[which(as.numeric(feat.eval.result$median)<=feat.filt.thresh),])
curdata <- as.data.frame(curdata)
curdata <- replace(as.matrix(curdata),
which(is.na(curdata) == TRUE),
0)
if (is.na(deltamzminmax.tol) ==
FALSE) {
print("filtering by delta m/z")
mz_min_max <- cbind(curdata[,
2], curdata[, 3])
mz_min_max <- as.data.frame(mz_min_max)
deltappm_res <- apply(mz_min_max,
1, get_deltappm)
curdata <- curdata[which(as.numeric(deltappm_res) <=
deltamzminmax.tol), ]
feat.eval.result.mat <- feat.eval.result.mat[which(as.numeric(deltappm_res) <=
deltamzminmax.tol), ]
}
feateval_list[[i]] <- feat.eval.result.mat
data_rpd[[i]] <- curdata
if (num_replicates > 1) {
print(paste("**calculating pairwise ",
cormethod, " correlation**",
sep = ""))
rsqres_list <- evaluate.Samples(curdata,
num_replicates, alignment.tool = "XCMS",
cormethod, missingvalue, ignore.missing)
rsqres <- as.data.frame(rsqres_list$cor.matrix)
curdata <- as.data.frame(rsqres_list$feature.table)
rsqres <- as.data.frame(rsqres)
snames <- colnames(curdata[, -c(1:8)])
snames_1 <- snames[seq(1, length(snames),
num_replicates)]
rownames(rsqres) <- snames_1
pcor_outfile = paste(subdir1,
"/", file_name, "_sampleassessment_usinggoodfeatures.txt",
sep = "")
write.table(rsqres, pcor_outfile,
sep = "\t", row.names = TRUE)
rsqres_list[[i]] <- rsqres
} else {
print("**skipping sample evaluataion as only one replicate is present**")
}
if (num_replicates > 2) {
bad_samples <- which(rsqres$meanCorrelation <
samp.filt.thresh)
} else {
bad_samples <- which(rsqres$Correlation <
samp.filt.thresh)
}
if (length(bad_samples) > 0) {
bad_sample_names <- snames_1[bad_samples]
feat.eval.outfile = paste(subdir1,
"/", file_name, "_badsamples_at_cor",
samp.filt.thresh, ".txt", sep = "")
bad_sample_names <- as.data.frame(bad_sample_names)
colnames(bad_sample_names) <- paste("Samples with correlation between technical replicates <",
samp.filt.thresh, sep = "")
write.table(bad_sample_names,
file = feat.eval.outfile, sep = "\t",
row.names = FALSE)
}
bad_list = {
}
if (length(bad_samples) > 0) {
for (n1 in 1:length(bad_samples)) {
if (bad_samples[n1] > 1) {
bad_samples[n1] = bad_samples[n1] +
(bad_samples[n1] - 1) *
(num_replicates - 1)
}
}
for (n1 in 1:num_replicates) {
bad_list <- c(bad_list, (bad_samples +
n1 - 1))
}
bad_list <- bad_list[order(bad_list)]
}
if (i > 1) {
parent_bad_list <- intersect(parent_bad_list,
bad_list)
} else {
parent_bad_list <- bad_list
}
} else {
print("**skipping feature evaluataion as only one replicate is present**")
}
}
cat("\n")
print("********Stage 2: Filtering results from each parameter setting based on sample and feature quality checks*******")
cat("\n")
for (i in 1:length(alignmentresults)) {
file_name = sapply(strsplit(alignmentresults[i],
".txt"), head)
feat.eval.file = paste(subdir2, "/",
file_name, "cor", samp.filt.thresh,
fileroot, feat.filt.thresh, "filtereddata.txt",
sep = "")
# data_rpd[[i]]=read.table(feat.eval.file,header=TRUE)
curdata <- data_rpd[[i]]
feat.eval.result.mat <- feateval_list[[i]]
if (length(parent_bad_list) > 0) {
curdata <- curdata[, -c(parent_bad_list +
8)]
# maxint<-apply(curdata[,-c(1:4,((dim(curdata)[2]-6):dim(curdata)[2]))],1,max)
maxint <- apply(curdata[, -c(1:8)],
1, max)
badfeats <- which(maxint == 0)
if (length(badfeats) > 0) {
curdata <- curdata[-c(badfeats),
]
feat.eval.result.mat <- feat.eval.result.mat[-c(badfeats),
]
feateval_list[[i]] <- feat.eval.result.mat
}
}
data_rpd[[i]] <- curdata[which(as.numeric(feat.eval.result.mat$median) <=
feat.filt.thresh), ]
feat.eval.outfile = paste(subdir2,
"/", file_name, "cor", samp.filt.thresh,
fileroot, feat.filt.thresh, "filtereddata.txt",
sep = "")
# write results
write.table(data_rpd[[i]], feat.eval.outfile,
sep = "\t", row.names = FALSE)
feat.eval.outfile = paste(subdir1,
"/", file_name, fileroot, "featureassessment.txt",
sep = "")
# write results
write.table(feateval_list[[i]], feat.eval.outfile,
sep = "\t", row.names = FALSE)
}
########################################### 4) Merge two or more parameter settings
cat("\n")
print("*************Stage 3a: Merging features detected at different parameter settings********************")
cat("\n")
num_pairs = 1
finalres = {
}
rnames = {
}
for (i in 1:length(alignmentresults)) {
file_name = sapply(strsplit(alignmentresults[i],
".txt"), head)
# feat.eval.file=paste(xMSanalyzer.outloc,'/',file_name,'cor',samp.filt.thresh,fileroot,feat.filt.thresh,'filtereddata.txt',sep='')
# data_rpd[[i]]=read.table(feat.eval.file,header=TRUE)
a1 = sapply(strsplit(as.character(alignmentresults[i]),
"\\_thresh"), head, n = 2)[2]
a2 = sapply(strsplit(as.character(a1),
"\\_"), head, n = 2)
threshval = as.numeric(a2[1])
stepval = sapply(strsplit(as.character(a2[2]),
"\\."), head, n = 2)[2]
stepval = paste(".", stepval, sep = "")
stepval = as.numeric(stepval)
a1 = sapply(strsplit(as.character(alignmentresults[i]),
"\\_mzdiff"), head, n = 2)[2]
a2 = sapply(strsplit(as.character(a1[1]),
"\\_max"), head, n = 2)
mzdiff = as.numeric(a2[1])
a2 = sapply(strsplit(as.character(a2[2]),
"\\.txt"), head, n = 2)
max = as.numeric(a2[1])
p1 = paste(threshval, "_", stepval,
"_", mzdiff, "_", max, sep = "")
bool_num <- 1
for (j in i:length(alignmentresults)) {
bool_num <- 1
if (i == j) {
if (length(alignmentresults) >
1) {
bool_num <- 0
} else {
bool_num <- 1
}
}
# if(i!=j)
if (bool_num == 1) {
file_name = sapply(strsplit(alignmentresults[j],
".txt"), head)
# if(i!=j)
{
p1_p2 = paste("p", i, "_U_",
"p", j, sep = "")
}
# else { p1_p2='p1' }
feat.eval.A <- feateval_list[[i]]
feat.eval.B <- feateval_list[[j]]
feat.eval.A <- feat.eval.A[which(as.numeric(feat.eval.A$median) <=
feat.filt.thresh), ]
feat.eval.B <- feat.eval.B[which(as.numeric(feat.eval.B$median) <=
feat.filt.thresh), ]
# data_m=merge.Results(data_rpd[[i]],data_rpd[[j]],feateval[[i]],feateval[[j]],max.mz.diff,max.rt.diff,merge.eval.method,merge.eval.pvalue,alignment.tool='XCMS',
print(paste("Number of good quality features from setting ",
i, ":", dim(data_rpd[[i]])[1],
sep = ": "))
if (i != j) {
print(paste("Number of good quality features from setting ",
j, ":", dim(data_rpd[[j]])[1],
sep = ": "))
}
data_m = merge.Results(data_rpd[[i]],
data_rpd[[j]], feat.eval.A,
feat.eval.B, max.mz.diff, max.rt.diff,
merge.eval.pvalue, alignment.tool = "XCMS",
numnodes = numnodes, mult.test.cor,
mergecorthresh, missingvalue)
data_m <- unique(data_m)
numcols <- dim(data_m)[2]
data_m_int <- data_m[, -c(1:8,
(numcols - 7):numcols)]
numsamps <- dim(data_m_int)[2]/num_replicates
numcols <- dim(data_m)[2]
if (is.na(minexp.pct) == FALSE) {
minexp <- minexp.pct * dim(data_m_int)[2]
if (length(which(data_m[, 8] >=
minexp)) > 0) {
data_m <- data_m[which(data_m[,
8] >= minexp), ]
} else {
stop(paste("No features have non-missing value in ",
minexp, " samples", sep = ""))
}
}
data_m_int <- data_m[, -c(1:8,
(numcols - 7):numcols)]
numsamps <- dim(data_m_int)[2]/num_replicates
maxint <- apply(data_m_int, 1,
function(x) {
max(x, na.rm = TRUE)
})
numpeaks <- lapply(1:dim(data_m_int)[1],
function(j) {
length(which(is.na(data_m_int[j,
]) == FALSE))
})
union_list[[num_pairs]] <- data_m[,
c(1:8)]
# union_list[[num_pairs]]<-cbind(union_list[[num_pairs]],numpeaks)
union_list[[num_pairs]] <- cbind(union_list[[num_pairs]],
data_m$numgoodsamples)
union_list[[num_pairs]] <- cbind(union_list[[num_pairs]],
data_m$median)
# Qscore<-(as.numeric(data_m$numgoodsamples)/as.numeric(data_m$median+0.1))
# Qscore<-100*(Qscore/numsamps)
union_list[[num_pairs]] <- cbind(union_list[[num_pairs]],
data_m$Qscore)
union_list[[num_pairs]] <- cbind(union_list[[num_pairs]],
maxint)
union_list[[num_pairs]] <- cbind(union_list[[num_pairs]],
data_m_int)
featinfo <- colnames(data_m[,
c(1:7)])
cnames <- colnames(data_m_int)
merge.res.colnames <- c(featinfo,
"NumPres.All.Samples", "NumPres.Biological.Samples",
paste("median", fileroot, sep = ""),
"Qscore", "Max.Intensity", cnames)
colnames(union_list[[num_pairs]]) <- as.character(merge.res.colnames)
curres = {
}
curres = cbind(curres, p1_p2)
curres = cbind(curres, dim(union_list[[num_pairs]])[1])
curres = cbind(curres, mean(as.numeric(data_m$median)))
curres = cbind(curres, mean(as.numeric(data_m$Qscore)))
# curscore<-(dim(union_list[[num_pairs]])[1]-(scoreweight*mean(as.numeric(data_m$Qscore))))
curscore <- (dim(union_list[[num_pairs]])[1] -
(scoreweight * mean(as.numeric(data_m$median))))
if (curscore > bestscore) {
bestscore <- curscore
best_i <- num_pairs
best_pair <- p1_p2
}
curres = cbind(curres, curscore)
curres <- as.data.frame(curres)
finalres = rbind(finalres, curres)
finalname = paste("XCMS_feature_list_at_",
p1_p2, "cor", samp.filt.thresh,
fileroot, feat.filt.thresh,
".txt", sep = "")
# Output merge results
write.table(union_list[[num_pairs]],
file = paste(subdir3, "/", finalname,
sep = ""), sep = "\t", row.names = FALSE)
num_pairs = num_pairs + 1
}
}
}
finalres <- as.data.frame(finalres)
colnames(finalres) <- c("Parameter Combination",
"Number of Features", "median PID/CV between sample replicates",
"mean Qscore (Quality score)", "Parameter score")
write.table(finalres, file = paste(xMSanalyzer.outloc,
"/Stage3a/xcms_with_xMSanalyzer_merge_summary.txt",
sep = ""), sep = "\t", row.names = FALSE)
print("Most optimal feature setting:")
print(best_pair)
cat("\n")
#########################################################################
cat("\n")
print(paste("********Stage 3b: Generating final (pre-batcheffect correction) untargeted and targeted feature tables using ",
best_pair, " results******", sep = ""))
cat("\n")
# rawQCeval/
# pdf('Stage4a_QC_plots.pdf')
# pdf(file=paste('subdir4a/Stage4a_QC_plots.pdf',sep=''))
pdf(file = paste(subdir4a, "/Stage4a_QC_plots.pdf",
sep = ""))
d1 <- union_list[[best_i]]
if (is.na(refMZ) == FALSE) {
stddata <- read.table(refMZ, sep = "\t",
header = TRUE)
print(refMZ)
print(head(stddata))
} else {
if (charge_type == "pos") {
data(example_target_list_pos)
stddata <- example_target_list_pos
} else {
if (charge_type == "neg") {
data(example_target_list_neg)
stddata <- example_target_list_neg
} else {
stop("Invalid option. 'charge_type' should be 'pos' or 'neg'.")
}
}
}
Sys.sleep(1)
hist(d1$NumPres.All.Samples, main = "Histogram NumPres.All.Samples",
col = "orange", xlab = "Number of samples (including replicates) \n with non-zero intensity values",
ylab = "Number of features", cex.main = 0.7)
hist(d1$NumPres.Biological.Samples, main = "Histogram NumPres.Biological.Samples \n (using all data)",
col = "orange", xlab = "Number of biological samples \n with non-zero intensity values",
ylab = "Number of features", cex.main = 0.7)
# h1<-hist(d1$median_CV,main='Histogram median CV
# \n (using all data)',col='orange',xlab='median
# CV%', ylab='Number of features',cex.main=0.7)
# h1<-hist(d1$median_CV,breaks=seq(0,max(d1$median_CV,na.rm=TRUE)+10,10),main='Histogram
# median CV \n (using all
# data)',col='orange',xlab='median CV%',
# ylab='Number of features',cex.main=0.7)
if (num_replicates > 2) {
h1 <- hist(d1$median_CV, breaks = seq(0,
max(d1$median_CV, na.rm = TRUE) +
10, 10), main = "Histogram median CV \n (using all data)",
col = "orange", xlab = "median CV%",
ylab = "Number of features", cex.main = 0.7)
} else {
h1 <- hist(d1$median_PID, breaks = seq(0,
max(d1$median_PID, na.rm = TRUE) +
10, 10), main = "Histogram median CV \n (using all data)",
col = "orange", xlab = "median CV%",
ylab = "Number of features", cex.main = 0.7)
}
hist(d1$Qscore, main = "Histogram Qscore \n (using all data)",
col = "orange", xlab = "Quality score",
ylab = "Number of features", cex.main = 0.7)
# pie(h1$counts,col=rainbow(length(h1$counts)),labels=h1$breaks,main='Pie
# chart of median CVs (%) \n using all
# features',cex.main=0.7)
# pie(h1$counts,col=rainbow(length(h1$counts)),labels=h1$breaks,main='Pie
# chart of median CVs (%) \n using all
# features',cex.main=0.7)
lab_text <- paste(h1$breaks, "-", h1$breaks +
10, sep = "")
if (num_replicates > 2) {
pie(h1$counts, col = rainbow(length(h1$counts)),
labels = lab_text, main = paste("Pie chart of median CVs (%) \n using all features\n; average=",
mean(d1$median_CV), sep = ""),
cex.main = 0.7)
} else {
pie(h1$counts, col = rainbow(length(h1$counts)),
labels = lab_text, main = paste("Pie chart of median PIDs (%) \n using all features\n; average=",
mean(d1$median_PID), sep = ""),
cex.main = 0.7)
}
d2 <- d1[order(d1$time), ]
plot(d2$time, d2$Max.Intensity, main = "TIC \n (using all data)",
col = "orange", xlab = "Time (s)",
ylab = "Max intensity across all samples/profiles",
cex.main = 0.7)
plot(d2$time, d2$mz, main = "m/z vs Time \n (using all data)",
col = "orange", xlab = "Time (s)",
ylab = "m/z", cex.main = 0.7)
plot(d2$time, d2$Qscore, main = "Qscore vs Time \n (using all data)",
col = "orange", xlab = "Time (s)",
ylab = "Qscore", cex.main = 0.7)
plot(d2$mz, d2$Qscore, main = "Qscore vs m/z \n (using all data)",
col = "orange", xlab = "m/z", ylab = "Qscore",
cex.main = 0.7)
max_numzeros <- dim(d1)[2] * 1
if (is.na(void.vol.timethresh) == FALSE) {
dfirst15 <- d1[which(d1[, 2] < void.vol.timethresh),
]
if (nrow(dfirst15) > 1) {
ind1 <- which(dfirst15[, 12] ==
max(dfirst15[, 12]))
time_thresh <- dfirst15[ind1, 2]
time_thresh <- time_thresh - (0.3 *
time_thresh)
# qplot(dfirst15[,2],dfirst15[,9],xlab='Time (s)',
# ylab='Max intensity across all samples')
plot(dfirst15[, 2], dfirst15[, 9],
xlab = "Time (s)", ylab = "Max intensity across all samples",
main = paste("Estimated void volume time: ",
time_thresh, " s", sep = ""))
abline(v = time_thresh, col = 4,
lty = 3)
d1 <- d1[which(d1$time >= time_thresh),
]
print("Estimated void volume time cutoff")
print(time_thresh)
}
d1_int <- round(d1[, -c(1:12)], 0)
d1 <- cbind(d1[, c(1:12)], d1_int)
rm(d1_int)
# sfname<-paste(xMSanalyzer.outloc,'/stage5/feature_table_',best_pair,'_cor',samp.filt.thresh,fileroot,feat.filt.thresh,'filtered.txt',sep='')
finalname <- paste("featuretable_",
best_pair, "_cor", samp.filt.thresh,
fileroot, feat.filt.thresh, "_voidtimefilt.txt",
sep = "")
write.table(d1, file = paste(subdir3b,
"/", finalname, sep = ""), sep = "\t",
row.names = FALSE)
} else {
finalname <- paste("feature_table_",
best_pair, "_cor", samp.filt.thresh,
fileroot, feat.filt.thresh, ".txt",
sep = "")
write.table(d1, file = paste(subdir3b,
"/", finalname, sep = ""), sep = "\t",
row.names = FALSE)
}
cat("\n")
print(paste("*********Stage 4a: Search for target features, performing QC evaluation using ",
best_pair, " results********", sep = ""))
cat("\n")
Sys.sleep(1)
print("Dim data after void time filtering")
print(dim(d1))
data_m <- d1[, -c(1:12)]
if (replacezeroswithNA == TRUE) {
data_m <- replace(as.matrix(data_m),
which(data_m == 0), NA)
}
counna <- apply(data_m, 1, function(x) {
length(which(is.na(x) == TRUE))
})
maxzeros <- 1 * dim(data_m)[2]
if (length(which(counna < maxzeros))) {
data_m <- data_m[which(counna < maxzeros),
]
}
maxint <- apply(data_m, 1, function(x) {
max(x, na.rm = TRUE)
})
maxint_ord <- order(maxint, decreasing = TRUE)
# [maxint_ord[1:5000]
X <- t(data_m) #[maxint_ord[1:2000],])
X <- replace(as.matrix(X), which(is.na(X) ==
TRUE), 0)
tic.eval(d1[, -c(1:12)], outloc = subdir4a)
feat.eval.result = evaluate.Features(d1[,
-c(9:12)], numreplicates = num_replicates,
min.samp.percent = 0.6, alignment.tool = "XCMS",
impute.bool = TRUE)
Sys.sleep(1)
s1 <- "Stage 1 results: QC evaluation of invidual parameters from apLCMS"
s2 <- "Stage 2 results: filtered results from each paramter setting based on sample and feature quality (CV within replicates) checks"
s3 <- "Stage 3a results: merged results using stage 2 filtered files"
s3b <- "Stage 3b results: Final untargeted and targeted feature tables"
s4a <- "Stage 4a results: QC evaluation of targeted and untargeted data before batch-effect correction"
sm <- rbind(s1, s2, s3, s3b, s4a)
targeted_feat_raw <- eval.target.mz(dataA = d1[,
-c(3:12)], refMZ = stddata, feature.eval.result = feat.eval.result,
mzthresh = refMZ.mz.diff, timethresh = refMZ.time.diff,
outloc = subdir4a, folderheader = "raw")
if (is.na(sample_info_file) == FALSE) {
sampleid_mapping <- read.table(sample_info_file,
sep = "\t", header = TRUE)
sampleid_mapping[, 1] <- gsub(sampleid_mapping[,
1], pattern = filepattern, replacement = "")
cnames <- colnames(data_m)
cnames <- gsub(cnames, pattern = filepattern,
replacement = "")
colnames(data_m) <- cnames
batch_inf <- sampleid_mapping[, 3]
batch_labels <- as.factor(sampleid_mapping[,
3])
l1 <- levels(batch_labels)
pca.eval(X = X, samplelabels = batch_labels,
filename = "raw", ncomp = 5, center = TRUE,
scale = TRUE, legendlocation = "topright",
legendcex = 0.5, outloc = subdir4a)
Sys.sleep(1)
cnames = colnames(feat.eval.result)
if (dim(sampleid_mapping)[2] < 4) {
mod <- rep(1, dim(data_m)[2])
} else {
mod <- sampleid_mapping[, -c(1:3)]
}
dev.off()
Sys.sleep(1)
#######################################################################
cat("\n")
print(paste("**********Stage 4b: Processing data using ComBat and post-correction QC evaluation using ",
best_pair, " results*******", sep = ""))
cat("\n")
# pdf('Stage4b_QC_plots.pdf')
# pdf(file=paste('subdir4b/Stage4b_QC_plots.pdf',sep=''))
pdf(file = paste(subdir4b, "/Stage4b_QC_plots.pdf",
sep = ""))
##################################################################
adjdata <- try(sva::ComBat(dat = data_m,
batch = batch_inf, mod = mod, par.prior = TRUE),
silent = TRUE)
if (is(adjdata, "try-error")) {
data_m1 <- cbind(d1[, c(1:4)], data_m)
# print(adjdata)
print("sva::ComBat caused an error.")
print(adjdata)
print("Too many missing values can cause this error. Trying MetabComBat...")
# adjdata<-MetabComBat(dat=data_m1,saminfo=sampleid_mapping,par.prior=T,filter=F,write=F)
adjdata <- MetabComBat(dat = data_m1,
saminfo = sampleid_mapping, par.prior = T,
filter = F, write = F, prior.plots = F)
adjdata <- adjdata[, -c(1:4)]
}
maxint <- apply(adjdata, 1, function(x) {
max(x, na.rm = TRUE)
})
adjdata2 <- replace(as.matrix(adjdata),
which(is.na(adjdata) == TRUE), 0)
adjdata2 <- replace(as.matrix(adjdata2),
which(adjdata2 < 0), 0)
adjdata2 <- cbind(d1[, c(1:8)], adjdata2)
expression_xls <- paste(subdir4b,
"/ComBat_corrected_", best_pair,
"_feature_table.txt", sep = "")
feat.eval.result = evaluate.Features(adjdata2,
numreplicates = num_replicates,
min.samp.percent = 0.6, alignment.tool = "XCMS",
impute.bool = TRUE)
cnames = colnames(feat.eval.result)
feat.eval.result <- apply(feat.eval.result,
2, as.numeric)
feat.eval.result <- as.data.frame(feat.eval.result)
feat.eval.result.mat = cbind(adjdata2[,
c(1:4)], feat.eval.result)
numpeaks <- apply(adjdata2[, -c(1:8)],
1, countpeaks)
maxint <- apply(adjdata2[, -c(1:8)],
1, function(x) {
max(x, na.rm = TRUE)
})
adjdata2 <- cbind(adjdata2[, c(1:7)],
numpeaks, feat.eval.result.mat$numgoodsamples,
feat.eval.result.mat$median, feat.eval.result.mat$Qscore,
maxint, adjdata2[, -c(1:8)])
colnames(adjdata2) <- colnames(d1)
write.table(adjdata2, file = paste(subdir4b,
"/", expression_xls, sep = ""),
sep = "\t", row.names = FALSE)
# X<-t(adjdata2[maxint_ord[1:2000],-c(1:9)])
tic.eval(adjdata2[, -c(1:12)], outloc = subdir4b)
X <- t(adjdata2[, -c(1:12)])
X <- as.matrix(X)
X <- replace(as.matrix(X), which(is.na(X) ==
TRUE), 0)
pca.eval(X, samplelabels = batch_labels,
filename = "ComBat", ncomp = 5,
center = TRUE, scale = TRUE, legendlocation = "topright",
legendcex = 0.5, outloc = subdir4b)
# eval.reference.metabs(dataA=adjdata2,stdData=stddata,mzthresh=10,outloc=getwd(),folderheader='ComBat')
targeted_feat_combat <- eval.target.mz(dataA = adjdata2[,
-c(3:12)], refMZ = stddata, feature.eval.result = feat.eval.result,
mzthresh = refMZ.mz.diff, timethresh = refMZ.time.diff,
outloc = subdir4b, folderheader = "ComBat")
s4b <- "Stage 4b results: Batch-effect evaluation post ComBat and QC evaluation of targeted data post batch-effect correction"
sm <- rbind(sm, s4b)
} else {
adjdata2 = NULL
targeted_feat_combat <- NULL
}
dev.off()
metlin.res = {
}
kegg.res = {
}
# length(union_list[[num_pairs]]$mz
if (is.na(adduct.list) == FALSE) {
cat("\n")
print("*********Stage 5: Mapping m/z values to known metabolites using KEGG*********")
cat("\n")
annot.res <- feat.batch.annotation.KEGG(adjdata2,
mz.tolerance.dbmatch, adduct.list,
subdir5, numnodes = numnodes, syssleep = syssleep)
annotres_list <- annot.res
s5 <- "Stage 5 results: Annotation of features"
sm <- rbind(sm, s5)
}
print("*************Processing complete**********")
# print('*********Characterizing
# metabolites*********')
} else {
stop(paste("No files exist in", XCMS.outloc,
"Please check the input value for cdfloc",
sep = ""))
}
}
suppressWarnings(sink(file = NULL))
sm <- as.data.frame(sm)
colnames(sm) <- "Output_description"
setwd(xMSanalyzer.outloc)
write.table(sm, file = "Readme.txt", sep = "\t",
row.names = FALSE)
print(paste("Processing is complete. Program results can be found at: ",
xMSanalyzer.outloc, sep = ""))
return(list(XCMS.merged.res = union_list, XCMS.ind.res = data_rpd_all,
XCMS.ind.res.filtered = data_rpd, final.feat.table.annot = annotres_list,
feat.eval.ind = feateval_list, sample.eval.ind = rsqres_list,
final.feat.table.raw = d1, final.feat.table.combat = adjdata2,
final.targeted.feat.table.raw = targeted_feat_raw,
final.targeted.feat.table.combat = targeted_feat_combat))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.