#' Constructor of an AdductSpec object: deconvolutes spectra at the
#' MS2 and MS1 levels
#' @param mzXmlDir character a full path to a directory containing
#' either .mzXML or .mzML data
#' @param runOrder character a full path to a csv file specifying the
#' run order for each of the files.
#' The first column must contain the precise file name and the second
#' column an integer representing the precise run order.
#' @param nCores numeric the number of cores to use for parallel computation
#' (default = NULL, in which case a single core is used).
#' @param intStdMass numeric mass of the internal standard
#' (default = 834.77692, the mass of
#' the T3 peptide of HSA).
#' @param intStdPeakList numeric vector of masses for the internal
#' standard peaks
#' @param TICfilter numeric minimum total ion current of an MS/MS scan.
#' Any MS/MS scan below this value will be filtered out (default = 10000).
#' @param DNF numeric dynamic noise filter minimum signal-to-noise threshold
#' (default = 2), calculated as the ratio between the linear-model-predicted
#' intensity value and the actual intensity.
#' @param minInt numeric minimum intensity value (default = 300).
#' @param minPeaks numeric minimum number of signal peaks following dynamic
#' noise filtration (default = 5).
#' @param intStd_MaxMedRtDrift numeric the maximum retention time drift
#' window (in seconds) to identify internal standard MS/MS spectrum scans
#' (default = 360).
#' @param intStd_MaxPpmDev numeric the maximum mass accuracy window (in ppm)
#' to identify internal standard MS/MS spectrum scans (default = 200 ppm).
#' @param minSpecEx numeric the minimum percentage of the total ion
#' current explained
#' by the internal standard fragments (default = 40). Sometimes spectra are
#' not identified because this cutoff is set too high. If unexpected data
#' points have been interpolated, reduce this value.
#' @param outputPlotDir character string for the output directory for plots
#' (default = NULL, in which case no diagnostic plots or tables are saved).
#' @description reads mzXML files from a directory, extracts metadata,
#' groups ion signals with \code{\link{signalGrouping}}, dynamically filters
#' noise with \code{\link{dynamicNoiseFilter}} and identifies the precursor
#' ion charge state by isotopic pattern.
#' @examples
#' eh = ExperimentHub();
#' temp = query(eh, 'adductData');
#' temp[['EH1957']]; #first mzXML file
#' file.rename(cache(temp["EH1957"]), file.path(hubCache(temp),
#' 'ORB35017.mzXML'));
#' temp[['EH1958']]; #second mzXML file
#' file.rename(file.path(hubCache(temp),'1958'), file.path(hubCache(temp),
#' 'ORB35022.mzXML'));
#' adductSpecGen(mzXmlDir=system.file("extdata", package =
#' "adductData"), runOrder=paste0(system.file("extdata", package =
#' "adductomicsR"),'/runOrder.csv'), nCores=4,
#' intStdMass=834.77692,intStdPeakList=c(290.21, 403.30, 516.38,
#' 587.42,849.40, 884.92, 958.46, 993.97,1050.52, 1107.06, 1209.73,
#' 1337.79,1465.85), TICfilter=10000, DNF=2, minInt=300,
#' minPeaks=5,intStd_MaxMedRtDrift=360,
#' intStd_MaxPpmDev=200,minSpecEx=40,outputPlotDir=NULL)
#' @return AdductSpec object
adductSpecGen <- function(mzXmlDir = NULL,
runOrder = NULL,
nCores = NULL,
intStdMass = 834.77692,
intStdPeakList = c(
290.21,
403.30,
516.38,
587.42,
849.40,
884.92,
958.46,
993.97,
1050.52,
1107.06,
1209.73,
1337.79,
1465.85
),
TICfilter = 10000,
DNF = 2,
minInt = 300,
minPeaks = 5,
intStd_MaxMedRtDrift = 360,
intStd_MaxPpmDev = 200,
minSpecEx = 40,
outputPlotDir = NULL) {
# set global options
options(stringsAsFactors = FALSE)
# if is.null mzXmlDir select mzXML file containing directory
if (is.null(mzXmlDir)) {
message("Select your .mzXML data directory")
mzXmlDir <- tcltk::tk_choose.dir(default = "",
caption = "Select your .mzXML data directory")
}
# if is.null runOrder select .csv file
if (is.null(runOrder)) {
message("Select your .csv file containing the precise run order")
runOrder <-
tcltk::tclvalue(tcltk::tkgetOpenFile(
title = "select your run order csv file"))
}
if (is.character(runOrder)) {
if (!grepl('\\.csv$', runOrder)) {
stop('runOrder table file must be a .csv')
}
runOrder <-
as.data.frame(data.table::fread(
runOrder,
header = TRUE,
stringsAsFactors = FALSE
))
}
if (!is.data.frame(runOrder)) {
stop(
"runOrder table must be supplied as either a full path to a csv file ",
"or as a data.frame"
)
}
if (!is.character(runOrder[, 1])) {
stop('first column of runOrder table must contain the file name ',
'(character)')
}
if (!is.integer(runOrder[, 2])) {
stop('second column of runOrder table must contain a run order ',
'integer')
}
# identify all mzXML files in raw-data directory
MS2files <- list.files(
path = mzXmlDir,
pattern = "*.mzXML$",
full.names = TRUE,
recursive = FALSE
)
message(paste0(
length(MS2files),
" MS (.mzXML) files were detected within the directory..."
))
# sort ms2 files by run order info
idxTmp <-
match(gsub('\\.mzXML$', '', runOrder[, 1]),
gsub('\\.mzXML$',
'', basename(MS2files)))
if (any(is.na(idxTmp))) {
warning(
'files in runOrder table not detected in mzXmlDir:\n',
paste0(runOrder[is.na(idxTmp), 1], collapse = '\n'),
'\n',
immediate. = TRUE
)
runOrder <- runOrder[!is.na(idxTmp), , drop = FALSE]
idxTmp <- idxTmp[!is.na(idxTmp)]
}
MS2files <- MS2files[idxTmp]
MS2files <- MS2files[order(runOrder[, 2])]
# directory character lengths
dirCharLengths <-
vapply(dirname(MS2files), nchar, FUN.VALUE = numeric(1))
# error handling: .mzXML-containing directories must be at the same level
if (length(unique(dirCharLengths)) != 1) {
stop(
"directories containing .mzXML files are not level...e.g.\n",
names(dirCharLengths)[which.min(dirCharLengths)],
"\n",
names(dirCharLengths)[which.max(dirCharLengths)],
"\n"
)
}
nCores <- ifelse(is.null(nCores), 1, nCores)
if (nCores > 1) {
# start time to record computation time
pmt <- proc.time()
message(paste0("Starting SNOW cluster with ", nCores, " local
sockets..."))
cl <- parallel::makeCluster(nCores, outfile = '')
doSNOW::registerDoSNOW(cl)
message(
"Performing Savitzky-Golay baseline subtraction, dynamic noise ",
"filtration and AdductSpec object construction...\n"
)
progress <-
function(n)
message(paste0(
n,
' of ',
length(MS2files),
' complete (',
basename(MS2files)[n],
').\n'
))
opts <- list(progress = progress)
massDriftFiles <-
gsub('\\.mzXML$|\\.mzML$', '.massDrift.csv',
MS2files)
# foreach and dopar from foreach package
Results <-
foreach(
i = seq_along(MS2files),
.packages = c('mzR',
"RcppEigen", 'pracma'),
.options.snow = opts
) %dopar% {
Res.tmp <- spectraCreate(
MS2file = MS2files[i],
TICfilter = TICfilter,
DNF = DNF,
minInt = minInt,
minPeaks = minPeaks
)
# if present add mass drift files
if (file.exists(massDriftFiles[i])) {
massDriftTmp <-
read.csv(massDriftFiles[i],
header = TRUE,
stringsAsFactors = FALSE)
ms1Level <- Res.tmp[[1]]$msLevel == 1
ms1MassDrift <- massDriftTmp[ms1Level, , drop = FALSE]
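# scans with 0 in the third column of the drift table are treated as
# missing; the MS1-level ppm drift is then filled by cubic spline
# interpolation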
ms1MassDrift$ppmDrift[ms1MassDrift[, 3] == 0] <- NA
Res.tmp[[1]]$ppmDrift <- NA
Res.tmp[[1]]$ppmDrift[ms1Level] <-
zoo::na.spline(ms1MassDrift$ppmDrift)
rleLevel <- rle(Res.tmp[[1]]$msLevel == 2)
ms2Level <- which(ms1Level == FALSE)
ms1Level <- which(ms1Level)
precMassesIdx <- ms1Level[{
ms1Level + 1
} %in% ms2Level]
Res.tmp[[1]]$adjPrecursorMZ <- 0
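# propagate each MS1 scan's interpolated ppm drift to its following MS2
# scans, then correct each precursor m/z:
# adjPrecursorMZ = precursorMZ - (precursorMZ / 1E06) * ppmDrift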
ppmVals <-
mapply(
rep,
Res.tmp[[1]]$ppmDrift[precMassesIdx],
each = rleLevel$lengths[rleLevel$values == TRUE],
SIMPLIFY = FALSE
)
ppmVals <- do.call(c, ppmVals)
Res.tmp[[1]]$adjPrecursorMZ[ms2Level] <-
Res.tmp[[1]]$precursorMZ[ms2Level] -
{
{
Res.tmp[[1]]$precursorMZ[ms2Level] / 1E06
} * ppmVals
}
Res.tmp[[1]]$ppmDrift[ms2Level] <- ppmVals
massDriftTmp$precursorMZ <- Res.tmp[[1]]$precursorMZ
massDriftTmp$adjPrecursorMZ <- Res.tmp[[1]]$adjPrecursorMZ
massDriftTmp$ppmDrift <- Res.tmp[[1]]$ppmDrift
write.csv(massDriftTmp, massDriftFiles[i],
row.names = FALSE)
}
return(Res.tmp)
}
# stop SNOW cluster
parallel::stopCluster(cl)
proc.time() - pmt
} else {
# create list to store results
Results <- vector("list", length(MS2files))
massDriftFiles <-
gsub('\\.mzXML$|\\.mzML$', '.massDrift.csv',
MS2files)
for (i in seq_along(MS2files)) {
Res.tmp <- spectraCreate(
MS2file = MS2files[i],
TICfilter = TICfilter,
DNF = DNF,
minInt = minInt,
minPeaks = minPeaks
)
# if present add mass drift files
if (file.exists(massDriftFiles[i])) {
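# apply the same per-scan mass drift correction as in the parallel branch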
massDriftTmp <-
read.csv(massDriftFiles[i],
header = TRUE,
stringsAsFactors = FALSE)
ms1Level <- Res.tmp[[1]]$msLevel == 1
ms1MassDrift <- massDriftTmp[ms1Level, , drop = FALSE]
ms1MassDrift$ppmDrift[ms1MassDrift[, 3] == 0] <- NA
Res.tmp[[1]]$ppmDrift <- NA
Res.tmp[[1]]$ppmDrift[ms1Level] <-
zoo::na.spline(ms1MassDrift$ppmDrift)
rleLevel <- rle(Res.tmp[[1]]$msLevel == 2)
ms2Level <- which(ms1Level == FALSE)
ms1Level <- which(ms1Level)
precMassesIdx <- ms1Level[{
ms1Level + 1
} %in% ms2Level]
Res.tmp[[1]]$adjPrecursorMZ <- 0
ppmVals <-
mapply(
rep,
Res.tmp[[1]]$ppmDrift[precMassesIdx],
each = rleLevel$lengths[rleLevel$values == TRUE],
SIMPLIFY = FALSE
)
ppmVals <- do.call(c, ppmVals)
Res.tmp[[1]]$adjPrecursorMZ[ms2Level] <-
Res.tmp[[1]]$precursorMZ[ms2Level] -
{
{
Res.tmp[[1]]$precursorMZ[ms2Level] / 1E06
} * ppmVals
}
Res.tmp[[1]]$ppmDrift[ms2Level] <- ppmVals
massDriftTmp$precursorMZ <- Res.tmp[[1]]$precursorMZ
massDriftTmp$adjPrecursorMZ <- Res.tmp[[1]]$adjPrecursorMZ
massDriftTmp$ppmDrift <- Res.tmp[[1]]$ppmDrift
write.csv(massDriftTmp, massDriftFiles[i], row.names = FALSE)
}
Results[[i]] <- Res.tmp
}
}
# add MS2 file names to results list
names(Results) <- basename(MS2files)
Results <- unlist(Results, recursive = FALSE)
AdductSpecTmp <- new("AdductSpec")
# add in MS2 spectra
adductMS2spec(AdductSpecTmp) <-
Results[grep('MS2spectra', names(Results))]
# rbind meta data and add in file names
metaDataTmp <- Results[grep('metaData', names(Results))]
fileNamesTmp <-
rep(gsub('\\.metaData$', '', names(metaDataTmp)),
vapply(metaDataTmp, nrow, numeric(1)))
metaDataTmp <- do.call(rbind, metaDataTmp)
metaDataTmp <- data.frame(mzXMLFile = fileNamesTmp,
metaDataTmp,
stringsAsFactors = FALSE)
metaData(AdductSpecTmp) <- metaDataTmp
Specfile.paths(AdductSpecTmp) <- MS2files
Parameters(AdductSpecTmp) <-
data.frame(
nCores,
ifelse(is.null(intStdMass),
NA, intStdMass),
TICfilter,
DNF,
minInt,
minPeaks,
intStd_MaxMedRtDrift,
intStd_MaxPpmDev,
stringsAsFactors = FALSE
)
# if requested, identify the internal standard MS/MS scans
if (!is.null(intStdMass)) {
message(
'Identifying internal standard MS/MS spectra within an m/z window of ',
intStd_MaxPpmDev,
' ppm and a retention time window of ',
round(intStd_MaxMedRtDrift / 60, 2),
' minutes...\n'
)
intStdScans <-
peakListId(
AdductSpecTmp,
peakList = intStdPeakList,
minPeaksId = 7,
minSpecEx = minSpecEx,
outputPlotDir = NULL,
exPeakMass = intStdMass,
maxRtDrift = intStd_MaxMedRtDrift,
maxPpmDev = intStd_MaxPpmDev,
closestMassByFile = FALSE
)
# remove duplicates
intStdScans <-
intStdScans[!duplicated(intStdScans$mzXMLFile), ,
drop = FALSE]
# take the mean observed precursor m/z across scans within 5 ppm of the
# theoretical internal standard mass, then recalculate each file's ppm
# deviation from this observed mass
ppmDiffFile <-
((intStdScans$precursorMz - intStdMass) / intStdMass) * 1E06
obsStdMass <-
mean(intStdScans$precursorMz[abs(ppmDiffFile) <= 5])
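# per-file mass drift in ppm relative to the consensus observed mass:
# ppm = ((precursorMz - obsStdMass) / obsStdMass) * 1E06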
ppmDiffFile <-
((intStdScans$precursorMz - obsStdMass) / obsStdMass) * 1E06
medRtDiffFile <- intStdScans$retentionTime -
median(intStdScans$retentionTime)
intStdScans$IntStd_ppmDev <- ppmDiffFile
intStdScans$IntStd_medRtDev <- medRtDiffFile / 60
# match names with file path names
runOrderTmp <- match(intStdScans$mzXMLFile,
basename(Specfile.paths(AdductSpecTmp)))
# sort table
intStdScans <- intStdScans[order(runOrderTmp), , drop = FALSE]
nFilesTmp <- seq_along(Specfile.paths(AdductSpecTmp))
missingFiles <- setdiff(nFilesTmp, runOrderTmp)
# if necessary, cubic spline interpolate missing files
# with zoo::na.spline
if (length(missingFiles) > 0) {
message(
'cubic spline interpolating ',
length(missingFiles),
' missing values...\n'
)
# add NA for missing files
ppmDiffFile <- c(ppmDiffFile, rep(NA, length(missingFiles)))
medRtDiffFile <- c(medRtDiffFile, rep(NA, length(missingFiles)))
runOrderTmp <- c(runOrderTmp, missingFiles)
# sort by runorder
ppmDiffFile <- ppmDiffFile[order(runOrderTmp)]
medRtDiffFile <- medRtDiffFile[order(runOrderTmp)]
runOrderTmp <- runOrderTmp[order(runOrderTmp)]
# interpolate ppmdiff and rtdiff
ppmDiffFile <- zoo::na.spline(ppmDiffFile)
medRtDiffFile <- zoo::na.spline(medRtDiffFile)
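# files with spline-imputed drift values are highlighted in red in the
# diagnostic plots below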
}
# add to meta data
MD <- as.data.frame(metaData(AdductSpecTmp))
indxTmp <-
match(MD$mzXMLFile, basename(Specfile.paths(AdductSpecTmp)))
metaData(AdductSpecTmp)[, 'intStdRtDrift'] <-
medRtDiffFile[indxTmp]
metaData(AdductSpecTmp)[, 'intStdPpmDrift'] <-
ppmDiffFile[indxTmp]
# save plot
if (!is.null(outputPlotDir)) {
png(
paste0(outputPlotDir,
'/internalStandard_plots.png'
),
width = 961,
height = 587
)
par(mfrow = c(2, 2))
plot(runOrderTmp,
medRtDiffFile / 60,
xlab = 'run order',
ylab = 'deviation from median Rt (mins)')
# highlight imputed values
if (length(missingFiles) > 0) {
points(missingFiles,
medRtDiffFile[missingFiles] / 60,
col = 'red',
pch = 19)
}
plot(runOrderTmp,
ppmDiffFile,
xlab = 'run order',
ylab = 'ppm difference int. std.')
# highlight imputed values
if (length(missingFiles) > 0) {
points(missingFiles,
ppmDiffFile[missingFiles],
col = 'red',
pch = 19)
}
plot(
ppmDiffFile,
medRtDiffFile / 60,
ylab = 'deviation from median Rt (mins)',
xlab = 'ppm difference int. std.',
main = paste(
'medRT dev range:',
round(min(medRtDiffFile / 60), 2),
'-',
round(max(medRtDiffFile / 60), 2),
'ppm dev range:',
round(min(ppmDiffFile), 2),
'-',
round(max(ppmDiffFile), 2)
)
)
if (length(missingFiles) > 0) {
points(ppmDiffFile[missingFiles],
medRtDiffFile[missingFiles] / 60,
col = 'red',
pch = 19)
}
plot(
c(0, 1),
c(0, 1),
ann = FALSE,
bty = 'n',
type = 'n',
xaxt = 'n',
yaxt = 'n'
)
text(
x = 0.5,
y = 0.5,
paste0(
"red data-points: cubic spline interpolated
files (n=",
length(missingFiles),
") where"
),
cex = 1.3,
col = "red"
)
text(
x = 0.5,
y = 0.3,
paste("internal standard MS2 spectrum were not
identified"),
cex = 1.3,
col = "red"
)
dev.off()
# write table
intStdScans$peakId <- NULL
intStdScans$peakNo <- NULL
intStdScans$plotFile <- paste0(ifelse(
is.null(outputPlotDir),
paste0(getwd(), '/'),
outputPlotDir
),
intStdScans$name,
'.png')
intStdScans$retentionTime <- intStdScans$retentionTime / 60
write.csv(intStdScans,
paste0(
ifelse(
is.null(outputPlotDir),
paste0(getwd(), '/'),
outputPlotDir
),
'/internalStd_scansTable.csv'
),
row.names = FALSE)
}
message('...DONE')
}
return(AdductSpecTmp)
} # end function
setMethod("show", "AdductSpec", function(object) {
if (length(Specfile.paths(object)) > 0) {
message(
"An \"AdductSpec\" class object derived from ",
length(Specfile.paths(object)),
" MS2 files\n\n"
)
message(
"Consisting of:\n",
sum(metaData(object)[, 'msLevel'] == 1),
" raw MS1 scans\n",
sum(metaData(object)[, 'msLevel'] == 2),
" raw MS2 scans, of which ",
sum(metaData(object)[, 'aboveMinPeaks'] == TRUE),
" scans were above the noise following filtration.\n\n"
)
message(
"m/z range: ",
round(metaData(object)[, 'lowMZ'][1], digits = 3),
" - ",
round(metaData(object)[, 'highMZ'][1], digits = 3),
"\n\n"
)
if (length(groupMS2spec(object)) > 0) {
message('Inter-file grouping:')
message(
length(groupMS2spec(object)),
' inter-file composite spectra groups were identified.\n'
)
}
message("Adductome identification:")
# message()
memsize <- object.size(object)
message("Memory usage:", signif(memsize / 2 ^ 20, 3), "MB")
} else {
message("A new empty\"AdductSpec\" class object")
}
})
# set method concatenate
setMethod("c", signature(x = "AdductSpec"), function(x, ...) {
elements = list(...)
# error handling: check that all elements are AdductSpec class objects
if (any(vapply(elements, function(ele)
is(ele, 'AdductSpec'),
FUN.VALUE = logical(1)) == FALSE)) {
stop('all elements must be an AdductSpec class object')
}
emptyAdductSpec <- new('AdductSpec')
# bind together results
# do not include any group info or other information
for (i in seq_along(elements)) {
adductMS2spec(emptyAdductSpec) <- c(adductMS2spec(emptyAdductSpec),
adductMS2spec(elements[[i]]))
metaDataTmp <- metaData(elements[[i]])
#metaDataTmp$aboveMinPeaks <- NULL
metaDataTmp$msPrecursor_group <- NULL
metaDataTmp$interMSMSrtGroups <- NULL
metaDataTmp$predRtLoess <- NULL
metaDataTmp$MS2groupFreq <- NULL
metaDataTmp$MS2groupFreqAbove <- NULL
metaData(emptyAdductSpec) <-
rbind(metaData(emptyAdductSpec), metaDataTmp)
Specfile.paths(emptyAdductSpec) <-
c(Specfile.paths(emptyAdductSpec),
Specfile.paths(elements[[i]]))
}
message(
'Grouping, retention time correction and composite spectra ',
'identification must be repeated in the concatenated "AdductSpec" ',
'class object...\n'
)
return(emptyAdductSpec)
}) # end function
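# Minimal usage sketch for concatenation (illustrative only; assumes
# 'specA' and 'specB' are AdductSpec objects returned by separate
# adductSpecGen() calls):
# combinedSpec <- c(specA, specB)
# show(combinedSpec)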