# Content of this script
# 1. raw data function - xcms
# 2. Peaks Peaking Function - parameters optimization
# 3. MS/MS processing - (moved from preproc_utils)
# 4. Peak Annotation - CAMERA
# 5. MS File Reading - MSnbase
# --------------------1. Raw spectra processing_Section using the xcms way---------------------------------------------
#'PerformPeakPicking
#'@description PerformPeakPicking
#'@param object the raw data object read by ImportRawMSData function.
#'@param param param list generated by updateRawSpectraParam function.
#'@param BPPARAM parallel method used for data processing. Default is bpparam().
#'@author Zhiqiang Pang, Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@ref Smith, C.A. et al. 2006. Analytical Chemistry, 78, 779-787
#'
PerformPeakPicking<-function(object,param,BPPARAM = bpparam()){
object_mslevel <- MSnbase::filterMsLevel(
MSnbase::selectFeatureData(object,
fcol = c(MSnbase:::.MSnExpReqFvarLabels,
"centroided")), msLevel. = 1)
if (length(object_mslevel) == 0)
AddErrMsg("Empty spectra present to perform !")
# Splite the MS data
object_mslevel <- lapply(1:length(fileNames(object_mslevel)),
FUN = filterFile,
object = object_mslevel)
# Peak picking runnning - centWave mode
if (param$Peak_method == "centWave")
resList <- bplapply(object_mslevel,
FUN = PeakPicking_centWave_slave, # slave Function of centWave
param = param, BPPARAM = BPPARAM)
# Peak picking runnning - MatchedFilter mode
if (param$Peak_method == "matchedFilter")
resList <- bplapply(object_mslevel,
FUN = PeakPicking_MatchedFilter_slave, # slave Function of MatchedFilter
param = param, BPPARAM = BPPARAM)
if (param$Peak_method != "matchedFilter" && param$Peak_method != "centWave"){
stop("Only 'centWave' and 'MatchedFilter' are suppoted now !")
}
# Picking Cluster
##### PEAK SUMMARY------------
pks <- vector("list", length(resList))
for (i in 1:length(resList)) {
n_pks <- nrow(resList[[i]])
if (is.null(n_pks))
n_pks <- 0
if (n_pks == 0) {
pks[[i]] <- NULL
warning("No peaks found in sample number ", i, ".")
} else {
pks[[i]] <- cbind(resList[[i]], sample = rep.int(i, n_pks))
}
}
pks <- do.call(rbind, pks)
# Make the chrompeaks embeded
newFd <- list()
#newFd@.xData <- as.environment(as.list(object@msFeatureData, all.names = TRUE))
rownames(pks) <- sprintf(paste0("CP", "%0", ceiling(log10(nrow(pks) + 1L)), "d"),
seq(from = 1, length.out = nrow(pks)))
newFd$chromPeaks <- pks
newFd$chromPeakData <- S4Vectors::DataFrame(ms_level = rep(1L, nrow(pks)), is_filled = rep(FALSE, nrow(pks)),
row.names = rownames(pks))
## mSet Generation
mSet<-list()
mSet$msFeatureData <- newFd
mSet$onDiskData <- object
return(mSet)
}
#'PeakPicking_centWave_slave
#'@description PeakPicking_centWave_slave
#'@author Zhiqiang Pang, Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@ref Smith, C.A. et al. 2006. Analytical Chemistry, 78, 779-787
#'
PeakPicking_centWave_slave <- function(x,param){
if (class(x)=="OnDiskMSnExp"){ # for raw data processing
scan.set <- MSnbase::spectra(x, BPPARAM = SerialParam());
rt <- MSnbase::rtime(x);
} else if (class(x) == "list") { # for parameters optimization
scan.set <- x;
rt <- unlist(lapply(x, MSnbase::rtime), use.names = FALSE);
}
mzs <- lapply(scan.set, MSnbase::mz)
vals_per_spect <- lengths(mzs, FALSE)
#if (any(vals_per_spect == 0))
# mzs <- mzs[-which(vals_per_spect == 0)];
# x <- x[-which(vals_per_spect == 0)]; # Cannot be so easy !
# warning("Found empty spectra Scan. Filtered Automatically !")
mz = unlist(mzs, use.names = FALSE);
int = unlist(lapply(scan.set, MSnbase::intensity), use.names = FALSE);
valsPerSpect = vals_per_spect;
scantime = rt;
###### centWaveCore Function
valCount <- cumsum(valsPerSpect)
scanindex <- as.integer(c(0, valCount[-length(valCount)])) ## Get index vector for C calls
if (!is.double(mz))
mz <- as.double(mz)
if (!is.double(int))
int <- as.double(int)
## Fix the mzCenterFun
#mzCenterFun <- paste("mzCenter",
# gsub(mzCenterFun, pattern = "mzCenter.",
# replacement = "", fixed = TRUE), sep=".")
basenames <- c("mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax",
"into", "intb", "maxo", "sn")
verbosenames <- c("egauss", "mu", "sigma", "h", "f", "dppm", "scale",
"scpos", "scmin", "scmax", "lmin", "lmax")
## Peak width: seconds to scales
peakwidth <- param$peakwidth
scalerange <- round((peakwidth / mean(diff(scantime))) / 2)
if (length(z <- which(scalerange == 0)))
scalerange <- scalerange[-z]
if (length(scalerange) < 1) {
warning("No scales? Please check peak width!")
if (param$verboseColumns) {
nopeaks <- matrix(nrow = 0, ncol = length(basenames) +
length(verbosenames))
colnames(nopeaks) <- c(basenames, verbosenames)
} else {
nopeaks <- matrix(nrow = 0, ncol = length(basenames))
colnames(nopeaks) <- c(basenames)
}
return(invisible(nopeaks))
}
if (length(scalerange) > 1){
scales <- seq(from = scalerange[1], to = scalerange[2], by = 2)
} else{
scales <- scalerange
}
minPeakWidth <- scales[1]
noiserange <- c(minPeakWidth * 3, max(scales) * 3)
maxGaussOverlap <- 0.5
minPtsAboveBaseLine <- max(4, minPeakWidth - 2)
minCentroids <- minPtsAboveBaseLine
scRangeTol <- maxDescOutlier <- floor(minPeakWidth / 2)
scanrange <- c(1, length(scantime))
## If no ROIs are supplied then search for them.
roiList<-list()
if (length(roiList) == 0) {
message("ROI searching under ", param$ppm, " ppm ... ", appendLF = FALSE)
withRestarts(
tryCatch({
tmp <- capture.output(
roiList <- .Call(C_findmzROI,
mz, int, scanindex,
as.double(c(0.0, 0.0)),
as.integer(scanrange),
as.integer(length(scantime)),
as.double(param$ppm * 1e-6),
as.integer(minCentroids),
as.integer(param$prefilter),
as.integer(param$noise),PACKAGE = "MetaboAnalystR")
#roiList <- findmzROI (mz, int, scanindex,scanrange,scantime,param$ppm,
# minCentroids,param$prefilter,param$noise)
)
},
error = function(e){
if (grepl("m/z sort assumption violated !", e$message)) {
invokeRestart("fixSort")
} else {
simpleError(e)
}
}),
fixSort = function() {
## Force ordering of values within spectrum by mz:
## o split values into a list -> mz per spectrum, intensity per
## spectrum.
## o define the ordering.
## o re-order the mz and intensity and unlist again.
## Note: the Rle split is faster than the "conventional" factor split.
splitF <- Rle(1:length(valsPerSpect), valsPerSpect)
mzl <- as.list(S4Vectors::split(mz, f = splitF))
oidx <- lapply(mzl, order)
mz <<- unlist(mapply(mzl, oidx, FUN = function(y, z) {
return(y[z])
}, SIMPLIFY = FALSE, USE.NAMES = FALSE), use.names = FALSE)
int <<- unlist(mapply(as.list(split(int, f = splitF)), oidx,
FUN=function(y, z) {
return(y[z])
}, SIMPLIFY = FALSE, USE.NAMES = FALSE),
use.names = FALSE)
rm(mzl)
rm(splitF)
tmp <- capture.output(
roiList <<- .Call(C_findmzROI,
mz, int, scanindex,
as.double(c(0.0, 0.0)),
as.integer(scanrange),
as.integer(length(scantime)),
as.double(param$ppm * 1e-6),
as.integer(minCentroids),
as.integer(param$prefilter),
as.integer(param$noise),PACKAGE = "MetaboAnalystR")
)
}
)
}
## Second stage: process the ROIs
peaklist <- list();
Nscantime <- length(scantime)
lf <- length(roiList)
## print('\n Detecting chromatographic peaks ... \n % finished: ')
## lp <- -1
message("Detecting chromatographic peaks in ", length(roiList),
" regions of interest ...", appendLF = FALSE)
roiScales = NULL
for (f in 1:lf) {
feat <- roiList[[f]]
N <- feat$scmax - feat$scmin + 1
peaks <- peakinfo <- NULL
mzrange <- c(feat$mzmin, feat$mzmax)
sccenter <- feat$scmin[1] + floor(N/2) - 1
scrange <- c(feat$scmin, feat$scmax)
## scrange + noiserange, used for baseline detection and wavelet analysis
sr <- c(max(scanrange[1], scrange[1] - max(noiserange)),
min(scanrange[2], scrange[2] + max(noiserange)))
eic <- .Call(C_getEIC, mz, int, scanindex, as.double(mzrange),
as.integer(sr), as.integer(length(scanindex)),PACKAGE = "MetaboAnalystR")
## eic <- rawEIC(object,mzrange=mzrange,scanrange=sr)
d <- eic$intensity
td <- sr[1]:sr[2]
scan.range <- c(sr[1], sr[2])
## original mzROI range
idxs <- which(eic$scan %in% seq(scrange[1], scrange[2]))
mzROI.EIC <- list(scan=eic$scan[idxs], intensity=eic$intensity[idxs])
## mzROI.EIC <- rawEIC(object,mzrange=mzrange,scanrange=scrange)
omz <- .Call(C_getMZ, mz, int, scanindex, as.double(mzrange),
as.integer(scrange), as.integer(length(scantime)))
## omz <- rawMZ(object,mzrange=mzrange,scanrange=scrange)
if (all(omz == 0)) {
warning("centWave: no peaks found in ROI.")
next
}
od <- mzROI.EIC$intensity
otd <- mzROI.EIC$scan
if (all(od == 0)) {
warning("centWave: no peaks found in ROI.")
next
}
## scrange + scRangeTol, used for gauss fitting and continuous
## data above 1st baseline detection
ftd <- max(td[1], scrange[1] - scRangeTol) : min(td[length(td)],
scrange[2] + scRangeTol)
fd <- d[match(ftd, td)]
## 1st type of baseline: statistic approach
if (N >= 10*minPeakWidth) {
## in case of very long mass trace use full scan range
## for baseline detection
noised <- .Call(C_getEIC, mz, int, scanindex, as.double(mzrange),
as.integer(scanrange), as.integer(length(scanindex)))$intensity
## noised <- rawEIC(object,mzrange=mzrange,scanrange=scanrange)$intensity
} else {
noised <- d
}
## 90% trimmed mean as first baseline guess
gz <- which(noised > 0);
if (length(gz) < 3*minPeakWidth){
noise <- mean(noised)
} else {
noise <- mean(noised[gz], trim=0.05)
}
firstBaselineCheck = TRUE
## any continuous data above 1st baseline ?
if (firstBaselineCheck &
!continuousPtsAboveThreshold(fd, threshold = noise,
num = minPtsAboveBaseLine))
next
## 2nd baseline estimate using not-peak-range
lnoise <- getLocalNoiseEstimate(d, td, ftd, noiserange, Nscantime,
threshold = noise,
num = minPtsAboveBaseLine)
## Final baseline & Noise estimate
baseline <- max(1, min(lnoise[1], noise))
sdnoise <- max(1, lnoise[2])
sdthr <- sdnoise * param$snthresh
## is there any data above S/N * threshold ?
if (!(any(fd - baseline >= sdthr)))
next
wCoefs <- MSW.cwt(d, scales = scales, wavelet = 'mexh')
if (!(!is.null(dim(wCoefs)) && any(wCoefs- baseline >= sdthr)))
next
if (td[length(td)] == Nscantime) ## workaround, localMax fails otherwise
wCoefs[nrow(wCoefs),] <- wCoefs[nrow(wCoefs) - 1, ] * 0.99
localMax <- MSW.getLocalMaximumCWT(wCoefs)
rL <- MSW.getRidge(localMax)
wpeaks <- sapply(rL,
function(x) {
w <- min(1:length(x),ncol(wCoefs))
any(wCoefs[x,w]- baseline >= sdthr)
})
if (any(wpeaks)) {
wpeaksidx <- which(wpeaks)
## check each peak in ridgeList
for (p in 1:length(wpeaksidx)) {
opp <- rL[[wpeaksidx[p]]]
pp <- unique(opp)
if (length(pp) >= 1) {
dv <- td[pp] %in% ftd
if (any(dv)) { ## peaks in orig. data range
## Final S/N check
if (any(d[pp[dv]]- baseline >= sdthr)) {
## if(!is.null(roiScales)) {
## allow roiScales to be a numeric of length 0
if(length(roiScales) > 0) {
## use given scale
best.scale.nr <- which(scales == roiScales[[f]])
if(best.scale.nr > length(opp))
best.scale.nr <- length(opp)
} else {
## try to decide which scale describes the peak best
inti <- numeric(length(opp))
irange <- rep(ceiling(scales[1]/2), length(opp))
for (k in 1:length(opp)) {
kpos <- opp[k]
r1 <- ifelse(kpos - irange[k] > 1,
kpos-irange[k], 1)
r2 <- ifelse(kpos + irange[k] < length(d),
kpos + irange[k], length(d))
inti[k] <- sum(d[r1:r2])
}
maxpi <- which.max(inti)
if (length(maxpi) > 1) {
m <- wCoefs[opp[maxpi], maxpi]
bestcol <- which(m == max(m),
arr.ind = TRUE)[2]
best.scale.nr <- maxpi[bestcol]
} else best.scale.nr <- maxpi
}
best.scale <- scales[best.scale.nr]
best.scale.pos <- opp[best.scale.nr]
pprange <- min(pp):max(pp)
## maxint <- max(d[pprange])
lwpos <- max(1,best.scale.pos - best.scale)
rwpos <- min(best.scale.pos + best.scale, length(td))
p1 <- match(td[lwpos], otd)[1]
p2 <- match(td[rwpos], otd)
p2 <- p2[length(p2)]
if (is.na(p1)) p1 <- 1
if (is.na(p2)) p2 <- N
mz.value <- omz[p1:p2]
mz.int <- od[p1:p2]
maxint <- max(mz.int)
## re-calculate m/z value for peak range
mzrange <- range(mz.value)
#mzmean <- do.call(mzCenterFun,
# list(mz = mz.value,
# intensity = mz.int))
mzmean <- weighted.mean(mz.value, mz.int)
## Compute dppm only if needed
dppm <- NA
if (param$verboseColumns) {
if (length(mz.value) >= (minCentroids + 1)) {
dppm <- round(min(running(abs(diff(mz.value)) /
(mzrange[2] * 1e-6),
fun = max,
width = minCentroids)))
} else {
dppm <- round((mzrange[2] - mzrange[1]) /
(mzrange[2] * 1e-6))
}
}
peaks <- rbind(peaks,
c(mzmean,mzrange, ## mz
NA, NA, NA, ## rt, rtmin, rtmax,
NA, ## intensity (sum)
NA, ## intensity (-bl)
maxint, ## max intensity
round((maxint - baseline) / sdnoise), ## S/N Ratio
NA, ## Gaussian RMSE
NA,NA,NA, ## Gaussian Parameters
f, ## ROI Position
dppm, ## max. difference between the [minCentroids] peaks in ppm
best.scale, ## Scale
td[best.scale.pos],
td[lwpos],
td[rwpos], ## Peak positions guessed from the wavelet's (scan nr)
NA, NA)) ## Peak limits (scan nr)
peakinfo <- rbind(peakinfo,
c(best.scale, best.scale.nr,
best.scale.pos, lwpos, rwpos))
## Peak positions guessed from the wavelet's
}
}
}
} ##for
} ## if
## postprocessing
if (!is.null(peaks)) {
colnames(peaks) <- c(basenames, verbosenames)
colnames(peakinfo) <- c("scale", "scaleNr", "scpos",
"scmin", "scmax")
for (p in 1:dim(peaks)[1]) {
## find minima (peak boundaries), assign rt and intensity values
if (param$integrate == 1) {
lm <- descendMin(wCoefs[, peakinfo[p, "scaleNr"]],
istart = peakinfo[p, "scpos"])
gap <- all(d[lm[1]:lm[2]] == 0) # looks like we got stuck in a gap right in the middle of the peak
if ((lm[1] == lm[2]) || gap) # fall-back
lm <- descendMinTol(
d, startpos = c(peakinfo[p, "scmin"],
peakinfo[p, "scmax"]),
maxDescOutlier)
} else {
lm <- descendMinTol(d, startpos = c(peakinfo[p, "scmin"],
peakinfo[p, "scmax"]),
maxDescOutlier)
}
## Narrow peak rt boundaries by removing values below threshold
lm <- .narrow_rt_boundaries(lm, d)
lm_seq <- lm[1]:lm[2]
pd <- d[lm_seq]
peakrange <- td[lm]
peaks[p, "rtmin"] <- scantime[peakrange[1]]
peaks[p, "rtmax"] <- scantime[peakrange[2]]
peaks[p, "maxo"] <- max(pd)
pwid <- (scantime[peakrange[2]] - scantime[peakrange[1]]) /
(peakrange[2] - peakrange[1])
if (is.na(pwid))
pwid <- 1
peaks[p, "into"] <- pwid * sum(pd)
db <- pd - baseline
peaks[p, "intb"] <- pwid * sum(db[db > 0])
peaks[p, "lmin"] <- lm[1]
peaks[p, "lmax"] <- lm[2]
if (param$fitgauss) {
## perform gaussian fits, use wavelets for inital parameters
td_lm <- td[lm_seq]
md <- max(pd)
d1 <- pd / md ## normalize data for gaussian error calc.
pgauss <- fitGauss(td_lm, pd,
pgauss = list(mu = peaks[p, "scpos"],
sigma = peaks[p, "scmax"] -
peaks[p, "scmin"],
h = peaks[p, "maxo"]))
rtime <- peaks[p, "scpos"]
if (!any(is.na(pgauss)) && all(pgauss > 0)) {
gtime <- td[match(round(pgauss$mu), td)]
if (!is.na(gtime)) {
rtime <- gtime
peaks[p, "mu"] <- pgauss$mu
peaks[p, "sigma"] <- pgauss$sigma
peaks[p, "h"] <- pgauss$h
peaks[p,"egauss"] <- sqrt(
(1 / length(td_lm)) *
sum(((d1 - gauss(td_lm, pgauss$h / md,
pgauss$mu, pgauss$sigma))^2)))
}
}
peaks[p, "rt"] <- scantime[rtime]
## avoid fitting side effects
if (peaks[p, "rt"] < peaks[p, "rtmin"])
peaks[p, "rt"] <- scantime[peaks[p, "scpos"]]
} else
peaks[p, "rt"] <- scantime[peaks[p, "scpos"]]
}
peaks <- joinOverlappingPeaks(td, d, otd, omz, od, scantime,
scan.range, peaks, maxGaussOverlap)
}
if (!is.null(peaks)) {
peaklist[[length(peaklist) + 1]] <- peaks
}
} ## f
if (length(peaklist) == 0) {
warning("No peaks found!")
if (param$verboseColumns) {
nopeaks <- matrix(nrow = 0, ncol = length(basenames) +
length(verbosenames))
colnames(nopeaks) <- c(basenames, verbosenames)
} else {
nopeaks <- matrix(nrow = 0, ncol = length(basenames))
colnames(nopeaks) <- c(basenames)
}
message(" FAIL: none found!")
return(nopeaks)
}
p <- do.call(rbind, peaklist)
if (!param$verboseColumns)
p <- p[, basenames, drop = FALSE]
uorder <- order(p[, "into"], decreasing = TRUE)
pm <- as.matrix(p[,c("mzmin", "mzmax", "rtmin", "rtmax"), drop = FALSE])
uindex <- rectUnique(pm, uorder, param$mzdiff, ydiff = -0.00001) ## allow adjacent peaks
pr <- p[uindex, , drop = FALSE]
message(" OK: ", nrow(pr), " found.")
return(pr)
}
#'PeakPicking_MatchedFilter_slave
#'@description PeakPicking_MatchedFilter_slave
#'@author Zhiqiang Pang, Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@ref Smith, C.A. et al. 2006. Analytical Chemistry, 78, 779-787
#'
PeakPicking_MatchedFilter_slave <- function(x,param){
if (class(x)=="OnDiskMSnExp"){ # for raw data processing
scan.set <- MSnbase::spectra(x, BPPARAM = SerialParam());
rt <- MSnbase::rtime(x);
} else if (class(x) == "list") { # for parameters optimization
scan.set <- x;
rt <- unlist(lapply(x, MSnbase::rtime), use.names = FALSE);
}
## 1. data & Param preparation
mzs <- lapply(scan.set, MSnbase::mz)
valsPerSpect <- lengths(mzs, FALSE)
mz = unlist(mzs, use.names = FALSE);
int = unlist(lapply(scan.set, MSnbase::intensity), use.names = FALSE);
scantime = rt;
mrange <- range(mz[mz > 0])
mass <- seq(floor(mrange[1] / param$binSize) * param$binSize,
ceiling(mrange[2] / param$binSize) * param$binSize,
by = param$binSize)
impute <- param$impute;
baseValue <- param$baseValue;
distance <- param$distance;
steps <- param$steps;
sigma <- param$sigma;
snthresh <-param$snthresh;
max <- param$max;
mzdiff <- param$mzdiff;
binSize <- param$binSize;
index <- param$index;
## 2. Binning the data.
## Create and translate settings for binYonX
toIdx <- cumsum(valsPerSpect)
fromIdx <- c(1L, toIdx[-length(toIdx)] + 1L)
shiftBy = TRUE
binFromX <- min(mass)
binToX <- max(mass)
## binSize <- (binToX - binFromX) / (length(mass) - 1)
## brks <- seq(binFromX - binSize/2, binToX + binSize/2, by = binSize)
brks <- breaks_on_nBins(fromX = binFromX, toX = binToX,
nBins = length(mass), shiftByHalfBinSize = TRUE)
binRes <- binYonX(mz, int,
breaks = brks,
fromIdx = fromIdx,
toIdx = toIdx,
baseValue = ifelse(impute == "none", yes = 0, no = NA),
sortedX = TRUE,
returnIndex = TRUE
)
if (length(toIdx) == 1)
binRes <- list(binRes)
bufMax <- do.call(cbind, lapply(binRes, function(z) return(z$index)))
bin_size <- binRes[[1]]$x[2] - binRes[[1]]$x[1]
## Missing value imputation
if (missing(baseValue))
baseValue <- numeric()
if (length(baseValue) == 0)
baseValue <- min(int, na.rm = TRUE) / 2
if (missing(distance))
distance <- numeric()
if (length(distance) == 0)
distance <- floor(0.075 / bin_size)
binVals <- lapply(binRes, function(z) {
return(imputeLinInterpol(z$y, method = impute, distance = distance,
noInterpolAtEnds = TRUE,
baseValue = baseValue))
})
buf <- do.call(cbind, binVals)
bufidx <- 1L:length(mass)
lookahead <- steps-1
lookbehind <- 1
N <- nextn(length(scantime))
xrange <- range(scantime)
x <- c(0:(N/2), -(ceiling(N/2-1)):-1) *
(xrange[2]-xrange[1])/(length(scantime)-1)
filt <- -attr(eval(deriv3(~ 1/(sigma*sqrt(2*pi)) *
exp(-x^2/(2*sigma^2)), "x")), "hessian")
filt <- filt/sqrt(sum(filt^2))
filt <- fft(filt, inverse = TRUE)/length(filt)
cnames <- c("mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "into", "intf",
"maxo", "maxf", "i", "sn")
num <- 0
ResList <- list()
## Can not do much here, lapply/apply won't work because of the 'steps' parameter.
## That's looping through the masses, i.e. rows of the profile matrix.
for (i in seq(length = length(mass)-steps+1)) {
ymat <- buf[bufidx[i:(i+steps-1)], , drop = FALSE]
ysums <- colMax(ymat)
yfilt <- filtfft(ysums, filt)
gmax <- max(yfilt)
for (j in seq(length = max)) {
maxy <- which.max(yfilt)
noise <- mean(ysums[ysums > 0])
##noise <- mean(yfilt[yfilt >= 0])
sn <- yfilt[maxy]/noise
if (yfilt[maxy] > 0 && yfilt[maxy] > snthresh*noise && ysums[maxy] > 0) {
peakrange <- descendZero(yfilt, maxy)
intmat <- ymat[, peakrange[1]:peakrange[2], drop = FALSE]
mzmat <- matrix(mz[bufMax[bufidx[i:(i+steps-1)],
peakrange[1]:peakrange[2]]],
nrow = steps)
which.intMax <- which.colMax(intmat)
mzmat <- mzmat[which.intMax]
if (all(is.na(mzmat))) {
yfilt[peakrange[1]:peakrange[2]] <- 0
next
}
mzrange <- range(mzmat, na.rm = TRUE)
massmean <- weighted.mean(mzmat, intmat[which.intMax],
na.rm = TRUE)
## This case (the only non-na m/z had intensity 0) was reported
## by Gregory Alan Barding "binlin processing"
if(any(is.na(massmean))) {
massmean <- mean(mzmat, na.rm = TRUE)
}
pwid <- (scantime[peakrange[2]] - scantime[peakrange[1]]) /
(peakrange[2] - peakrange[1])
into <- pwid*sum(ysums[peakrange[1]:peakrange[2]])
intf <- pwid*sum(yfilt[peakrange[1]:peakrange[2]])
maxo <- max(ysums[peakrange[1]:peakrange[2]])
maxf <- yfilt[maxy]
yfilt[peakrange[1]:peakrange[2]] <- 0
num <- num + 1
ResList[[num]] <- c(massmean, mzrange[1], mzrange[2], maxy,
peakrange, into, intf, maxo, maxf, j, sn)
} else
break
}
}
if (length(ResList) == 0) {
rmat <- matrix(nrow = 0, ncol = length(cnames))
colnames(rmat) <- cnames
return(rmat)
}
rmat <- do.call(rbind, ResList)
if (is.null(dim(rmat))) {
rmat <- matrix(rmat, nrow = 1)
}
colnames(rmat) <- cnames
max <- max-1 + max*(steps-1) + max*ceiling(mzdiff/binSize)
if (index){
mzdiff <- mzdiff/binSize
} else {
rmat[, "rt"] <- scantime[rmat[, "rt"]]
rmat[, "rtmin"] <- scantime[rmat[, "rtmin"]]
rmat[, "rtmax"] <- scantime[rmat[, "rtmax"]]
}
## Select for each unique mzmin, mzmax, rtmin, rtmax the largest peak
## and report that.
uorder <- order(rmat[, "into"], decreasing = TRUE)
uindex <- rectUnique(rmat[, c("mzmin", "mzmax", "rtmin", "rtmax"),
drop = FALSE],
uorder, mzdiff)
rmat <- rmat[uindex,,drop = FALSE]
return(rmat)
}
#'PerformPeakGrouping
#'@description PerformPeakGrouping
#'@author Zhiqiang Pang, Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@ref Smith, C.A. et al. 2006. Analytical Chemistry, 78, 779-787
#'
PerformPeakGrouping<-function(mSet,param){
## 1. Extract Information-------
peaks_0 <- mSet[["msFeatureData"]][["chromPeaks"]]
peaks <- cbind(peaks_0[, c("mz", "rt", "sample", "into"), drop = FALSE],
index = seq_len(nrow(peaks_0)))
if (!is.null(mSet[["onDiskData"]])){
sample_group<-as.character(mSet[["onDiskData"]]@phenoData@data[["sample_group"]]);
if (identical(sample_group,character(0))){
sample_group<-as.character(mSet[["onDiskData"]]@phenoData@data[["sample_name"]]);
}
} else {
sample_group<-as.character(mSet[["inMemoryData"]]@phenoData@data[["sample_group"]]);
if (identical(sample_group,character(0))){
sample_group<-as.character(mSet[["inMemoryData"]]@phenoData@data[["sample_name"]]);
}
}
sampleGroupNames <- unique(sample_group)
sampleGroupTable <- table(sample_group)
nSampleGroups <- length(sampleGroupTable)
## 2. Order peaks matrix by mz-------
peaks <- peaks[order(peaks[, "mz"]), , drop = FALSE]
rownames(peaks) <- NULL
rtRange <- range(peaks[, "rt"])
## 3. Define the mass slices and the index in the peaks matrix with an mz-------
mass <- seq(peaks[1, "mz"], peaks[nrow(peaks), "mz"] + param$binSize,
by = param$binSize / 2)
masspos <- findEqualGreaterM(peaks[, "mz"], mass)
densFrom <- rtRange[1] - 3 * param$bw
densTo <- rtRange[2] + 3 * param$bw
## 4. Increase the number of sampling points for the density distribution.-------
densN <- max(512, 2 * 2^(ceiling(log2(diff(rtRange) / (param$bw / 2)))))
endIdx <- 0
message("Total of ", length(mass) - 1, " slices detected for processing... ",
appendLF = FALSE)
resL <- vector("list", (length(mass) - 2))
for (i in seq_len(length(mass)-2)) {
## That's identifying overlapping mz slices.
startIdx <- masspos[i]
endIdx <- masspos[i + 2] - 1
if (endIdx - startIdx < 0)
next
resL[[i]] <- Densitygrouping_slave(peaks[startIdx:endIdx, , drop = FALSE],
bw = param$bw, densFrom = densFrom,
densTo = densTo, densN = densN,
sampleGroups = sample_group,
sampleGroupTable = sampleGroupTable,
minFraction = param$minFraction,
minSamples = param$minSamples,
maxFeatures = param$maxFeatures)
}
message("Done !")
res <- do.call(rbind, resL)
if (nrow(res)) {
## Remove groups that overlap with more "well-behaved" groups
numsamp <- rowSums(
as.matrix(res[, (match("npeaks", colnames(res)) +1):(ncol(res) -1),
drop = FALSE]))
uorder <- order(-numsamp, res[, "npeaks"])
uindex <- rectUnique(
as.matrix(res[, c("mzmin", "mzmax", "rtmin", "rtmax"),
drop = FALSE]), uorder)
res <- res[uindex, , drop = FALSE]
rownames(res) <- NULL
}
df <- S4Vectors::DataFrame(res)
rownames(df) <- sprintf(paste0("FT", "%0", ceiling(log10(nrow(df) + 1L)), "d"),
seq(from = 1, length.out = nrow(df)))
mSet$FeatureGroupTable <- df
mSet
}
#'Densitygrouping_slave
#'@description Densitygrouping_slave
#'@author Zhiqiang Pang, Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@ref Smith, C.A. et al. 2006. Analytical Chemistry, 78, 779-787
#'
Densitygrouping_slave <- function(x, bw, densFrom, densTo, densN, sampleGroups,
sampleGroupTable, minFraction,
minSamples, maxFeatures) {
den <- density(x[, "rt"], bw = bw, from = densFrom, to = densTo,
n = densN)
maxden <- max(den$y)
deny <- den$y
sampleGroupNames <- names(sampleGroupTable)
nSampleGroups <- length(sampleGroupNames)
col_nms <- c("mzmed", "mzmin", "mzmax", "rtmed", "rtmin", "rtmax",
"npeaks", sampleGroupNames)
res_mat <- matrix(nrow = 0, ncol = length(col_nms),
dimnames = list(character(), col_nms))
res_idx <- list()
while (deny[maxy <- which.max(deny)] > maxden / 20 && nrow(res_mat) <
maxFeatures) {
grange <- descendMin(deny, maxy)
deny[grange[1]:grange[2]] <- 0
gidx <- which(x[,"rt"] >= den$x[grange[1]] &
x[,"rt"] <= den$x[grange[2]])
## Determine the sample group of the samples in which the peaks
## were detected and check if they correspond to the required limits.
tt <- table(sampleGroups[unique(x[gidx, "sample"])])
if (!any(tt / sampleGroupTable[names(tt)] >= minFraction &
tt >= minSamples))
next
gcount <- rep(0, length(sampleGroupNames))
names(gcount) <- sampleGroupNames
gcount[names(tt)] <- as.numeric(tt)
res_mat <- rbind(res_mat,
c(median(x[gidx, "mz"]),
range(x[gidx, "mz"]),
median(x[gidx, "rt"]),
range(x[gidx, "rt"]),
length(gidx),
gcount)
)
res_idx <- c(res_idx, list(unname(sort(x[gidx, "index"]))))
}
res <- as.data.frame(res_mat)
res$peakidx <- res_idx
res
}
#'PerformPeakAlignment
#'@description PerformPeakAlignment
#'@param mSet the mSet object generated by PerformPeakPicking function.
#'@param param param list generated by updateRawSpectraParam function.
#'@author Zhiqiang Pang, Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@ref Smith, C.A. et al. 2006. Analytical Chemistry, 78, 779-787
#'
PerformPeakAlignment<-function(mSet,param){
## 1~4. Peak Grouping---------
mSet<-PerformPeakGrouping(mSet,param)
## 5. Adjust Retention Time-------
# 1). Group Matrix-------
peaks_0 <- mSet[["msFeatureData"]][["chromPeaks"]]
if (!is.null(mSet[["onDiskData"]])){
subs <- seq_along(mSet[["onDiskData"]]@phenoData@data[["sample_name"]]);
format <- "onDiskData"
} else {
subs <- seq_along(mSet[["inMemoryData"]]@phenoData@data[["sample_name"]]);
format <- "inMemoryData"
}
nSamples <- length(subs)
subset_names <- original_names <- mSet[["msFeatureData"]][["chromPeakData"]]@rownames
mSet[["FeatureGroupTable"]]$peakidx <- lapply(mSet[["FeatureGroupTable"]]$peakidx, function(z) {
idx <- base::match(original_names[z], subset_names)
idx[!is.na(idx)]
})
peakIndex_0 <- mSet[["FeatureGroupTable"]][lengths(mSet[["FeatureGroupTable"]]$peakidx) > 0, ]
peakIndex <- .peakIndex(peakIndex_0)
pkGrpMat <- .getPeakGroupsRtMatrix(peaks = peaks_0,
peakIndex = peakIndex,
sampleIndex = subs,
missingSample = nSamples - (nSamples * param$minFraction),
extraPeaks = param$extraPeaks
)
colnames(pkGrpMat) <- mSet[[format]]@phenoData@data[["sample_name"]][subs]
# 2). RT adjustment-------
rtime_0 <- rtime(mSet[[format]])
tmp <- split(rtime_0, fromFile(mSet[[format]]))
rtime <- vector("list", length(fileNames(mSet[[format]])))
names(rtime) <- as.character(1:length(rtime))
rtime[as.numeric(names(tmp))] <- tmp
res <- RT.Adjust_Slave(peaks_0,
peakIndex = peakIndex_0$peakidx,
rtime = rtime,
minFraction = param$minFraction,
extraPeaks = param$extraPeaks,
smooth = param$smooth,
span = param$span,
family = param$family,
peakGroupsMatrix = pkGrpMat,
subsetAdjust = param$subsetAdjust
)
message("Applying retention time adjustment to the identified chromatographic peaks ... ",appendLF = FALSE)
fts <- .applyRtAdjToChromPeaks(mSet[["msFeatureData"]][["chromPeaks"]],
rtraw = rtime,
rtadj = res)
mSet[["msFeatureData"]][["chromPeaks"]] <- fts
mSet[["msFeatureData"]][["adjustedRT"]] <- res
#
message("Done !")
## 6~9. Peak Grouping Again---------
mSet<-PerformPeakGrouping(mSet,param)
## 10. Return------
return(mSet)
}
#'RT.Adjust_Slave
#'@description RT.Adjust_Slave
#'@author Zhiqiang Pang, Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@ref Smith, C.A. et al. 2006. Analytical Chemistry, 78, 779-787
#'
RT.Adjust_Slave <-
function(peaks, peakIndex, rtime, minFraction = 0.9, extraPeaks = 1,
smooth = c("loess", "linear"), span = 0.2,
family = c("gaussian", "symmetric"),
peakGroupsMatrix = matrix(ncol = 0, nrow = 0),
subsetAdjust = c("average", "previous")) {
## minFraction
if (any(minFraction > 1) | any(minFraction < 0))
stop("'minFraction' has to be between 0 and 1!")
## Check peakIndex:
if (any(!(unique(unlist(peakIndex)) %in% seq_len(nrow(peaks)))))
stop("Some indices listed in 'peakIndex' are outside of ",
"1:nrow(peaks)!")
## Check rtime:
if (!is.list(rtime))
stop("'rtime' should be a list of numeric vectors with the retention ",
"times of the spectra per sample!")
if (!all(unlist(lapply(rtime, is.numeric), use.names = FALSE)))
stop("'rtime' should be a list of numeric vectors with the retention ",
"times of the spectra per sample!")
if (length(rtime) != max(peaks[, "sample"]))
stop("The length of 'rtime' does not match with the total number of ",
"samples according to the 'peaks' matrix!")
total_samples <- length(rtime)
subset <- seq_len(total_samples)
## Translate minFraction to number of allowed missing samples.
nSamples <- length(subset)
missingSample <- nSamples - (nSamples * minFraction)
## Remove peaks not present in "subset" from the peakIndex
peaks_in_subset <- which(peaks[, "sample"] %in% subset)
peakIndex <- lapply(peakIndex, function(z) z[z %in% peaks_in_subset])
## Check if we've got a valid peakGroupsMatrix
## o Same number of samples.
## o range of rt values is within the rtime.
if (nrow(peakGroupsMatrix)) {
if (ncol(peakGroupsMatrix) != nSamples)
stop("'peakGroupsMatrix' has to have the same number of columns ",
"as there are samples!")
pg_range <- range(peakGroupsMatrix, na.rm = TRUE)
rt_range <- range(rtime)
if (!(pg_range[1] >= rt_range[1] & pg_range[2] <= rt_range[2]))
stop("The retention times in 'peakGroupsMatrix' have to be within",
" the retention time range of the experiment!")
rt <- peakGroupsMatrix
} else
rt <- .getPeakGroupsRtMatrix(peaks, peakIndex, subset,
missingSample, extraPeaks)
if (ncol(rt) != length(subset))
stop("Length of 'subset' and number of columns of the peak group ",
"matrix do not match.")
## Fix for issue #175
if (length(rt) == 0)
stop("No peak groups found in the data for the provided settings")
message("Performing retention time correction using ", nrow(rt),
" peak groups.")
## Calculate the deviation of each peak group in each sample from its
## median
rtdev <- rt - apply(rt, 1, median, na.rm = TRUE)
if (smooth == "loess") {
mingroups <- min(colSums(!is.na(rt)))
if (mingroups < 4) {
smooth <- "linear"
warning("Too few peak groups for 'loess', reverting to linear",
" method")
} else if (mingroups * span < 4) {
span <- 4 / mingroups
warning("Span too small for 'loess' and the available number of ",
"peak groups, resetting to ", round(span, 2))
}
}
rtdevsmo <- vector("list", nSamples)
## Code for checking to see if retention time correction is overcorrecting
rtdevrange <- range(rtdev, na.rm = TRUE)
warn.overcorrect <- FALSE
warn.tweak.rt <- FALSE
rtime_adj <- rtime
## Adjust samples in subset.
for (i in seq_along(subset)) {
i_all <- subset[i] # Index of sample in whole dataset.
pts <- na.omit(data.frame(rt = rt[, i], rtdev = rtdev[, i]))
## order the data.frame such that rt and rtdev are increasingly ordered.
pk_idx <- order(pts$rt, pts$rtdev)
pts <- pts[pk_idx, ]
if (smooth == "loess") {
lo <- suppressWarnings(loess(rtdev ~ rt, pts, span = span,
degree = 1, family = family))
rtdevsmo[[i]] <- na.flatfill(
predict(lo, data.frame(rt = rtime[[i_all]])))
## Remove singularities from the loess function
rtdevsmo[[i]][abs(rtdevsmo[[i]]) >
quantile(abs(rtdevsmo[[i]]), 0.9,
na.rm = TRUE) * 2] <- NA
if (length(naidx <- which(is.na(rtdevsmo[[i]]))))
rtdevsmo[[i]][naidx] <- suppressWarnings(
approx(na.omit(data.frame(rtime[[i_all]], rtdevsmo[[i]])),
xout = rtime[[i_all]][naidx], rule = 2)$y
)
## Check if there are adjusted retention times that are not ordered
## increasingly. If there are, search for each first unordered rt
## the next rt that is larger and linearly interpolate the values
## in between (see issue #146 for an illustration).
while (length(decidx <- which(diff(rtime[[i_all]] - rtdevsmo[[i]]) < 0))) {
warn.tweak.rt <- TRUE ## Warn that we had to tweak the rts.
rtadj <- rtime[[i_all]] - rtdevsmo[[i]]
rtadj_start <- rtadj[decidx[1]] ## start interpolating from here
## Define the
next_larger <- which(rtadj > rtadj[decidx[1]])
if (length(next_larger) == 0) {
## Fix if there is no larger adjusted rt up to the end.
next_larger <- length(rtadj) + 1
rtadj_end <- rtadj_start
} else {
next_larger <- min(next_larger)
rtadj_end <- rtadj[next_larger]
}
## linearly interpolate the values in between.
adj_idxs <- (decidx[1] + 1):(next_larger - 1)
incr <- (rtadj_end - rtadj_start) / length(adj_idxs)
rtdevsmo[[i]][adj_idxs] <- rtime[[i_all]][adj_idxs] -
(rtadj_start + (1:length(adj_idxs)) * incr)
}
rtdevsmorange <- range(rtdevsmo[[i]])
if (any(rtdevsmorange / rtdevrange > 2))
warn.overcorrect <- TRUE
} else {
if (nrow(pts) < 2) {
stop("Not enough peak groups even for linear smoothing ",
"available!")
}
## Use lm instead?
fit <- lsfit(pts$rt, pts$rtdev)
rtdevsmo[[i]] <- rtime[[i_all]] * fit$coef[2] + fit$coef[1]
ptsrange <- range(pts$rt)
minidx <- rtime[[i_all]] < ptsrange[1]
maxidx <- rtime[[i_all]] > ptsrange[2]
rtdevsmo[[i]][minidx] <- rtdevsmo[[i]][head(which(!minidx), n = 1)]
rtdevsmo[[i]][maxidx] <- rtdevsmo[[i]][tail(which(!maxidx), n = 1)]
}
## Finally applying the correction
rtime_adj[[i_all]] <- rtime[[i_all]] - rtdevsmo[[i]]
}
if (warn.overcorrect) {
warning("Fitted retention time deviation curves exceed points by more",
" than 2x. This is dangerous and the algorithm is probably ",
"overcorrecting your data. Consider increasing the span ",
"parameter or switching to the linear smoothing method.")
}
if (warn.tweak.rt) {
warning(call. = FALSE, "Adjusted retention times had to be ",
"re-adjusted for some files to ensure them being in the same",
" order than the raw retention times. A call to ",
"'dropAdjustedRtime' might thus fail to restore retention ",
"times of chromatographic peaks to their original values. ",
"Eventually consider to increase the value of the 'span' ",
"parameter.")
}
rtime_adj
}
#'PerformPeakFiling
#'@description PerformPeakFiling
#'@param mSet the mSet object generated by PerformPeakPicking function.
#'@param param param list generated by updateRawSpectraParam function.
#'@param BPPARAM parallel method used for data processing. Default is bpparam().
#'@author Zhiqiang Pang, Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@ref Smith, C.A. et al. 2006. Analytical Chemistry, 78, 779-787
#'
PerformPeakFiling<-function(mSet,param,BPPARAM=bpparam()){
## Preparing Something
print("Starting peak filling!")
fixedRt <- fixedMz <- expandRt <- expandMz <- 0
if (is.null(param$ppm)){
ppm <-10
} else {
ppm <- param$ppm;
}
message("Defining peak areas for filling-in...",
appendLF = FALSE)
aggFunLow <- median
aggFunHigh <- median
tmp_pks <- mSet$msFeatureData$chromPeaks[, c("rtmin", "rtmax", "mzmin", "mzmax")]
fdef <- mSet$FeatureGroupTable
pkArea <- do.call(rbind,lapply(fdef$peakidx, function(z) {
pa <- c(aggFunLow(tmp_pks[z, 1]),
aggFunHigh(tmp_pks[z, 2]),
aggFunLow(tmp_pks[z, 3]),
aggFunHigh(tmp_pks[z, 4]))
## Check if we have to apply ppm replacement:
if (ppm != 0) {
mzmean <- mean(pa[3:4])
tittle <- mzmean * (ppm / 2) / 1E6
if ((pa[4] - pa[3]) < (tittle * 2)) {
pa[3] <- mzmean - tittle
pa[4] <- mzmean + tittle
}
}
## Expand it.
if (expandRt != 0) {
diffRt <- (pa[2] - pa[1]) * expandRt / 2
pa[1] <- pa[1] - diffRt
pa[2] <- pa[2] + diffRt
}
if (expandMz != 0) {
diffMz <- (pa[4] - pa[3]) * expandMz / 2
pa[3] <- pa[3] - diffMz
pa[4] <- pa[4] + diffMz
}
if (fixedMz != 0) {
pa[3] <- pa[3] - fixedMz
pa[4] <- pa[4] + fixedMz
}
if (fixedRt != 0) {
pa[1] <- pa[1] - fixedRt
pa[2] <- pa[2] + fixedRt
}
pa
}))
rm(tmp_pks)
message(".", appendLF = FALSE)
colnames(pkArea) <- c("rtmin", "rtmax", "mzmin", "mzmax")
## Add mzmed column - needed for MSW peak filling.
pkArea <- cbind(group_idx = 1:nrow(pkArea), pkArea,
mzmed = as.numeric(fdef$mzmed))
if (class(mSet[[2]]) == "OnDiskMSnExp"){
format <- "onDiskData"
} else {
format <- "inMemoryData"
}
fNames <- mSet[[format]]@phenoData@data[["sample_name"]]
pks <- mSet$msFeatureData$chromPeaks
pkGrpVal <-
.feature_values(pks = pks, fts = mSet$FeatureGroupTable,
method = "medret", value = "into",
intensity = "into", colnames = fNames,
missing = NA)
message(".", appendLF = FALSE)
## Check if there is anything to fill...
if (!any(is.na(rowSums(pkGrpVal)))) {
message("No missing peaks present.")
return(mSet)
}
message(".", appendLF = FALSE)
## Split the object by file and define the peaks for which
pkAreaL <- objectL <- vector("list", length(fNames))
## We need "only" a list of OnDiskMSnExp, one for each file but
## instead of filtering by file we create small objects to keep
## memory requirement to a minimum.
if (format == "onDiskData"){
req_fcol <- requiredFvarLabels("OnDiskMSnExp")
min_fdata <- mSet[[format]]@featureData@data[, req_fcol]
###
res <- mSet[["msFeatureData"]][["adjustedRT"]]
res <- unlist(res, use.names = FALSE)
fidx <- fData(mSet[[format]])$fileIdx # a issue for inmemory data
names(fidx) <- featureNames(mSet[[format]])
sNames <- unlist(split(featureNames(mSet[[format]]), fidx),
use.names = FALSE)
names(res) <- sNames
res <- res[featureNames(mSet[[format]])]
min_fdata$retentionTime <- res
for (i in 1:length(mSet[[format]]@phenoData@data[["sample_name"]])) {
fd <- min_fdata[min_fdata$fileIdx == i, ]
fd$fileIdx <- 1
objectL[[i]] <- new(
"OnDiskMSnExp",
processingData = new("MSnProcess",
files = mSet[[format]]@processingData@files[i]),
featureData = new("AnnotatedDataFrame", fd),
phenoData = new("NAnnotatedDataFrame",
data.frame(sampleNames = "1")),
experimentData = new("MIAPE",
instrumentManufacturer = "a",
instrumentModel = "a",
ionSource = "a",
analyser = "a",
detectorType = "a"))
## Want to extract intensities only for peaks that were not
## found in a sample.
pkAreaL[[i]] <- pkArea[is.na(pkGrpVal[, i]), , drop = FALSE]
}
message(" OK\nStart integrating peak areas from original files")
cp_colnames <- colnames(mSet[["msFeatureData"]][["chromPeaks"]])
## Extraction designed for centWave
res <- bpmapply(FUN = .getChromPeakData,
objectL,
pkAreaL,
as.list(1:length(objectL)),
MoreArgs = list(cn = cp_colnames,
mzCenterFun = "weighted.mean"),
BPPARAM = BPPARAM,
SIMPLIFY = FALSE)
} else {
message(" OK\nStart integrating peak areas from original files")
mzCenterFun = "weighted.mean"
scan_set<-names(mSet[[format]]@assayData);
scan_set_ordered <- sort(scan_set);
assayData <- sapply(scan_set_ordered, FUN = function(x){mSet[[format]]@assayData[[x]]})
#assayData <- lapply(scan_set,FUN=function(x){mSet[[format]]@assayData[[x]]});
assayData <- split(assayData,fromFile(mSet[[format]]));
rtim_all <- split(rtime(mSet[[format]]),fromFile(mSet[[format]]))
filesname <- basename(MSnbase::fileNames(mSet[[format]]))
res_new <-list()
for (ii in 1:length(mSet[[format]]@phenoData@data[["sample_name"]])){
cn <-colnames(mSet[["msFeatureData"]][["chromPeaks"]])
peakArea <- pkArea[is.na(pkGrpVal[, ii]), , drop = FALSE];
ncols <- length(cn)
res <- matrix(ncol = ncols, nrow = nrow(peakArea))
colnames(res) <- cn
res[, "sample"] <- ii
res[, c("mzmin", "mzmax")] <- peakArea[, c("mzmin", "mzmax")]
## Load the data
message("Requesting ", nrow(res), " peaks from ",
filesname[ii], " ... ", appendLF = FALSE)
spctr <- assayData[[ii]]
mzs <- lapply(spctr, mz)
valsPerSpect <- lengths(mzs)
ints <- unlist(lapply(spctr, intensity), use.names = FALSE)
rm(spctr)
mzs <- unlist(mzs, use.names = FALSE)
mzs_range <- range(mzs)
rtim <- rtim_all[[ii]]
rtim_range <- range(rtim)
for (i in seq_len(nrow(res))) {
rtr <- peakArea[i, c("rtmin", "rtmax")]
mzr <- peakArea[i, c("mzmin", "mzmax")]
## If the rt range is completely out; additional fix for #267
if (rtr[2] < rtim_range[1] | rtr[1] > rtim_range[2] |
mzr[2] < mzs_range[1] | mzr[1] > mzs_range[2]) {
res[i, ] <- rep(NA_real_, ncols)
next
}
## Ensure that the mz and rt region is within the range of the data.
rtr[1] <- max(rtr[1], rtim_range[1])
rtr[2] <- min(rtr[2], rtim_range[2])
mzr[1] <- max(mzr[1], mzs_range[1])
mzr[2] <- min(mzr[2], mzs_range[2])
mtx <- .rawMat(mz = mzs, int = ints, scantime = rtim,
valsPerSpect = valsPerSpect, rtrange = rtr,
mzrange = mzr)
if (length(mtx)) {
if (any(!is.na(mtx[, 3]))) {
res[i, "into"] <- sum(mtx[, 3], na.rm = TRUE) *
((rtr[2] - rtr[1]) /
max(1, (sum(rtim >= rtr[1] & rtim <= rtr[2]) - 1)))
maxi <- which.max(mtx[, 3])
res[i, c("rt", "maxo")] <- mtx[maxi[1], c(1, 3)]
res[i, c("rtmin", "rtmax")] <- rtr
## Calculate the intensity weighted mean mz
meanMz <- do.call(mzCenterFun, list(mtx[, 2], mtx[, 3]))
if (is.na(meanMz)) meanMz <- mtx[maxi[1], 2]
res[i, "mz"] <- meanMz
} else {
res[i, ] <- rep(NA_real_, ncols)
}
} else {
res[i, ] <- rep(NA_real_, ncols)
}
}
message("got ", sum(!is.na(res[, "into"])), ".")
res_new[[ii]] <- res
pkAreaL[[ii]] <- pkArea[is.na(pkGrpVal[, ii]), , drop = FALSE]
}
res <-res_new
}
res <- do.call(rbind, res)
## cbind the group_idx column to track the feature/peak group.
res <- cbind(res, group_idx = do.call(rbind, pkAreaL)[, "group_idx"])
## Remove those without a signal
res <- res[!is.na(res[, "into"]), , drop = FALSE]
if (nrow(res) == 0) {
warning("Could not integrate any signal for the missing ",
"peaks! Consider increasing 'expandMz' and 'expandRt'.")
return(mSet)
}
## Get the msFeatureData:
newFd <- mSet$msFeatureData
incr <- nrow(mSet$msFeatureData$chromPeaks)
for (i in unique(res[, "group_idx"])) {
fdef$peakidx[[i]] <- c(fdef$peakidx[[i]],
(which(res[, "group_idx"] == i) + incr))
}
## Define IDs for the new peaks; include fix for issue #347
maxId <- max(as.numeric(sub("^CP", "",
rownames(mSet[["msFeatureData"]][["chromPeaks"]]))))
if (maxId < 1)
stop("chromPeaks matrix lacks rownames; please update ",
"'object' with the 'updateObject' function.")
toId <- maxId + nrow(res)
rownames(res) <- sprintf(
paste0("CP", "%0", ceiling(log10(toId + 1L)), "d"),
(maxId + 1L):toId)
newFd[["chromPeaks"]] <- rbind(mSet[["msFeatureData"]][["chromPeaks"]],
res[, -ncol(res)])
cpd <- newFd$chromPeakData[rep(1L, nrow(res)), , drop = FALSE]
cpd[,] <- NA
cpd$ms_level <- as.integer(1)
cpd$is_filled <- TRUE
newFd$chromPeakData <- rbind(newFd$chromPeakData, cpd)
rownames(newFd$chromPeakData) <- rownames(newFd$chromPeaks)
mSet$msFeatureData <- newFd
mSet$FeatureGroupTable <- fdef
mSet$xcmsSet <- mSet2xcmsSet(mSet)
return(mSet)
}
#'mSet2xcmsSet
#'@description mSet2xcmsSet
#'@author Zhiqiang Pang, Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@ref Smith, C.A. et al. 2006. Analytical Chemistry, 78, 779-787
#'
mSet2xcmsSet <- function(mSet) {
xs <- new("xcmsSet")
xs@peaks <- mSet[["msFeatureData"]][["chromPeaks"]]
fgs <- mSet$FeatureGroupTable
xs@groups <- S4Vectors::as.matrix(fgs[, -ncol(fgs)])
rownames(xs@groups) <- NULL
xs@groupidx <- fgs$peakidx
rts <- list()
if (class(mSet[[2]]) == "OnDiskMSnExp"){
format <- "onDiskData"
} else {
format <- "inMemoryData"
}
## Ensure we're getting the raw rt
rts$raw <- rtime(mSet[[format]])
rts$corrected <- mSet[["msFeatureData"]][["adjustedRT"]]
xs@rt <- rts
## @phenoData
xs@phenoData <- pData(mSet[[format]])
## @filepaths
xs@filepaths <- fileNames(mSet[[format]])
## @profinfo (list)
profMethod <- NA
profStep <- NA
profParam <- list()
## If we've got any MatchedFilterParam we can take the values from there
## @mslevel <- msLevel?
xs@mslevel <- 1
## @scanrange
xs@scanrange <- range(scanIndex(mSet[[format]]))
## @filled ... not yet.
if (any(mSet$msFeatureData$chromPeakData$is_filled)) {
fld <- which(mSet$msFeatureData$chromPeakData$is_filled)
xs@filled <- as.integer(fld)
}
return(xs)
}
#'updateRawSpectraParam
#'@description updateRawSpectraParam
#'@param Params object generated by SetPeakParams function.
#'@author Zhiqiang Pang, Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'@export
#'@ref Smith, C.A. et al. 2006. Analytical Chemistry, 78, 779-787
#' Mcgill University
#' License: GNU GPL (>= 2)
updateRawSpectraParam <- function (Params){
param <- list();
# 0. Method Define
param$Peak_method <- Params$Peak_method
param$RT_method <- Params$RT_method
# 1.1 update peak picking params - centWave
if (param$Peak_method == "centWave"){
param$ppm <- as.numeric(Params[["ppm"]]);
param$peakwidth <- c(as.numeric(Params[["min_peakwidth"]]),
as.numeric(Params[["max_peakwidth"]]));
param$snthresh <- as.numeric(Params[["snthresh"]]);
param$prefilter <- c(as.numeric(Params[["prefilter"]]),
as.numeric(Params[["value_of_prefilter"]]));
param$roiList <- list();
param$firstBaselineCheck <- T;
param$roiScales <- numeric(0);
param$mzCenterFun <- Params[["mzCenterFun"]];
param$integrate <- Params[["integrate"]];
param$mzdiff <- as.numeric(Params[["mzdiff"]]);
param$fitgauss <- as.logical(Params[["fitgauss"]]);
param$noise <- as.numeric(Params[["noise"]]);
param$verboseColumns <- as.logical(Params[["verbose.columns"]]);
param$binSize <-0.25; # density Param
} else if (param$Peak_method == "matchedFilter") {
param$fwhm <- as.numeric(Params$fwhm);
param$sigma <- as.numeric(Params$sigma)
param$steps <- as.numeric(Params$steps)
param$snthresh <- as.numeric(Params$snthresh)
param$mzdiff <- as.numeric(Params$mzdiff)
param$bw <- as.numeric(Params$bw)
param$max <- as.numeric(Params$max)
param$impute <- "none";
param$baseValue<- numeric(0);
param$distance<- numeric(0);
param$index<- F;
param$binSize <- 0.1;
}
# 2.1 update grouping params - density
param$bw<-as.numeric(Params[["bw"]]);
param$minFraction <- as.numeric(Params[["minFraction"]]);
param$minSamples <- as.numeric(Params[["minSamples"]]);
param$maxFeatures <- as.numeric(Params[["maxFeatures"]]);
# 3.1 update RT correction params - peakgroup
param$extraPeaks <- as.numeric(Params[["extra"]]);
param$smooth <- Params[["smooth"]];
param$span <- as.numeric(Params[["span"]]);
param$family <- Params[["family"]];
param$subsetAdjust <- "average";
# Finished !
print(paste("Parameters for",param$Peak_method, "have been successfully parsed!"))
return(param)
}
#'creatPeakTable
#'@description creatPeakTable
#'@author Zhiqiang Pang, Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'
creatPeakTable <- function(xset){
if (length(xset@phenoData[["sample_name"]]) == 1) {
return(xset@peaks)
}
groupmat <- xset@groups
ts <- data.frame(cbind(groupmat,.groupval(xset, value="into")), row.names = NULL)
cnames <- colnames(ts)
if (cnames[1] == 'mzmed') {
cnames[1] <- 'mz'
} else {
stop ('mzmed column missing')
}
if (cnames[4] == 'rtmed') {
cnames[4] <- 'rt'
} else {
stop ('mzmed column missing')
}
colnames(ts) <- cnames
return(ts)
}
######## -----------=========== Internal Functions From XCMS ===========------------- #
#
#' @references Smith, C.A., Want, E.J., O'Maille, G., Abagyan,R., Siuzdak, G. (2006).
# "XCMS: Processing mass spectrometry data for metabolite profiling using nonlinear
# peak alignment, matching and identification." Analytical Chemistry, 78, 779-787.
continuousPtsAboveThreshold <- function(y, threshold, num, istart = 1) {
if (!is.double(y)) y <- as.double(y)
if (.C(C_continuousPtsAboveThreshold,
y,
as.integer(istart-1),
length(y),
threshold = as.double(threshold),
num = as.integer(num),
n = integer(1))$n > 0) TRUE else FALSE
}
getLocalNoiseEstimate <- function(d, td, ftd, noiserange, Nscantime, threshold, num) {
if (length(d) < Nscantime) {
## noiserange[2] is full d-range
drange <- which(td %in% ftd)
n1 <- d[-drange] ## region outside the detected ROI (wide)
n1.cp <- continuousPtsAboveThresholdIdx(n1, threshold=threshold,num=num) ## continousPtsAboveThreshold (probably peak) are subtracted from data for local noise estimation
n1 <- n1[!n1.cp]
if (length(n1) > 1) {
baseline1 <- mean(n1)
sdnoise1 <- sd(n1)
} else
baseline1 <- sdnoise1 <- 1
## noiserange[1]
d1 <- drange[1]
d2 <- drange[length(drange)]
nrange2 <- c(max(1,d1 - noiserange[1]) : d1, d2 : min(length(d),d2 + noiserange[1]))
n2 <- d[nrange2] ## region outside the detected ROI (narrow)
n2.cp <- continuousPtsAboveThresholdIdx(n2, threshold=threshold,num=num) ## continousPtsAboveThreshold (probably peak) are subtracted from data for local noise estimation
n2 <- n2[!n2.cp]
if (length(n2) > 1) {
baseline2 <- mean(n2)
sdnoise2 <- sd(n2)
} else
baseline2 <- sdnoise2 <- 1
} else {
trimmed <- trimm(d,c(0.05,0.95))
baseline1 <- baseline2 <- mean(trimmed)
sdnoise1 <- sdnoise2 <- sd(trimmed)
}
c(min(baseline1,baseline2),min(sdnoise1,sdnoise2))
}
continuousPtsAboveThresholdIdx <- function(y, threshold, num, istart = 1) {
if (!is.double(y)) y <- as.double(y)
as.logical(.C(C_continuousPtsAboveThresholdIdx,
y,
as.integer(istart-1),
length(y),
threshold = as.double(threshold),
num = as.integer(num),
n = integer(length(y)))$n)
}
MSW.cwt <- function (ms, scales = 1, wavelet = "mexh") { ## modified from package MassSpecWavelet
if (wavelet == "mexh") {
psi_xval <- seq(-6, 6, length = 256)
psi <- (2/sqrt(3) * pi^(-0.25)) * (1 - psi_xval^2) *
exp(-psi_xval^2/2)
}
else if (is.matrix(wavelet)) {
if (nrow(wavelet) == 2) {
psi_xval <- wavelet[1, ]
psi <- wavelet[2, ]
}
else if (ncol(wavelet) == 2) {
psi_xval <- wavelet[, 1]
psi <- wavelet[, 2]
}
else {
stop("Unsupported wavelet format!")
}
}
else {
stop("Unsupported wavelet!")
}
oldLen <- length(ms)
ms <- MSW.extendNBase(ms, nLevel = NULL, base = 2)
len <- length(ms)
nbscales <- length(scales)
wCoefs <- NULL
psi_xval <- psi_xval - psi_xval[1]
dxval <- psi_xval[2]
xmax <- psi_xval[length(psi_xval)]
for (i in 1:length(scales)) {
scale.i <- scales[i]
f <- rep(0, len)
j <- 1 + floor((0:(scale.i * xmax))/(scale.i * dxval))
if (length(j) == 1)
j <- c(1, 1)
lenWave <- length(j)
f[1:lenWave] <- rev(psi[j]) - mean(psi[j])
if (length(f) > len)
{i<-i-1;break;} ## stop(paste("scale", scale.i, "is too large!"))
wCoefs.i <- 1/sqrt(scale.i) * convolve(ms, f)
wCoefs.i <- c(wCoefs.i[(len - floor(lenWave/2) + 1):len],
wCoefs.i[1:(len - floor(lenWave/2))])
wCoefs <- cbind(wCoefs, wCoefs.i)
}
if (i < 1) return(NA)
scales <- scales[1:i]
if (length(scales) == 1)
wCoefs <- matrix(wCoefs, ncol = 1)
colnames(wCoefs) <- scales
wCoefs <- wCoefs[1:oldLen, , drop = FALSE]
wCoefs
}
MSW.extendNBase <- function(x, nLevel=1, base=2, ...) { ## from package MassSpecWavelet
if (!is.matrix(x)) x <- matrix(x, ncol=1)
nR <- nrow(x)
if (is.null(nLevel)) {
nR1 <- nextn(nR, base)
} else {
nR1 <- ceiling(nR / base^nLevel) * base^nLevel
}
if (nR != nR1) {
x <- MSW.extendLength(x, addLength=nR1-nR, ...)
}
x
}
MSW.extendLength <- function(x, addLength=NULL, method=c('reflection', 'open', 'circular'), direction=c('right', 'left', 'both')) { ## from package MassSpecWavelet
if (is.null(addLength)) stop('Please provide the length to be added!')
if (!is.matrix(x)) x <- matrix(x, ncol=1)
method <- match.arg(method)
direction <- match.arg(direction)
nR <- nrow(x)
nR1 <- nR + addLength
if (direction == 'both') {
left <- right <- addLength
} else if (direction == 'right') {
left <- 0
right <- addLength
} else if (direction == 'left') {
left <- addLength
right <- 0
}
if (right > 0) {
x <- switch(method,
reflection =rbind(x, x[nR:(2 * nR - nR1 + 1), , drop=FALSE]),
open = rbind(x, matrix(rep(x[nR,], addLength), ncol=ncol(x), byrow=TRUE)),
circular = rbind(x, x[1:(nR1 - nR),, drop=FALSE]))
}
if (left > 0) {
x <- switch(method,
reflection =rbind(x[addLength:1, , drop=FALSE], x),
open = rbind(matrix(rep(x[1,], addLength), ncol=ncol(x), byrow=TRUE), x),
circular = rbind(x[(2 * nR - nR1 + 1):nR,, drop=FALSE], x))
}
if (ncol(x) == 1) x <- as.vector(x)
x
}
MSW.getLocalMaximumCWT <- function(wCoefs, minWinSize=5, amp.Th=0) { ## from package MassSpecWavelet
localMax <- NULL
scales <- as.numeric(colnames(wCoefs))
for (i in 1:length(scales)) {
scale.i <- scales[i]
winSize.i <- scale.i * 2 + 1
if (winSize.i < minWinSize) {
winSize.i <- minWinSize
}
temp <- MSW.localMaximum(wCoefs[,i], winSize.i)
localMax <- cbind(localMax, temp)
}
## Set the values less than peak threshold as 0
localMax[wCoefs < amp.Th] <- 0
colnames(localMax) <- colnames(wCoefs)
rownames(localMax) <- rownames(wCoefs)
localMax
}
MSW.localMaximum <- function (x, winSize = 5) { ## from package MassSpecWavelet
len <- length(x)
rNum <- ceiling(len/winSize)
## Transform the vector as a matrix with column length equals winSize
## and find the maximum position at each row.
y <- matrix(c(x, rep(x[len], rNum * winSize - len)), nrow=winSize)
y.maxInd <- apply(y, 2, which.max)
## Only keep the maximum value larger than the boundary values
selInd <- which(apply(y, 2, function(x) max(x) > x[1] & max(x) > x[winSize]))
## keep the result
localMax <- rep(0, len)
localMax[(selInd-1) * winSize + y.maxInd[selInd]] <- 1
## Shift the vector with winSize/2 and do the same operation
shift <- floor(winSize/2)
rNum <- ceiling((len + shift)/winSize)
y <- matrix(c(rep(x[1], shift), x, rep(x[len], rNum * winSize - len - shift)), nrow=winSize)
y.maxInd <- apply(y, 2, which.max)
## Only keep the maximum value larger than the boundary values
selInd <- which(apply(y, 2, function(x) max(x) > x[1] & max(x) > x[winSize]))
localMax[(selInd-1) * winSize + y.maxInd[selInd] - shift] <- 1
## Check whether there is some local maxima have in between distance less than winSize
maxInd <- which(localMax > 0)
selInd <- which(diff(maxInd) < winSize)
if (length(selInd) > 0) {
selMaxInd1 <- maxInd[selInd]
selMaxInd2 <- maxInd[selInd + 1]
temp <- x[selMaxInd1] - x[selMaxInd2]
localMax[selMaxInd1[temp <= 0]] <- 0
localMax[selMaxInd2[temp > 0]] <- 0
}
localMax
}
MSW.getRidge <- function(localMax, iInit=ncol(localMax), step=-1, iFinal=1, minWinSize=3, gapTh=3, skip=NULL) { ## modified from package MassSpecWavelet
scales <- as.numeric(colnames(localMax))
if (is.null(scales)) scales <- 1:ncol(localMax)
maxInd_curr <- which(localMax[, iInit] > 0)
nMz <- nrow(localMax)
if (is.null(skip)) {
skip <- iInit + 1
}
## Identify all the peak pathes from the coarse level to detail levels (high column to low column)
## Only consider the shortest path
if ( ncol(localMax) > 1 ) colInd <- seq(iInit+step, iFinal, step)
else colInd <- 1
ridgeList <- as.list(maxInd_curr)
names(ridgeList) <- maxInd_curr
peakStatus <- as.list(rep(0, length(maxInd_curr)))
names(peakStatus) <- maxInd_curr
## orphanRidgeList keep the ridges disconnected at certain scale level
## Changed by Pan Du 05/11/06
orphanRidgeList <- NULL
orphanRidgeName <- NULL
nLevel <- length(colInd)
for (j in 1:nLevel) {
col.j <- colInd[j]
scale.j <- scales[col.j]
if (colInd[j] == skip) {
oldname <- names(ridgeList)
ridgeList <- lapply(ridgeList, function(x) c(x, x[length(x)]))
##peakStatus <- lapply(peakStatus, function(x) c(x, x[length(x)]))
names(ridgeList) <- oldname
##names(peakStatus) <- oldname
next
}
if (length(maxInd_curr) == 0) {
maxInd_curr <- which(localMax[, col.j] > 0)
next
}
## The slide window size is proportional to the CWT scale
## winSize.j <- scale.j / 2 + 1
winSize.j <- floor(scale.j/2)
if (winSize.j < minWinSize) {
winSize.j <- minWinSize
}
selPeak.j <- NULL
remove.j <- NULL
for (k in 1:length(maxInd_curr)) {
ind.k <- maxInd_curr[k]
start.k <- ifelse(ind.k-winSize.j < 1, 1, ind.k-winSize.j)
end.k <- ifelse(ind.k+winSize.j > nMz, nMz, ind.k+winSize.j)
ind.curr <- which(localMax[start.k:end.k, col.j] > 0) + start.k - 1
##ind.curr <- which(localMax[, col.j] > 0)
if (length(ind.curr) == 0) {
status.k <- peakStatus[[as.character(ind.k)]]
## bug work-around
if (is.null(status.k)) status.k <- gapTh +1
##
if (status.k > gapTh & scale.j >= 2) {
temp <- ridgeList[[as.character(ind.k)]]
orphanRidgeList <- c(orphanRidgeList, list(temp[1:(length(temp)-status.k)]))
orphanRidgeName <- c(orphanRidgeName, paste(col.j + status.k + 1, ind.k, sep='_'))
remove.j <- c(remove.j, as.character(ind.k))
next
} else {
ind.curr <- ind.k
peakStatus[[as.character(ind.k)]] <- status.k + 1
}
} else {
peakStatus[[as.character(ind.k)]] <- 0
if (length(ind.curr) >= 2) ind.curr <- ind.curr[which.min(abs(ind.curr - ind.k))]
}
ridgeList[[as.character(ind.k)]] <- c(ridgeList[[as.character(ind.k)]], ind.curr)
selPeak.j <- c(selPeak.j, ind.curr)
}
## Remove the disconnected lines from the currrent list
if (length(remove.j) > 0) {
removeInd <- which(names(ridgeList) %in% remove.j)
ridgeList <- ridgeList[-removeInd]
peakStatus <- peakStatus[-removeInd]
}
## Check for duplicated selected peaks and only keep the one with the longest path.
dupPeak.j <- unique(selPeak.j[duplicated(selPeak.j)])
if (length(dupPeak.j) > 0) {
removeInd <- NULL
for (dupPeak.jk in dupPeak.j) {
selInd <- which(selPeak.j == dupPeak.jk)
selLen <- sapply(ridgeList[selInd], length)
removeInd.jk <- which.max(selLen)
removeInd <- c(removeInd, selInd[-removeInd.jk])
orphanRidgeList <- c(orphanRidgeList, ridgeList[removeInd.jk])
orphanRidgeName <- c(orphanRidgeName, paste(col.j, selPeak.j[removeInd.jk], sep='_'))
}
selPeak.j <- selPeak.j[-removeInd]
ridgeList <- ridgeList[-removeInd]
peakStatus <- peakStatus[-removeInd]
}
## Update the names of the ridgeList as the new selected peaks
##if (scale.j >= 2) {
if (length(ridgeList) > 0) names(ridgeList) <- selPeak.j
if (length(peakStatus) > 0) names(peakStatus) <- selPeak.j
##}
## If the level is larger than 3, expand the peak list by including other unselected peaks at that level
if (scale.j >= 2) {
maxInd_next <- which(localMax[, col.j] > 0)
unSelPeak.j <- maxInd_next[!(maxInd_next %in% selPeak.j)]
newPeak.j <- as.list(unSelPeak.j)
names(newPeak.j) <- unSelPeak.j
## Update ridgeList
ridgeList <- c(ridgeList, newPeak.j)
maxInd_curr <- c(selPeak.j, unSelPeak.j)
## Update peakStatus
newPeakStatus <- as.list(rep(0, length(newPeak.j)))
names(newPeakStatus) <- newPeak.j
peakStatus <- c(peakStatus, newPeakStatus)
} else {
maxInd_curr <- selPeak.j
}
}
## Attach the peak level at the beginning of the ridge names
if (length(ridgeList) > 0) names(ridgeList) <- paste(1, names(ridgeList), sep='_')
if (length(orphanRidgeList) > 0) names(orphanRidgeList) <- orphanRidgeName
## Combine ridgeList and orphanRidgeList
ridgeList <- c(ridgeList, orphanRidgeList)
if (length(ridgeList) == 0) return(NULL)
## Reverse the order as from the low level to high level.
ridgeList <- lapply(ridgeList, rev)
## order the ridgeList in increasing order
##ord <- order(selPeak.j)
##ridgeList <- ridgeList[ord]
## Remove possible duplicated ridges
ridgeList <- ridgeList[!duplicated(names(ridgeList))]
attr(ridgeList, 'class') <- 'ridgeList'
attr(ridgeList, 'scales') <- scales
return(ridgeList)
}
descendMin <- function(y, istart = which.max(y)) {
if (!is.double(y)) y <- as.double(y)
unlist(.C(C_DescendMin,
y,
length(y),
as.integer(istart-1),
ilower = integer(1),
iupper = integer(1))[4:5]) + 1
}
descendMinTol <- function(d,startpos,maxDescOutlier) {
l <- startpos[1]; r <- startpos[2]; outl <- 0; N <- length(d)
## left
while ((l > 1) && (d[l] > 0) && outl <= maxDescOutlier) {
if (outl > 0) vpos <- opos else vpos <- l
if (d[l-1] > d[vpos]) outl <- outl + 1 else outl <- 0
if (outl == 1) opos <- l
l <- l -1
}
if (outl > 0) l <- l + outl
## right
outl <- 0;
while ((r < N) && (d[r] > 0) && outl <= maxDescOutlier) {
if (outl > 0) vpos <- opos else vpos <- r
if (d[r+1] > d[vpos]) outl <- outl + 1 else outl <- 0
if (outl == 1) opos <- r
r <- r + 1
}
if (outl > 0) r <- r - outl
c(l,r)
}
joinOverlappingPeaks <- function(td, d, otd, omz, od, scantime, scan.range, peaks, maxGaussOverlap=0.5) {
## Fix issue #284: avoid having identical peaks multiple times in this
## matrix.
peaks <- unique(peaks)
gausspeaksidx <- which(!is.na(peaks[,"mu"]))
Ngp <- length(gausspeaksidx)
if (Ngp == 0)
return(peaks)
newpeaks <- NULL
gpeaks <- peaks[gausspeaksidx, , drop = FALSE]
if (nrow(peaks) - Ngp > 0)
notgausspeaks <- peaks[-gausspeaksidx, , drop = FALSE]
if (Ngp > 1) {
comb <- which(upper.tri(matrix(0, Ngp, Ngp)), arr.ind = TRUE)
overlap <- logical(nrow(comb))
overlap <- rep(FALSE, dim(comb)[1])
for (k in seq_len(nrow(comb))) {
p1 <- comb[k, 1]
p2 <- comb[k, 2]
overlap[k] <- gaussCoverage(xlim = scan.range,
h1 = gpeaks[p1, "h"],
mu1 = gpeaks[p1, "mu"],
s1 = gpeaks[p1, "sigma"],
h2 = gpeaks[p2, "h"],
mu2 = gpeaks[p2, "mu"],
s2 = gpeaks[p2, "sigma"]) >=
maxGaussOverlap
}
} else overlap <- FALSE
if (any(overlap) && (Ngp > 1)) {
jlist <- list()
if (length(which(overlap)) > 1) {
gm <- comb[overlap, ]
## create list of connected components
cc <- list()
cc[[1]] <- gm[1,] ## copy first entry
for (j in 2:dim(gm)[1]) { ## search for connections
ccl <- unlist(cc)
nl <- sapply(cc, function(x) length(x))
ccidx <- rep(1:length(nl),nl)
idx <- match(gm[j,],ccl)
if (any(!is.na(idx))) { ## connection found, add to list
pos <- ccidx[idx[which(!is.na(idx))[1]]]
cc[[pos]] <- c(cc[[pos]],gm[j,])
} else ## create new list element
cc[[length(cc) + 1]] <- gm[j,]
}
ccn <- list()
lcc <- length(cc)
ins <- rep(FALSE,lcc)
if (lcc > 1) {
jcomb <- which(upper.tri(matrix(0,lcc,lcc)),arr.ind = TRUE)
for (j in 1:dim(jcomb)[1]) {
j1 <- jcomb[j,1]; j2 <- jcomb[j,2]
if (any(cc[[j1]] %in% cc[[j2]]))
ccn[[length(ccn) +1]] <- unique(c(cc[[j1]],cc[[j2]]))
else {
if (!ins[j1]) {
ccn[[length(ccn) +1]] <- unique(cc[[j1]])
ins[j1] <- TRUE
}
if (!ins[j2]) {
ccn[[length(ccn) +1]] <- unique(cc[[j2]])
ins[j2] <- TRUE
}
}
}
} else ccn <- cc;
size <- sapply(ccn, function(x) length(x))
s2idx <- which(size >= 2)
if (length(s2idx) > 0) {
for (j in 1:length(s2idx)) {
pgroup <- unique(ccn[[ s2idx[j] ]])
jlist[[j]] <- pgroup
}
} else stop('(length(s2idx) = 0) ?!?')
} else jlist[[1]] <- comb[overlap, ]
## join all peaks belonging to one cc
for (j in seq_along(jlist)) {
jidx <- jlist[[j]]
newpeak <- gpeaks[jidx[1], , drop = FALSE]
newmin <- min(gpeaks[jidx, "lmin"])
newmax <- max(gpeaks[jidx, "lmax"])
newpeak[1, "scpos"] <- -1 ## not defined after join
newpeak[1, "scmin"] <- -1 ## ..
newpeak[1, "scmax"] <- -1 ## ..
newpeak[1, "scale"] <- -1 ## ..
newpeak[1, "maxo"] <- max(gpeaks[jidx, "maxo"])
newpeak[1, "sn"] <- max(gpeaks[jidx, "sn"])
newpeak[1, "lmin"] <- newmin
newpeak[1, "lmax"] <- newmax
newpeak[1, "rtmin"] <- scantime[td[newmin]]
newpeak[1, "rtmax"] <- scantime[td[newmax]]
newpeak[1,"rt"] <- weighted.mean(gpeaks[jidx, "rt"],
w = gpeaks[jidx, "maxo"])
## Re-assign m/z values
p1 <- match(td[newmin], otd)[1]
p2 <- match(td[newmax], otd)
p2 <- p2[length(p2)]
if (is.na(p1)) p1 <- 1
if (is.na(p2)) p2 <- length(omz)
mz.value <- omz[p1:p2]
mz.int <- od[p1:p2]
## re-calculate m/z value for peak range
#mzmean <- do.call(mzCenterFun, list(mz = mz.value,
# intensity = mz.int))
mzmean <- weighted.mean(mz.value, mz.int)
mzrange <- range(mz.value)
newpeak[1, "mz"] <- mzmean
newpeak[1, c("mzmin","mzmax")] <- mzrange
## re-fit gaussian
md <- max(d[newmin:newmax])
d1 <- d[newmin:newmax] / md
pgauss <- fitGauss(td[newmin:newmax],
d[newmin:newmax],
pgauss = list(mu = td[newmin] +
(td[newmax] - td[newmin])/2,
sigma = td[newmax] - td[newmin],
h = max(gpeaks[jidx, "h"])))
if (!any(is.na(pgauss)) && all(pgauss > 0)) {
newpeak[1, "mu"] <- pgauss$mu
newpeak[1, "sigma"] <- pgauss$sigma
newpeak[1, "h"] <- pgauss$h
newpeak[1, "egauss"]<- sqrt((1/length(td[newmin:newmax])) *
sum(((d1 - gauss(td[newmin:newmax],
pgauss$h/md,
pgauss$mu,
pgauss$sigma))^2)))
} else { ## re-fit after join failed
newpeak[1, "mu"] <- NA
newpeak[1, "sigma"] <- NA
newpeak[1, "h"] <- NA
newpeak[1, "egauss"] <- NA
}
newpeaks <- rbind(newpeaks, newpeak)
}
## add the remaining peaks
jp <- unique(unlist(jlist))
if (dim(peaks)[1] - length(jp) > 0)
newpeaks <- rbind(newpeaks, gpeaks[-jp, ])
} else
newpeaks <- gpeaks
grt.min <- newpeaks[, "rtmin"]
grt.max <- newpeaks[, "rtmax"]
if (nrow(peaks) - Ngp > 0) { ## notgausspeaks
for (k in 1:nrow(notgausspeaks)) {
## here we can only check if they are completely overlapped
## by other peaks
if (!any((notgausspeaks[k, "rtmin"] >= grt.min) &
(notgausspeaks[k,"rtmax"] <= grt.max)))
newpeaks <- rbind(newpeaks,notgausspeaks[k,])
}
}
rownames(newpeaks) <- NULL
newpeaks
}
trimm <- function(x, trim=c(0.05,0.95)) {
a <- sort(x[x>0])
Na <- length(a)
quant <- round((Na*trim[1])+1):round(Na*trim[2])
a[quant]
}
findEqualGreaterM <- function(x, values) {
if (!is.double(x)) x <- as.double(x)
if (!is.double(values)) values <- as.double(values)
.C(C_FindEqualGreaterM,
x,
length(x),
values,
length(values),
index = integer(length(values)))$index + 1
}
rectUnique <- function(m, order = seq(length = nrow(m)), xdiff = 0, ydiff = 0) {
nr <- nrow(m)
nc <- ncol(m)
if (!is.double(m))
m <- as.double(m)
.C(C_RectUnique,
m,
as.integer(order-1),
nr,
nc,
as.double(xdiff),
as.double(ydiff),
logical(nrow(m)),
DUP = FALSE)[[7]]
}
na.flatfill <- function(x) {
realloc <- which(!is.na(x))
if (realloc[1] > 1)
x[1:(realloc[1]-1)] <- x[realloc[1]]
if (realloc[length(realloc)] < length(x))
x[(realloc[length(realloc)]+1):length(x)] <- x[realloc[length(realloc)]]
x
}
SSgauss <- selfStart(~ h*exp(-(x-mu)^2/(2*sigma^2)), function(mCall, data, LHS) {
xy <- sortedXyData(mCall[["x"]], LHS, data)
len <- dim(xy)[1]
xyarea <- sum((xy[2:len,2]+xy[1:(len-1),2])*(xy[2:len,1]-xy[1:(len-1),1]))/2
maxpos <- which.max(xy[,2])
mu <- xy[maxpos,1]
h <- xy[maxpos,2]
sigma <- xyarea/(h*sqrt(2*pi))
value <- c(mu, sigma, h)
names(value) <- mCall[c("mu", "sigma", "h")]
value
}, c("mu", "sigma", "h"))
.narrow_rt_boundaries <- function(lm, d, thresh = 1) {
lm_seq <- lm[1]:lm[2]
above_thresh <- d[lm_seq] >= thresh
if (any(above_thresh)) {
## Expand by one on each side to be consistent with old code.
above_thresh <- above_thresh | c(above_thresh[-1], FALSE) |
c(FALSE, above_thresh[-length(above_thresh)])
lm <- range(lm_seq[above_thresh], na.rm = TRUE)
}
lm
}
.rawMat <- function(mz, int, scantime, valsPerSpect, mzrange = numeric(), rtrange = numeric(), scanrange = numeric(), log = FALSE) {
if (length(rtrange) >= 2) {
rtrange <- range(rtrange)
## Fix for issue #267. rtrange outside scanrange causes scanrange
## being c(Inf, -Inf)
scns <- which((scantime >= rtrange[1]) & (scantime <= rtrange[2]))
if (!length(scns))
return(matrix(
nrow = 0, ncol = 3,
dimnames = list(character(), c("time", "mz", "intensity"))))
scanrange <- range(scns)
}
if (length(scanrange) < 2)
scanrange <- c(1, length(valsPerSpect))
else scanrange <- range(scanrange)
if (!all(is.finite(scanrange)))
stop("'scanrange' does not contain finite values")
if (!all(is.finite(mzrange)))
stop("'mzrange' does not contain finite values")
if (!all(is.finite(rtrange)))
stop("'rtrange' does not contain finite values")
if (scanrange[1] == 1)
startidx <- 1
else
startidx <- sum(valsPerSpect[1:(scanrange[1] - 1)]) + 1
endidx <- sum(valsPerSpect[1:scanrange[2]])
scans <- rep(scanrange[1]:scanrange[2],
valsPerSpect[scanrange[1]:scanrange[2]])
masses <- mz[startidx:endidx]
massidx <- 1:length(masses)
if (length(mzrange) >= 2) {
mzrange <- range(mzrange)
massidx <- massidx[(masses >= mzrange[1] & (masses <= mzrange[2]))]
}
int <- int[startidx:endidx][massidx]
if (log && (length(int) > 0))
int <- log(int + max(1 - min(int), 0))
cbind(time = scantime[scans[massidx]],
mz = masses[massidx],
intensity = int)
}
.getPeakGroupsRtMatrix <- function(peaks, peakIndex, sampleIndex, missingSample, extraPeaks) {
## For each feature:
## o extract the retention time of the peak with the highest intensity.
## o skip peak groups if they are not assigned a peak in at least a
## minimum number of samples OR if have too many peaks from the same
## sample assigned to it.
nSamples <- length(sampleIndex)
rt <- lapply(peakIndex, function(z) {
cur_fts <- peaks[z, c("rt", "into", "sample"), drop = FALSE]
## Return NULL if we've got less samples that required or is the total
## number of peaks is larger than a certain threshold.
## Note that the original implementation is not completely correct!
## nsamp > nsamp + extraPeaks might be correct.
nsamp <- length(unique(cur_fts[, "sample"]))
if (nsamp < (nSamples - missingSample) |
nrow(cur_fts) > (nsamp + extraPeaks))
return(NULL)
cur_fts[] <- cur_fts[order(cur_fts[, 2], decreasing = TRUE), ]
cur_fts[match(sampleIndex, cur_fts[, 3]), 1]
})
rt <- do.call(rbind, rt)
## Order them by median retention time. NOTE: this is different from the
## original code, in which the peak groups are ordered by the median
## retention time that is calculated over ALL peaks within the peak
## group, not only to one peak selected for each sample (for multi
## peak per sample assignments).
## Fix for issue #175
if (is(rt, "matrix")) {
rt <- rt[order(Biobase::rowMedians(rt, na.rm = TRUE)), , drop = FALSE]
}
rt
}
.peakIndex <- function(object) {
if (inherits(object, "DataFrame")) {
idxs <- object$peakidx
names(idxs) <- rownames(object)
} else {
if (is.null(object[["FeatureGroupTable"]]))
stop("No feature definitions present. Please run groupChromPeaks first.")
idxs <- object[["FeatureGroupTable"]]@listData[["peakidx"]]
names(idxs) <- rownames(object[["FeatureGroupTable"]])
}
idxs
}
.applyRtAdjToChromPeaks <- function(x, rtraw, rtadj) {
## Using a for loop here.
for (i in 1:length(rtraw)) {
whichSample <- which(x[, "sample"] == i)
if (length(whichSample)) {
x[whichSample, c("rt", "rtmin", "rtmax")] <-
.applyRtAdjustment(x[whichSample, c("rt", "rtmin", "rtmax")],
rtraw = rtraw[[i]], rtadj = rtadj[[i]])
}
}
x
}
.applyRtAdjustment <- function(x, rtraw, rtadj) {
## re-order everything if rtraw is not sorted; issue #146
if (is.unsorted(rtraw)) {
idx <- order(rtraw)
rtraw <- rtraw[idx]
rtadj <- rtadj[idx]
}
adjFun <- stepfun(rtraw[-1] - diff(rtraw) / 2, rtadj)
res <- adjFun(x)
## Fix margins.
idx_low <- which(x < rtraw[1])
if (length(idx_low)) {
first_adj <- idx_low[length(idx_low)] + 1
res[idx_low] <- x[idx_low] + res[first_adj] - x[first_adj]
}
idx_high <- which(x > rtraw[length(rtraw)])
if (length(idx_high)) {
last_adj <- idx_high[1] - 1
res[idx_high] <- x[idx_high] + res[last_adj] - x[last_adj]
}
if (is.null(dim(res)))
names(res) <- names(x)
res
}
.getChromPeakData <- function(object, peakArea, sample_idx,mzCenterFun = "weighted.mean",cn = c("mz", "rt", "into", "maxo", "sample")) {
ncols <- length(cn)
res <- matrix(ncol = ncols, nrow = nrow(peakArea))
colnames(res) <- cn
res[, "sample"] <- sample_idx
res[, c("mzmin", "mzmax")] <- peakArea[, c("mzmin", "mzmax")]
## Load the data
message("Requesting ", nrow(res), " peaks from ",
basename(MSnbase::fileNames(object)), " ... ", appendLF = FALSE)
spctr <- MSnbase::spectra(object, BPPARAM = SerialParam())
mzs <- lapply(spctr, mz)
valsPerSpect <- lengths(mzs)
ints <- unlist(lapply(spctr, intensity), use.names = FALSE)
rm(spctr)
mzs <- unlist(mzs, use.names = FALSE)
mzs_range <- range(mzs)
rtim <- rtime(object)
rtim_range <- range(rtim)
for (i in seq_len(nrow(res))) {
rtr <- peakArea[i, c("rtmin", "rtmax")]
mzr <- peakArea[i, c("mzmin", "mzmax")]
## If the rt range is completely out; additional fix for #267
if (rtr[2] < rtim_range[1] | rtr[1] > rtim_range[2] |
mzr[2] < mzs_range[1] | mzr[1] > mzs_range[2]) {
res[i, ] <- rep(NA_real_, ncols)
next
}
## Ensure that the mz and rt region is within the range of the data.
rtr[1] <- max(rtr[1], rtim_range[1])
rtr[2] <- min(rtr[2], rtim_range[2])
mzr[1] <- max(mzr[1], mzs_range[1])
mzr[2] <- min(mzr[2], mzs_range[2])
mtx <- .rawMat(mz = mzs, int = ints, scantime = rtim,
valsPerSpect = valsPerSpect, rtrange = rtr,
mzrange = mzr)
if (length(mtx)) {
if (any(!is.na(mtx[, 3]))) {
## How to calculate the area: (1)sum of all intensities / (2)by
## the number of data points (REAL ones, considering also NAs)
## and multiplied with the (3)rt width.
## (1) sum(mtx[, 3], na.rm = TRUE)
## (2) sum(rtim >= rtr[1] & rtim <= rtr[2]) - 1 ; if we used
## nrow(mtx) here, which would correspond to the non-NA
## intensities within the rt range we don't get the same results
## as e.g. centWave. Using max(1, ... to avoid getting Inf in
## case the signal is based on a single data point.
## (3) rtr[2] - rtr[1]
res[i, "into"] <- sum(mtx[, 3], na.rm = TRUE) *
((rtr[2] - rtr[1]) /
max(1, (sum(rtim >= rtr[1] & rtim <= rtr[2]) - 1)))
maxi <- which.max(mtx[, 3])
res[i, c("rt", "maxo")] <- mtx[maxi[1], c(1, 3)]
res[i, c("rtmin", "rtmax")] <- rtr
## Calculate the intensity weighted mean mz
meanMz <- do.call(mzCenterFun, list(mtx[, 2], mtx[, 3]))
if (is.na(meanMz)) meanMz <- mtx[maxi[1], 2]
res[i, "mz"] <- meanMz
} else {
res[i, ] <- rep(NA_real_, ncols)
}
} else {
res[i, ] <- rep(NA_real_, ncols)
}
}
message("got ", sum(!is.na(res[, "into"])), ".")
return(res)
}
.feature_values <- function(pks, fts, method, value = "into",intensity = "into", colnames, missing = NA) {
ftIdx <- fts$peakidx
## Match columns
idx_rt <- match("rt", colnames(pks))
idx_int <- match(intensity, colnames(pks))
idx_samp <- match("sample", colnames(pks))
vals <- matrix(nrow = length(ftIdx), ncol = length(colnames))
nSamples <- seq_along(colnames)
if (method == "sum") {
for (i in seq_along(ftIdx)) {
cur_pks <- pks[ftIdx[[i]], c(value, "sample")]
int_sum <- split(cur_pks[, value], cur_pks[, "sample"])
vals[i, as.numeric(names(int_sum))] <-
unlist(lapply(int_sum, base::sum), use.names = FALSE)
}
} else {
if (method == "medret") {
medret <- fts$rtmed
for (i in seq_along(ftIdx)) {
gidx <- ftIdx[[i]][
base::order(base::abs(pks[ftIdx[[i]],
idx_rt] - medret[i]))]
vals[i, ] <- gidx[
base::match(nSamples, pks[gidx, idx_samp])]
}
}
if (method == "maxint") {
for (i in seq_along(ftIdx)) {
gidx <- ftIdx[[i]][
base::order(pks[ftIdx[[i]], idx_int],
decreasing = TRUE)]
vals[i, ] <- gidx[base::match(nSamples,
pks[gidx, idx_samp])]
}
}
if (value != "index") {
if (!any(colnames(pks) == value))
stop("Column '", value, "' not present in the ",
"chromatographic peaks matrix!")
vals <- pks[vals, value]
dim(vals) <- c(length(ftIdx), length(nSamples))
}
}
if (value != "index") {
if (is.numeric(missing)) {
vals[is.na(vals)] <- missing
}
if (!is.na(missing) & missing == "rowmin_half") {
for (i in seq_len(nrow(vals))) {
nas <- is.na(vals[i, ])
if (any(nas))
vals[i, nas] <- min(vals[i, ], na.rm = TRUE) / 2
}
}
}
colnames(vals) <- colnames
rownames(vals) <- rownames(fts)
vals
}
.groupval <-function(xset, method = c("medret", "maxint"),value = "into", intensity = "into"){
if ( nrow(xset@groups)<1 || length(xset@groupidx) <1) {
stop("xcmsSet is not been grouped.")
}
method <- match.arg(method)
peakmat <- xset@peaks
groupmat <- xset@groups
groupindex <- xset@groupidx
sampnum <- seq(length = length(rownames(xset@phenoData)))
retcol <- match("rt", colnames(peakmat))
intcol <- match(intensity, colnames(peakmat))
sampcol <- match("sample", colnames(peakmat))
values <- matrix(nrow = length(groupindex), ncol = length(sampnum))
if (method == "medret") {
for (i in seq(along = groupindex)) {
gidx <- groupindex[[i]][order(abs(peakmat[groupindex[[i]],retcol] - median(peakmat[groupindex[[i]],retcol])))]
values[i,] <- gidx[match(sampnum, peakmat[gidx,sampcol])]
}
} else {
for (i in seq(along = groupindex)) {
gidx <- groupindex[[i]][order(peakmat[groupindex[[i]],intcol], decreasing = TRUE)]
values[i,] <- gidx[match(sampnum, peakmat[gidx,sampcol])]
}
}
if (value != "index") {
values <- peakmat[values,value]
dim(values) <- c(length(groupindex), length(sampnum))
}
colnames(values) <- rownames(xset@phenoData)
rownames(values) <- paste(round(groupmat[,"mzmed"],1), round(groupmat[,"rtmed"]), sep = "/")
values
}
### ---- ===== MatchedFilter Sub Kit ==== ---- #####
binYonX <- function(x, y, breaks, nBins, binSize, binFromX,
binToX, fromIdx = 1L, toIdx = length(x),
method = "max", baseValue,
sortedX = !is.unsorted(x),
shiftByHalfBinSize = FALSE,
returnIndex = FALSE, returnX = TRUE) {
if (!missing(x) & missing(y))
y <- x
if (missing(x) | missing(y))
stop("Arguments 'x' and 'y' are mandatory!")
if (missing(nBins) & missing(binSize) & missing(breaks))
stop("One of 'breaks', 'nBins' or 'binSize' has to be defined!")
if (!sortedX) {
message("'x' is not sorted, will sort 'x' and 'y'.")
## Sort method; see issue #180 for MSnbase
## Note: order method = "radix" is considerably faster - but there is no
## method argument for older R versions.
o <- order(x)
x <- x[o]
y <- y[o]
}
## Check fromIdx and toIdx
if (any(fromIdx < 1) | any(toIdx > length(x)))
stop("'fromIdx' and 'toIdx' have to be within 1 and lenght(x)!")
if (length(toIdx) != length(fromIdx))
stop("'fromIdx' and 'toIdx' have to have the same length!")
if (missing(binFromX))
binFromX <- as.double(NA)
if (missing(binToX))
binToX <- as.double(NA)
## For now we don't allow NAs in x
if (anyNA(x))
stop("No 'NA' values are allowed in 'x'!")
## Check that input values have the correct data types.
if (!is.double(x)) x <- as.double(x)
if (!is.double(y)) y <- as.double(y)
if (!is.double(binFromX)) binFromX <- as.double(binFromX)
if (!is.double(binToX)) binToX <- as.double(binToX)
if (!is.integer(fromIdx)) fromIdx <- as.integer(fromIdx)
if (!is.integer(toIdx)) toIdx <- as.integer(toIdx)
## breaks has precedence over nBins over binSize.
shiftIt <- 0L
if (!missing(breaks)) {
if (shiftByHalfBinSize)
warning("Argument 'shiftByHalfBinSize' is ignored if 'breaks'",
" are provided.")
if (!is.double(breaks)) breaks <- as.double(nBins)
nBins <- NA_integer_
binSize <- as.double(NA)
} else {
if (!missing(nBins)) {
breaks <- as.double(NA)
if (!is.integer(nBins)) nBins <- as.integer(nBins)
binSize <- as.double(NA)
} else{
breaks <- as.double(NA)
nBins <- NA_integer_
if (!is.double(binSize)) binSize <- as.double(binSize)
}
}
if (shiftByHalfBinSize)
shiftIt <- 1L
## Define default value for baseValue
if (missing(baseValue)) {
baseValue = as.double(NA)
} else {
if (!is.double(baseValue)) baseValue <- as.double(baseValue)
}
getIndex <- 0L
if (returnIndex)
getIndex <- 1L
getX <- 0L
if (returnX)
getX <- 1L
if (length(toIdx) > 1) {
.Call(C_binYonX_multi, x, y, breaks, nBins, binSize,
binFromX, binToX, force(fromIdx - 1L), force(toIdx - 1L),
shiftIt,
as.integer(.aggregateMethod2int(method)),
baseValue,
getIndex,
getX,
PACKAGE = "MetaboAnalystR")
} else {
.Call(C_binYonX, x, y, breaks, nBins, binSize,
binFromX, binToX, force(fromIdx - 1L), force(toIdx - 1L),
shiftIt,
as.integer(.aggregateMethod2int(method)),
baseValue,
getIndex,
getX,
PACKAGE = "MetaboAnalystR")
}
}
.aggregateMethod2int <- function(method = "max") {
.aggregateMethods <- c(1, 2, 3, 4);
names(.aggregateMethods) <- c("max", "min", "sum", "mean")
method <- match.arg(method, names(.aggregateMethods))
return(.aggregateMethods[method])
}
imputeLinInterpol <- function(x, baseValue, method = "lin", distance = 1L,
noInterpolAtEnds = FALSE) {
method <- match.arg(method, c("none", "lin", "linbase")) ## interDist? distance = 2
if (method == "none") {
return(x)
}
if (!is.double(x)) x <- as.double(x)
if (method == "lin") {
noInter <- 0L
if (noInterpolAtEnds)
noInter <- 1L
return(.Call(C_impute_with_linear_interpolation, x, noInter,
PACKAGE = "MetaboAnalystR"))
}
if (method == "linbase") {
if (missing(baseValue))
baseValue <- min(x, na.rm = TRUE) / 2
if (!is.double(baseValue)) baseValue <- as.double(baseValue)
if (!is.integer(distance)) distance <- as.integer(distance)
return(.Call(C_impute_with_linear_interpolation_base, x, baseValue,
distance, PACKAGE = "MetaboAnalystR"))
}
}
colMax <- function (x, na.rm = FALSE, dims = 1) {
if (is.data.frame(x))
x <- as.matrix(x)
if (!is.array(x) || length(dn <- dim(x)) < 2)
stop("`x' must be an array of at least two dimensions")
if (dims < 1 || dims > length(dn) - 1)
stop("invalid `dims'")
n <- prod(dn[1:dims])
dn <- dn[-(1:dims)]
if (!is.double(x)) x <- as.double(x)
z <- .C(C_ColMax,
x,
as.integer(n),
as.integer(prod(dn)),
double(prod(dn)),
PACKAGE = "MetaboAnalystR")[[4]]
if (length(dn) > 1) {
dim(z) <- dn
dimnames(z) <- dimnames(x)[-(1:dims)]
}
else names(z) <- dimnames(x)[[dims + 1]]
z
}
filtfft <- function(y, filt) {
yfilt <- numeric(length(filt))
yfilt[1:length(y)] <- y
yfilt <- fft(fft(yfilt, inverse = TRUE) * filt)
Re(yfilt[1:length(y)])
}
descendZero <- function(y, istart = which.max(y)) {
if (!is.double(y)) y <- as.double(y)
unlist(.C(C_DescendZero,
y,
length(y),
as.integer(istart-1),
ilower = integer(1),
iupper = integer(1),
PACKAGE = "MetaboAnalystR")[4:5]) + 1
}
which.colMax <- function (x, na.rm = FALSE, dims = 1) {
if (is.data.frame(x))
x <- as.matrix(x)
if (!is.array(x) || length(dn <- dim(x)) < 2)
stop("`x' must be an array of at least two dimensions")
if (dims < 1 || dims > length(dn) - 1)
stop("invalid `dims'")
n <- prod(dn[1:dims])
dn <- dn[-(1:dims)]
if (!is.double(x)) x <- as.double(x)
z <- .C(C_WhichColMax,
x,
as.integer(n),
as.integer(prod(dn)),
integer(prod(dn)),
PACKAGE = "MetaboAnalystR")[[4]]
if (length(dn) > 1) {
dim(z) <- dn
dimnames(z) <- dimnames(x)[-(1:dims)]
}
else names(z) <- dimnames(x)[[dims + 1]]
z
}
breaks_on_nBins <- function(fromX, toX, nBins, shiftByHalfBinSize = FALSE) {
if(missing(fromX) | missing(toX) | missing(nBins))
stop("'fromX', 'toX' and 'nBins' are required!")
if (!is.double(fromX)) fromX <- as.double(fromX)
if (!is.double(toX)) toX <- as.double(toX)
if (!is.integer(nBins)) nBins <- as.integer(nBins)
shiftIt <- 0L
if (shiftByHalfBinSize)
shiftIt <- 1L
return(.Call(C_breaks_on_nBins, fromX, toX, nBins, shiftIt,
PACKAGE = "MetaboAnalystR"))
}
### ---- ==== Botthom of this sub Kit ==== --- ####
#
######### --------------======== Bottom of this Function Kit =============--------
# ---------------------------2. Peaks Picking_Section-------------------------------
### Functions_Peak Peaking _ used for parameters optimization
#' @title Data Preparation for ChromPeaking Finding
#' @param object MSnExp object.
#' @import MSnbase
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
PeakPicking_prep <-function(object){
## Check if the data is centroided
centroided <- all(centroided(object))
if (is.na(centroided)) {
suppressWarnings(
centroided <- isCentroided(
object[[ceiling(length(object) / 3)]])
)
}
if (is.na(centroided) || !centroided)
warning("Your data appears to be not centroided! CentWave",
" works best on data in centroid mode.")
##### Revise for Centwave MSnExp -------------
## (1) split the object per file from MSnExp.
object_mslevel_i<-splitByFile(object = object, f = factor(c(1:length(object@phenoData@data[["sample_name"]]))))
object_mslevel_o<-bplapply(object_mslevel_i, FUN = function(x){x@assayData})
message("Data Spliting Finished !")
## (2) use parallel to do the peak detection.
message("Peak Preparing Begin...")
object_mslevel_name<-bplapply(object_mslevel_o,ls);
object_mslevell<-object_mslevel<-list();
for (j in 1:length(object_mslevel_o)){
for (i in 1:length(object_mslevel_name[[j]])){
#print(i,j,"\n")
object_mslevell[[i]]<-object_mslevel_o[[j]][[object_mslevel_name[[j]][i]]]
}
names(object_mslevell)<-object_mslevel_name[[j]];
object_mslevel[[j]]<-object_mslevell;
}
for (i in 1:length(object_mslevel)){
#for (j in 1:length(object_mslevel[[i]])){
ncount<-as.numeric(which(sapply(names(object_mslevel[[i]]),is.na)));
if (!identical(ncount,numeric(0))){
object_mslevel[[i]]<-object_mslevel[[i]][-ncount]
}
#}
}
message("Peak Preparing Done !")
return(object_mslevel)
}
#' @title Calculate PPS method
#' @description Peak picking method. Specifically used for parameters optimization
#' @param xset MSnExp object.
#' @param object_mslevel List, prepared by findChromPeaks_prep function.
#' @param param Parameters list.
#' @param BPPARAM Parallel Method.
#' @param msLevel msLevel. Only 1 is supported currently.
#' @import MSnbase
#' @author Zhiqiang Pang \email{zhiqiang.pang@mail.mcgill.ca}
#' Mcgill University
#' License: GNU GPL (>= 2)
PeakPicking_core <-function(object,object_mslevel,param, BPPARAM = bpparam(), msLevel = 1L){
## Restrict to MS 1 data for now.
if (length(msLevel) > 1)
stop("Currently only peak detection in a single MS level is ",
"supported", call. = FALSE)
if (is.null(param$fwhm)){
resList <- bplapply(object_mslevel,
FUN = PeakPicking_centWave_slave,
param = param,
BPPARAM = BPPARAM)
} else {
resList <- bplapply(object_mslevel,
FUN = PeakPicking_MatchedFilter_slave,
param = param,
BPPARAM = BPPARAM)
}
print("Peak Profiling Finished !")
rm(object_mslevel)
## (3) PEAK SUMMARY------------
pks <- vector("list", length(resList))
for (i in 1:length(resList)) {
n_pks <- nrow(resList[[i]])
if (is.null(n_pks))
n_pks <- 0
if (n_pks == 0) {
pks[[i]] <- NULL
warning("No peaks found in sample number ", i, ".")
} else {
pks[[i]] <- cbind(resList[[i]], sample = rep.int(i, n_pks))
}
}
pks <- do.call(rbind, pks)
# Make the chrompeaks embeded
newFd <- list()
#newFd@.xData <- as.environment(as.list(object@msFeatureData, all.names = TRUE))
rownames(pks) <- sprintf(paste0("CP", "%0", ceiling(log10(nrow(pks) + 1L)), "d"),
seq(from = 1, length.out = nrow(pks)))
newFd$chromPeaks <- pks
newFd$chromPeakData <- S4Vectors::DataFrame(ms_level = rep(1L, nrow(pks)), is_filled = rep(FALSE, nrow(pks)),
row.names = rownames(pks))
## mSet Generation
mSet<-list()
mSet$msFeatureData <- newFd
mSet$inMemoryData <- object
return(mSet)
}
# -------------------------------3. MS2 Data_Section---------------------------------
#' Extract MS2 Data
#' @description This function returns a list of spectra that matches with
#' a user specified precursor m/z.
#' @param filename Name of the file (e.g. mzML, mzXML)
#' @param peakParams Object containing parameters for peak picking.
#' @param mzmin Minimum m/z when selecting a precursor from peak list
#' @param mzmax Maximum m/z when selecting a precursor from peak list
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
ExtractMS2data <- function(filename, peakParams, mzmin, mzmax){
## Function to extract MSMS spectra
xrms <- readMSData(filename, mode = "onDisk", centroided = TRUE)
# get all MS2 spectra from DDA experiment
ms2spectra <- spectra(filterMsLevel(xrms, msLevel = 2))
## Detect chromatographic peaks
# set parameters and find chromatographic peaks
ms1cwp <- CentWaveParam(snthresh = peakParams$sn_thresh,
noise = 100, ppm = peakParams$ppm, peakwidth = c(peakParams$min_pkw, peakParams$max_pkw))
ms1data <- findChromPeaks(xrms, param = ms1cwp, msLevel = 1)
#get all peaks
chromPeaks <- chromPeaks(ms1data)
# pick one compound
## find which row contains the mass we want
chp <- as.data.frame(chromPeaks)
rownum <- which(chp$mz >= mzmin & chp$mz <= mzmax)
chromPeak <- chromPeaks[rownum,]
#filter out fitting spectra
filteredMs2spectra <- .getDdaMS2Scans(chromPeak, ms2spectra)
return(filteredMs2spectra)
}
### Helper function
### function for DDA data
### Credit: Michael Witting; https://github.com/michaelwitting/metabolomics2018/blob/master/R/msLevelMerge.R
### MS2 spectra need to be list of spectrum2 objects
.getDdaMS2Scans <- function(chromPeak, ms2spectra, mzTol = 0.01, rtTol = 20) {
#make a new list
filteredMs2spectra <- list()
#isolate only the ones within range
for(i in 1:length(ms2spectra)) {
#isolate Ms2 spectrum
ms2spectrum <- ms2spectra[[i]]
#check if within range of peak
if(abs(chromPeak[4] - ms2spectrum@rt) < rtTol & abs(chromPeak[1] - ms2spectrum@precursorMz) < mzTol) {
filteredMs2spectra <- c(filteredMs2spectra, ms2spectrum)
}
}
#return list with filtered ms2 spectra
return(filteredMs2spectra)
}
#' Plot selected M2 spectra for an m/z feature
#' @description This function creates a plot of the user selected precursor m/z.
#' @param ms2 Spectrum2 class object containing all of the spectra for the selected
#' m/z feature.
#' @author Jasmine Chong \email{jasmine.chong@mail.mcgill.ca},
#' Mai Yamamoto \email{yamamoto.mai@mail.mcgill.ca}, and Jeff Xia \email{jeff.xia@mcgill.ca}
#' McGill University, Canada
#' License: GNU GPL (>= 2)
PlotMS2Spectra <- function(ms2, spectra = 1){
MSnbase::plot(ms2[[spectra]])
}
# -----------------------------------4. Annotation-----------------------
##' @references Kuhl C, Tautenhahn R, Boettcher C, Larson TR, Neumann S (2012).
##' "CAMERA: an integrated strategy for compound spectra extraction and annotation of
##' liquid chromatography/mass spectrometry data sets." Analytical Chemistry, 84, 283-289.
##' http://pubs.acs.org/doi/abs/10.1021/ac202450g.
####### ---------- ====== Function Kit From CAMERA ======= ----------- ######/
getPeaks_selection <- function(xs, method="medret", value="into"){
if (!class(xs) == "xcmsSet") {
stop ("Parameter xs is no xcmsSet object\n")
}
# Testing if xcmsSet is grouped
if (nrow(xs@groups) > 0 && length(xs@filepaths) > 1) {
# get grouping information
groupmat <- xs@groups
# generate data.frame for peaktable
ts <- data.frame(cbind(groupmat, groupval(xs, method=method, value=value)), row.names = NULL)
#rename column names
cnames <- colnames(ts)
if (cnames[1] == 'mzmed') {
cnames[1] <- 'mz'
} else {
stop ('Peak information ?!?')
}
if (cnames[4] == 'rtmed') {
cnames[4] <- 'rt'
} else {
stop ('Peak information ?!?')
}
colnames(ts) <- cnames
} else if (length(rownames(xs@phenoData)) == 1) { #Contains only one sample?
ts <- xs@peaks
} else {
stop ('First argument must be a xcmsSet with group information or contain only one sample.')
}
return(as.matrix(ts))
}
groupval <- function(object, method = c("medret", "maxint"), value = "index", intensity = "into") {
if ( nrow(object@groups)<1 || length(object@groupidx) <1) {
stop("mSet$xcmsSet is not been grouped.")
}
method <- match.arg(method)
peakmat <- object@peaks; groupmat <- object@groups; groupindex <- object@groupidx
sampnum <- seq(length = length(rownames(object@phenoData)))
retcol <- match("rt", colnames(peakmat))
intcol <- match(intensity, colnames(peakmat))
sampcol <- match("sample", colnames(peakmat))
values <- matrix(nrow = length(groupindex), ncol = length(sampnum))
if (method == "medret") {
for (i in seq(along = groupindex)) {
gidx <- groupindex[[i]][order(abs(peakmat[groupindex[[i]],retcol] - median(peakmat[groupindex[[i]],retcol])))]
values[i,] <- gidx[match(sampnum, peakmat[gidx,sampcol])]
}
} else {
for (i in seq(along = groupindex)) {
gidx <- groupindex[[i]][order(peakmat[groupindex[[i]],intcol], decreasing = TRUE)]
values[i,] <- gidx[match(sampnum, peakmat[gidx,sampcol])]
}
}
if (value != "index") {
values <- peakmat[values,value]
dim(values) <- c(length(groupindex), length(sampnum))
}
colnames(values) <- rownames(object@phenoData)
rownames(values) <- paste(round(groupmat[,"mzmed"],1), round(groupmat[,"rtmed"]), sep = "/")
values
}
percentOutput <- function(len.old, len.add, len.max, cnt){
len <- len.old + len.add;
perc <- round((len) / len.max * 100)
perc <- perc %/% 10 * 10;
if (perc != cnt && perc != 0) {
print(perc,' ');
cnt2 <- perc;
eval.parent(substitute(cnt <- cnt2))
}
if (.Platform$OS.type == "windows"){
flush.console();
}
eval.parent(substitute(len.old <- len))
}
is.wholenumber <- function(x, tol = .Machine$double.eps^0.5) abs(x - round(x)) < tol
calcIsotopeMatrix <- function(maxiso=4){
if(!is.numeric(maxiso)){
stop("Parameter maxiso is not numeric!\n")
} else if(maxiso < 1 | maxiso > 8){
stop(paste("Parameter maxiso must between 1 and 8. ",
"Otherwise use your own IsotopeMatrix.\n"),sep="")
}
isotopeMatrix <- matrix(NA, 8, 4);
colnames(isotopeMatrix) <- c("mzmin", "mzmax", "intmin", "intmax")
isotopeMatrix[1, ] <- c(1.000, 1.0040, 1.0, 150)
isotopeMatrix[2, ] <- c(0.997, 1.0040, 0.01, 200)
isotopeMatrix[3, ] <- c(1.000, 1.0040, 0.001, 200)
isotopeMatrix[4, ] <- c(1.000, 1.0040, 0.0001, 200)
isotopeMatrix[5, ] <- c(1.000, 1.0040, 0.00001, 200)
isotopeMatrix[6, ] <- c(1.000, 1.0040, 0.000001, 200)
isotopeMatrix[7, ] <- c(1.000, 1.0040, 0.0000001, 200)
isotopeMatrix[8, ] <- c(1.000, 1.0040, 0.00000001, 200)
return(isotopeMatrix[1:maxiso, , drop=FALSE])
}
findIsotopesPspec <- function(isomatrix, mz, ipeak, int, params){
#isomatrix - isotope annotations (5 column matrix)
#mz - m/z vector, contains all m/z values from specific pseudospectrum
#int - int vector, see above
#maxiso - how many isotopic peaks are allowed
#maxcharge - maximum allowed charge
#devppm - scaled ppm error
#mzabs - absolut error in m/z
#matrix with all important informationen
spectra <- matrix(c(mz, ipeak), ncol=2)
int <- int[order(spectra[, 1]), , drop=FALSE]
spectra <- spectra[order(spectra[, 1]), ];
cnt <- nrow(spectra);
#isomatrix <- matrix(NA, ncol=5, nrow=0)
#colnames(isomatrix) <- c("mpeak", "isopeak", "iso", "charge", "intrinsic")
#calculate error
error.ppm <- params$devppm * mz;
#error.abs <- ),1, function(x) x + params$mzabs*rbind(1,2,3)));
#for every peak in pseudospectrum
for ( j in 1:(length(mz) - 1)){
#create distance matrix
MI <- spectra[j:cnt, 1] - spectra[j, 1];
#Sum up all possible/allowed isotope distances + error(ppm of peak mz and mzabs)
max.index <- max(which(MI < (sum(params$IM[1:params$maxiso, "mzmax"]) + error.ppm[j] + params$mzabs )))
#check if one peaks falls into isotope window
if(max.index == 1){
#no promising candidate found, move on
next;
}
#IM - isotope matrix (column diffs(min,max) per charge, row num. isotope)
IM <- t(sapply(1:params$maxcharge,function(x){
mzmin <- (params$IM[, "mzmin"]) / x;
mzmax <- (params$IM[, "mzmax"]) / x;
error <- (error.ppm[j]+params$mzabs) / x
res <- c(0,0);
for(k in 1:length(mzmin)){
res <- c(res, mzmin[k]+res[2*k-1], mzmax[k]+res[2*k])
}
res[seq(1,length(res),by=2)] <- res[seq(1,length(res),by=2)]-error
res[seq(2,length(res),by=2)] <- res[seq(2,length(res),by=2)]+error
return (res[-c(1:2)])
} ))
#Sort IM to fix bug, with high ppm and mzabs values
#TODO: Find better solution and give feedback to user!
IM <- t(apply(IM,1,sort))
#find peaks, which m/z value is in isotope interval
hits <- t(apply(IM, 1, function(x){ findInterval(MI[1:max.index], x)}))
rownames(hits) <- c(1:nrow(hits))
colnames(hits) <- c(1:ncol(hits))
hits[which(hits==0)] <-NA
hits <- hits[, -1, drop=FALSE]
hits.iso <- hits%/%2 + 1;
#check occurence of first isotopic peak
for(iso in 1:min(params$maxiso,ncol(hits.iso))){
hit <- apply(hits.iso,1, function(x) any(naOmit(x)==iso))
hit[which(is.na(hit))] <- TRUE
if(all(hit)) break;
hits.iso[!hit,] <- t(apply(hits.iso[!hit,,drop=FALSE],1, function(x) {
if(!all(is.na(x))){
ini <- which(x > iso)
if(!is.infinite(ini) && length(ini) > 0){
x[min(ini):ncol(hits.iso)] <- NA
}
}
x
}))
}
#set NA to 0
hits[which(is.na(hits.iso))] <- 0
#check if any isotope is found
hit <- apply(hits, 1, function(x) sum(x)>0)
#drop nonhits
hits <- hits[hit, , drop=FALSE]
#if no first isotopic peaks exists, next
if(nrow(hits) == 0){
next;
}
#getting max. isotope cluster length
#TODO: unique or not????
#isolength <- apply(hits, 1, function(x) length(which(unique(x) %% 2 !=0)))
#isohits - for each charge, length of peak within intervals
isohits <- lapply(1:nrow(hits), function(x) which(hits[x, ] %% 2 !=0))
isolength <- sapply(isohits, length)
#Check if any result is found
if(all(isolength==0)){
next;
}
#itensity checks
#candidate.matrix
#first column - how often succeded the isotope intensity test
#second column - how often could a isotope int test be performed
candidate.matrix <- matrix(0, nrow=length(isohits), ncol=max(isolength)*2);
for(iso in 1:length(isohits)){
for(candidate in 1:length(isohits[[iso]])){
for(sample.index in c(1:ncol(int))){
#Test if C12 Peak is NA
if(!is.na(int[j, sample.index])){
#candidate.matrix[maxIso, 1] <- candidate.matrix[maxIso, 1] + 1
}
charge <- as.numeric(row.names(hits)[iso])
int.c12 <- int[j, sample.index]
isotopePeak <- hits[iso,isohits[[iso]][candidate]]%/%2 + 1;
if(isotopePeak == 1){
#first isotopic peak, check C13 rule
int.c13 <- int[isohits[[iso]][candidate]+j, sample.index];
int.available <- all(!is.na(c(int.c12, int.c13)))
if (int.available){
theo.mass <- spectra[j, 1] * charge; #theoretical mass
numC <- abs(round(theo.mass / 12)); #max. number of C in molecule
inten.max <- int.c12 * numC * 0.011; #highest possible intensity
inten.min <- int.c12 * 1 * 0.011; #lowest possible intensity
if((int.c13 < inten.max && int.c13 > inten.min) || !params$filter){
candidate.matrix[iso,candidate * 2 - 1] <- candidate.matrix[iso,candidate * 2 - 1] + 1
candidate.matrix[iso,candidate * 2 ] <- candidate.matrix[iso,candidate * 2] + 1
}else{
candidate.matrix[iso,candidate * 2 ] <- candidate.matrix[iso,candidate * 2] + 1
}
} else {
#todo
}
} else {
#x isotopic peak
int.cx <- int[isohits[[iso]][candidate]+j, sample.index];
int.available <- all(!is.na(c(int.c12, int.cx)))
if (int.available) {
intrange <- c((int.c12 * params$IM[isotopePeak,"intmin"]/100),
(int.c12 * params$IM[isotopePeak,"intmax"]/100))
#filter Cx isotopic peaks muss be smaller than c12
if(int.cx < intrange[2] && int.cx > intrange[1]){
candidate.matrix[iso,candidate * 2 - 1] <- candidate.matrix[iso,candidate * 2 - 1] + 1
candidate.matrix[iso,candidate * 2 ] <- candidate.matrix[iso,candidate * 2] + 1
}else{
candidate.matrix[iso,candidate * 2 ] <- candidate.matrix[iso,candidate * 2] + 1
}
} else {
candidate.matrix[iso,candidate * 2 ] <- candidate.matrix[iso,candidate * 2] + 1
}#end int.available
}#end if first isotopic peak
}#for loop samples
}#for loop candidate
}#for loop isohits
#calculate ratios
candidate.ratio <- candidate.matrix[, seq(from=1, to=ncol(candidate.matrix),
by=2)] / candidate.matrix[, seq(from=2,
to=ncol(candidate.matrix), by=2)];
if(is.null(dim(candidate.ratio))){
candidate.ratio <- matrix(candidate.ratio, nrow=nrow(candidate.matrix))
}
if(any(is.nan(candidate.ratio))){
candidate.ratio[which(is.nan(candidate.ratio))] <- 0;
}
#decision between multiple charges or peaks
for(charge in 1:nrow(candidate.matrix)){
if(any(duplicated(hits[charge, isohits[[charge]]]))){
#One isotope peaks has more than one candidate
##check if problem is still consistent
for(iso in unique(hits[charge, isohits[[charge]]])){
if(length(index <- which(hits[charge, isohits[[charge]]]==iso))== 1){
#now duplicates next
next;
}else{
#find best
index2 <- which.max(candidate.ratio[charge, index]);
save.ratio <- candidate.ratio[charge, index[index2]]
candidate.ratio[charge,index] <- 0
candidate.ratio[charge,index[index2]] <- save.ratio
index <- index[-index2]
isohits[[charge]] <- isohits[[charge]][-index]
}
}
}#end if
for(isotope in 1:ncol(candidate.ratio)){
if(candidate.ratio[charge, isotope] >= params$minfrac){
isomatrix <- rbind(isomatrix,
c(spectra[j, 2],
spectra[isohits[[charge]][isotope]+j, 2],
isotope, as.numeric(row.names(hits)[charge]), 0))
} else{
break;
}
}
}#end for charge
}#end for j
return(isomatrix)
}
getEICs <- function(xraw,peaks,maxscans=length(xraw@scantime)) {
npeaks <- dim(peaks)[1]; scans <- length(xraw@scantime)
eics <- matrix(as.numeric(0),npeaks,maxscans)
for (p in 1:npeaks) {
eics[p,1:scans] <- as.integer(getEIC(xraw,massrange=c(peaks[p,"mzmin"],peaks[p,"mzmax"]))$intensity)
}
eics
}
create.matrix <- function(dim1,dim2) {
x <- matrix()
length(x) <- dim1*dim2
dim(x) <- c(dim1,dim2)
x
}
getAllPeakEICs <- function(mSet, index=NULL){
#Checking parameter index
if(is.null(index)){
stop("Parameter index is not set.\n")
}else if(length(index) != nrow(mSet$AnnotateObject$groupInfo)){
stop("Index length must equals number of peaks.\n")
}
nfiles <- length(mSet$xcmsSet@filepaths)
scantimes <- list()
if(nfiles == 1){
#Single sample experiment
if (file.exists(mSet$xcmsSet@filepaths[1])) {
mSet_splits <- mSet$onDiskData
scantimes[[1]] <- scan_length <- sapply(mSet_splits, length)
maxscans <- max(scan_length)
#xraw <- xcmsRaw(filepaths(mSet$xcmsSet)[1],profstep=0)
#maxscans <- length(xraw@scantime)
#scantimes[[1]] <- xraw@scantime
mset$onDiskData <- mSet_splits
pdata <- as.data.frame(mSet$xcmsSet@peaks)
EIC <- create.matrix(nrow(pdata),maxscans)
EIC[,] <- getEIC4Peaks(mset,pdata,maxscans)#getEIC4Peaks(xraw,pdata,maxscans)
} else {
stop('Raw data file:',mSet$xcmsSet@filepaths[1],' not found ! \n');
}
} else {
#Multiple sample experiment
gval <- groupval(mSet$xcmsSet);
message('Generating EIC\'s . ',appendLF = F)
#na flag, stores if sample contains NA peaks
na.flag <- 0; maxscans <- 0;
mSet_splits <- split(mSet$onDiskData,fromFile(mSet$onDiskData))
message('.',appendLF = F)
if (file.exists(mSet$xcmsSet@filepaths[1])) {
scan_length <- sapply(mSet_splits, length)
maxscans <- max(scan_length)
message('.',appendLF = F)
} else {
stop('Raw data file:',mSet$xcmsSet@filepaths[1],' not found ! \n');
}
#generate EIC Matrix
EIC <- create.matrix(nrow(gval),maxscans)
mset <-list();mset$env <- new.env(parent=.GlobalEnv)
message('.')
#loop over all samples
for (f in 1:nfiles){
message(paste("Detecting",basename(mSet$xcmsSet@filepaths)[f]," ... "),appendLF = F)
#which peaks should read from this sample
idx.peaks <- which(index == f);
#check if we need data from sample f
if(length(idx.peaks) == 0){
next;
}
#check if raw data file of sample f exists
if (file.exists(mSet$xcmsSet@filepaths[f])) {
#read sample
scantimes[[f]] <- scan_length[f]
pdata <- as.data.frame(mSet$xcmsSet@peaks[gval[idx.peaks,f],,drop=FALSE]) # data for peaks from file f
#Check if peak data include NA values
if(length(which(is.na(pdata[,1]))) > 0){
na.flag <- 1;
}
mset$onDiskData <- mSet_splits[[f]]
#Generate raw data according to peak data
EIC[idx.peaks,] <- getEIC4Peaks(mset,pdata,maxscans)
} else {
stop('Raw data file:',mSet$xcmsSet@filepaths[f],' not found ! \n')
}
}
if(na.flag ==1){
print("Warning: Found NA peaks in selected sample.");
}
}
invisible(list(scantimes=scantimes,EIC=EIC));
}
getEIC4Peaks <- function(mset,peaks,maxscans){
mset$env$mz <- lapply(MSnbase::spectra(mset$onDiskData, BPPARAM = SerialParam()), mz)
mset$env$intensity <- MSnbase::intensity(mset$onDiskData, BPPARAM = SerialParam())
mset$scantime <- MSnbase::rtime(mset$onDiskData)
valCount <- cumsum(lengths(mset$env$mz, FALSE))
mset$scanindex <- as.integer(c(0, valCount[-length(valCount)])) ## Get index vector for C calls
npeaks <- dim(peaks)[1];
scans <- length(mset$scantime);
eics <- matrix(NA,npeaks,maxscans);
mset$env$mz <- as.double(unlist(mset$env$mz))
mset$env$intensity <- as.double(unlist(mset$env$intensity))
scan_time_leng <- as.integer(length(mset$scantime))
message(npeaks," peaks found ! ")
for (p in 1:npeaks) {
timerange <- c(peaks[p,"rtmin"],peaks[p,"rtmax"]);
tidx <- which((mset$scantime >= timerange[1]) & (mset$scantime <= timerange[2]));
if(length(tidx)>0){
scanrange <- range(tidx);
}else{
scanrange <- 1:scans;
}
massrange <- c(peaks[p,"mzmin"],peaks[p,"mzmax"]);
eic <- .Call(C_getEIC,
mset$env$mz,
mset$env$intensity,
as.integer(mset$scanindex),
as.double(massrange),
as.integer(scanrange),
scan_time_leng,
PACKAGE ='MetaboAnalystR' )$intensity;
eic[eic==0] <- NA;
eics[p,scanrange[1]:scanrange[2]] <- eic;
}
eics
}
calcCiS<- function(mSet, EIC=EIC, corval=0.75, pval=0.05, psg_list=NULL ){
#Columns Peak 1, Peak 2, correlation coefficienct, Pseudospectrum Index
resMat <- create.matrix(100000,4);
colnames(resMat) <- c("x","y","cor","ps")
cnt <- 0;
npeaks <- nrow(mSet$AnnotateObject$groupInfo);
ncl <- sum(sapply(mSet$AnnotateObject$pspectra, length));
npeaks.global <- 0; #Counter for % bar
npspectra <- length(mSet$AnnotateObject$pspectra);
#Check if object have been preprocessed with groupFWHM
if(npspectra < 1){
npspectra <- 1;
mSet$AnnotateObject$pspectra[[1]] <- seq(1:nrow(mSet$AnnotateObject$groupInfo));
print('Calculating peak correlations in 1 group. ');
lp <- -1;
pspectra_list <- 1;
mSet$AnnotateObject$psSamples <- 1;
}else{
if(is.null(psg_list)){
print(paste('Calculating peak correlations in',npspectra,'Groups... '));
lp <- -1;
pspectra_list <- 1:npspectra;
}else{
print(paste('Calculating peak correlations in',length(psg_list),'Groups... '));
lp <- -1;
pspectra_list <- psg_list;
ncl <- sum(sapply(mSet$AnnotateObject$pspectra[psg_list],length));
}
}
if(dim(EIC)[2] != npeaks){
EIC <- t(EIC);
#Second check, otherwise number of peaks != number of EIC curves
if(dim(EIC)[2] != npeaks){
stop(paste("Wrong dimension of EIC. It has ",dim(EIC)[1]," Rows for ",npeaks,"peaks",sep=""));
}
}
lp <- -1;
#Iterate over all PS-spectra
pb <- progress_bar$new(format = "Generating [:bar] :percent Time left: :eta", total = length(pspectra_list), clear = T, width= 75)
for(j in 1:length(pspectra_list)){
i <- pspectra_list[j];
pi <- mSet$AnnotateObject$pspectra[[i]];
npi <- length(pi);
if( ((npi^2)/2 + cnt) >= nrow(resMat)){
#resize resMat
resMat <- rbind(resMat,create.matrix(100000,4));
}
pb$tick();
#Need at least two peaks for correlation
if(npi < 2) {
next;
}
#Suppress warnings
options(warn = -1);
res <- Hmisc::rcorr(EIC[, pi], type="pearson")
options(warn = 0);
#Set lower triangle to NA
res$r[lower.tri(res$r,diag = TRUE)] <- NA;
res$P[lower.tri(res$P,diag = TRUE)] <- NA;
#Find peaks with have correlation higher corr_threshold and p <= 0.05
index <- which(( res$r > corval) & (res$P <= pval))
if( (length(index) + cnt) >= nrow(resMat)){
#resize resMat
resMat <- rbind(resMat, create.matrix(max(length(index)+1,100000),4));
}
if(length(index) > 0){
for( x in 1:(length(index))){
col <- index[x] %/% npi + 1;
row <- index[x] %% npi;
if(row == 0){
row <- npi;
col <- col - 1;
}
xi <- pi[row];
yi <- pi[col];
#y > x should be satisfied for every value, since we have set lower.tri to NA
cnt<-cnt+1;
resMat[cnt,] <- c(xi,yi,res$r[row, col],i);
}
}else{
next;
}
}
return(invisible(resMat[1:cnt,,drop=FALSE]))
}
calcPC.hcs <- function(mSet, ajc=NULL,psg_list=NULL) {
npspectra <- length(mSet$AnnotateObject$pspectra);
pspectra <- mSet$AnnotateObject$pspectra
psSamples <- mSet$AnnotateObject$psSamples;
npeaks.global <- 0;
ncl <- sum(sapply(mSet$AnnotateObject$pspectra, length));
colnames(ajc)[3] <- c("weight") ##todo: Change to generell ajc interface
#Information for % output
if(is.null(psg_list)){
print(paste('Calculating graph cross linking in', npspectra, 'Groups...'));
lperc <- -1;
pspectra_list <- 1:npspectra;
ncl <- sum(sapply(mSet$AnnotateObject$pspectra, length));
}else{
print(paste('Calculating graph cross linking in',length(psg_list),'Groups...'));
lperc <- -1;
pspectra_list <- psg_list;
ncl <- sum(sapply(mSet$AnnotateObject$pspectra[psg_list], length));
}
#peak counter
npeaks <- 0;
lp <- -1;
pb <- progress_bar$new(format = "Calculating [:bar] :percent Time left: :eta", total = length(pspectra_list), clear = T, width= 75)
for(j in 1:length(pspectra_list)){
i <- pspectra_list[j];#index of pseudospectrum
pi <- mSet$AnnotateObject$pspectra[[i]]; #peak_id in pseudospectrum
names(pi) <- as.character(pi);
#percent output
pb$tick();
#end percent output
index <- which(ajc[,4] == i)
if(length(index) < 1){
g <- ftM2graphNEL(matrix(nrow=0,ncol=2),V=as.character(pi),edgemode="undirected")
}else{
g <- ftM2graphNEL(ajc[index,1:2,drop=FALSE],W=ajc[index,3,drop=FALSE], V=as.character(pi), edgemode="undirected");
}
hcs <- highlyConnSG(g);
#order cluster after size
cnts <- sapply(hcs$clusters,length);
grps <- 1:length(hcs$clusters);
grps <- grps[order(cnts, decreasing = TRUE)]
for (ii in 1:length(grps)){
if(ii==1){
#save old pspectra number
pspectra[[i]] <- as.integer(hcs$cluster[[grps[ii]]])
pi[hcs$cluster[[grps[ii]]]] <- NA
} else {
npspectra <- npspectra +1
pspectra[[npspectra]] <- as.integer(hcs$cluster[[grps[ii]]])
pi[hcs$cluster[[grps[ii]]]] <- NA
psSamples[npspectra] <- psSamples[i]
}
}
index <- which(!is.na(pi));
if(length(index)>0){
for(ii in 1:length(index)){
npspectra <- npspectra +1
pspectra[[npspectra]] <- pi[index[ii]]
psSamples[npspectra] <- psSamples[i]
}
}
}
mSet$AnnotateObject$pspectra <- pspectra;
mSet$AnnotateObject$psSamples <- psSamples;
print(paste("New number of ps-groups: ",length(pspectra)));
return(mSet)
}
findAdducts <- function(mSet, ppm=5, mzabs=0.015, multiplier=3, polarity=NULL, rules=NULL,
max_peaks=100, psg_list=NULL, intval="maxo",maxcharge){
multFragments=FALSE;
# Scaling ppm factor
devppm <- ppm / 1000000;
# counter for % bar
npeaks.global <- 0;
# get mz values from peaklist
imz <- mSet$AnnotateObject$groupInfo[, "mz"];
#number of pseudo-spectra
npspectra <- length(mSet$AnnotateObject$pspectra);
#If groupCorr or groupFHWM have not been invoke, select all peaks in one sample
if(npspectra < 1){
npspectra <- 1;
mSet$AnnotateObject$pspectra[[1]] <- seq(1:nrow(mSet$AnnotateObject$groupInfo));
}
if(mSet$AnnotateObject$sample == 1 && length(rownames(mSet$xcmsSet@phenoData)) == 1){
##one sample case
mint <- mSet$AnnotateObject$groupInfo[, intval];
}else{
##multiple sample
if(is.na(mSet$AnnotateObject$sample[1])){
index <- 1:length(mSet$xcmsSet@filepaths);
}else{
index <- mSet$AnnotateObject$sample;
}
print("Generating peak matrix for peak annotation!");
mint <- groupval(mSet$xcmsSet,value=intval)[, index, drop=FALSE];
peakmat <- mSet$xcmsSet@peaks;
groupmat <- mSet$xcmsSet@groups;
imz <- groupmat[, "mzmed"];
irt <- groupmat[, "rtmed"];
int.val <- c();
nsample <- length(mSet$AnnotateObject$sample);
}
# isotopes
isotopes <- mSet$AnnotateObject$isotopes;
# adductlist
derivativeIons <- vector("list", length(imz));
# other variables
oidscore <- c();
index <- c();
annoID <- matrix(ncol=4, nrow=0)
annoGrp <- matrix(ncol=4, nrow=0)
colnames(annoID) <- colnames(mSet$AnnotateObject$annoID)
colnames(annoGrp) <- colnames(mSet$AnnotateObject$annoGrp)
##Examine polarity and rule set
if(!(mSet$AnnotateObject$polarity == "")){
print(paste("Polarity is set in annotaParam:", mSet$AnnotateObject$polarity));
if(is.null(rules)){
if(!is.null(mSet$AnnotateObject$ruleset)){
rules <- mSet$AnnotateObject$ruleset;
}else{
print("Ruleset could not read from object! Recalculate");
rules <- calcRules(maxcharge=maxcharge, mol=3, nion=2, nnloss=1, nnadd=1, nh=2,
polarity=mSet$AnnotateObject$polarity,
lib.loc= .libPaths(), multFragments=multFragments);
mSet$AnnotateObject$ruleset <- rules;
}
}else{
mSet$AnnotateObject$ruleset <- rules;
print("Found and use user-defined ruleset!");
}
}else{
if(!is.null(polarity)){
if(polarity %in% c("positive","negative")){
if(is.null(rules)){
rules <- calcRules(maxcharge=maxcharge, mol=3, nion=2, nnloss=1, nnadd=1,
nh=2, polarity=polarity, lib.loc= .libPaths(),
multFragments=multFragments);
}else{ print("Found and use user-defined ruleset!");}
mSet$AnnotateObject$polarity <- polarity;
}else stop("polarity mode unknown, please choose between positive and negative.")
}else if(length(mSet$xcmsSet@polarity) > 0){
index <- which(sampclass(mSet$xcmsSet) == mSet$AnnotateObject$category)[1] + mSet$AnnotateObject$sample-1;
if(mSet$xcmsSet@polarity[index] %in% c("positive","negative")){
if(is.null(rules)){
rules <- calcRules(maxcharge=maxcharge, mol=3, nion=2, nnloss=1, nnadd=1,
nh=2, polarity=mSet$xcmsSet@polarity[index],
lib.loc= .libPaths(), multFragments=multFragments);
}else{ print("Found and use user-defined ruleset!");}
mSet$AnnotateObject$polarity <- polarity;
}else stop("polarity mode in xcmsSet unknown, please define variable polarity.")
}else stop("polarity mode could not be estimated from the xcmsSet, please define variable polarity!")
#save ruleset
mSet$AnnotateObject$ruleset <- rules;
}
##Run as single or parallel mode
runParallel <- 0;
if(mSet$AnnotateObject$runParallel$enable == 1){
if(!(is.null(mSet$AnnotateObject$runParallel$cluster)) || mpi.comm.size() > 0 ){
runParallel <- 1;
}else{
warning("CAMERA runs in parallel mode, but no slaves are spawned!\nRun in single core mode!\n");
runParallel <- 0;
}
}else{
runParallel <- 0;
}
if("quasi" %in% colnames(rules)){
#backup for old rule sets
quasimolion <- which(rules[, "quasi"]== 1)
}else{
quasimolion <- which(rules[, "mandatory"]== 1)
}
#Remove recognized isotopes from annotation m/z vector
for(x in seq(along = isotopes)){
if(!is.null(isotopes[[x]])){
if(isotopes[[x]]$iso != 0){
imz[x] <- NA;
}
}
}
#counter for % bar
npeaks <- 0;
massgrp <- 0;
ncl <- sum(sapply(mSet$AnnotateObject$pspectra, length));
if (runParallel == 1) { ## ... we run in parallel mode
if(is.null(psg_list)){
print(paste('Calculating possible adducts in',npspectra,'Groups... '));
lp <- -1;
pspectra_list <- 1:npspectra;
}else{
print(paste('Calculating possible adducts in',length(psg_list),'Groups... '));
lp <- -1;
pspectra_list <- psg_list;
}
argList <- list();
cnt_peak <- 0;
if(is.null(max_peaks)){
max_peaks=100;
}
paramsGlobal <- list();
if("typ" %in% colnames(rules)){
rules.idx <- which(rules[, "typ"]== "A")
parent <- TRUE;
}else{
#backup for old rule sets
rules.idx <- 1:nrow(rules);
parent <- FALSE;
}
#Add params to env
paramsGlobal <- list()
paramsGlobal$pspectra <- mSet$AnnotateObject$pspectra;
paramsGlobal$imz <- imz;
paramsGlobal$rules <- rules;
paramsGlobal$mzabs <- mzabs;
paramsGlobal$devppm <- devppm;
paramsGlobal$isotopes <- isotopes;
paramsGlobal$quasimolion <- quasimolion;
paramsGlobal$parent <- parent;
paramsGlobal$rules.idx <- rules.idx;
#create params
#paramsGlobal <- list2env(params)
params <- list();
for(j in 1:length(pspectra_list)){
i <- pspectra_list[j];
params$i[[length(params$i)+1]] <- i;
cnt_peak <- cnt_peak+length(mSet$AnnotateObject$pspectra[[i]]);
if(cnt_peak > max_peaks || j == length(pspectra_list)){
argList[[length(argList)+1]] <- params
cnt_peak <- 0;
params <- list();
}
}
#Some informationen for the user
print(paste("Parallel mode: There are",length(argList), "tasks."))
if(is.null(mSet$AnnotateObject$runParallel$cluster)){
#Use MPI
result <- xcmsPapply(argList, annotateGrpMPI2, paramsGlobal)
}else{
#For snow
result <- xcms:::xcmsClusterApply(cl=mSet$AnnotateObject$runParallel$cluster,
x=argList, fun=annotateGrpMPI,
msgfun=msgfun.snowParallel,
paramsGlobal)
}
for(ii in 1:length(result)){
if(length(result[[ii]]) == 0){
next;
}
for(iii in 1:length(result[[ii]])){
hypothese <- result[[ii]][[iii]];
if(is.null(hypothese)){
next;
}
charge <- 0;
old_massgrp <- 0;
index <- argList[[ii]]$i[[iii]];
ipeak <- mSet$AnnotateObject$pspectra[[index]];
for(hyp in 1:nrow(hypothese)){
peakid <- as.numeric(ipeak[hypothese[hyp, "massID"]]);
if(old_massgrp != hypothese[hyp,"massgrp"]) {
massgrp <- massgrp+1;
old_massgrp <- hypothese[hyp,"massgrp"];
annoGrp <- rbind(annoGrp,c(massgrp,hypothese[hyp,"mass"],
sum(hypothese[ which(hypothese[,"massgrp"]==old_massgrp),"score"]),i) )
}
if(parent){
annoID <- rbind(annoID, cbind(peakid, massgrp, hypothese[hyp, c("ruleID","parent")]))
}else{
annoID <- rbind(annoID, cbind(peakid, massgrp, hypothese[hyp, c("ruleID")],NA))
}
}
}
}
derivativeIons <- getderivativeIons(annoID,annoGrp,rules,length(imz));
mSet$AnnotateObject$derivativeIons <- derivativeIons;
mSet$AnnotateObject$annoID <- annoID;
mSet$AnnotateObject$annoGrp <- annoGrp;
return(object)
} else {
##Parallel Core Mode
if(is.null(psg_list)){
print(paste('Calculating possible adducts in',npspectra,'Groups... '));
lp <- -1;
pspectra_list <- 1:npspectra;
}else{
print(paste('Calculating possible adducts in',length(psg_list),'Groups... '));
lp <- -1;
pspectra_list <- psg_list;
sum_peaks <- sum(sapply(mSet$AnnotateObject$pspectra[psg_list],length));
}
if("typ" %in% colnames(rules)){
rules.idx <- which(rules[, "typ"]== "A")
parent <- TRUE;
}else{
#backup for old rule sets
rules.idx <- 1:nrow(rules);
parent <- FALSE;
}
along = pspectra_list;
pb <- progress_bar$new(format = "Calculating [:bar] :percent Time left: :eta", total = length(along), clear = T, width= 75)
for(j in seq(along)){
i <- pspectra_list[j];
#peak index for those in pseudospectrum i
ipeak <- mSet$AnnotateObject$pspectra[[i]];
#percent output
pb$tick();
#end percent output
#check if the pspec contains more than one peak
if(length(ipeak) > 1){
hypothese <- annotateGrp(ipeak, imz, rules, mzabs, devppm, isotopes, quasimolion, rules.idx=rules.idx);
#save results
if(is.null(hypothese)){
next;
}
charge <- 0;
old_massgrp <- 0;
#combine annotation hypotheses to annotation groups for one compound mass
for(hyp in 1:nrow(hypothese)){
peakid <- as.numeric(ipeak[hypothese[hyp, "massID"]]);
if(old_massgrp != hypothese[hyp, "massgrp"]) {
massgrp <- massgrp + 1;
old_massgrp <- hypothese[hyp, "massgrp"];
annoGrp <- rbind(annoGrp, c(massgrp, hypothese[hyp, "mass"],
sum(hypothese[ which(hypothese[, "massgrp"] == old_massgrp), "score"]), i) )
}
if(parent){
annoID <- rbind(annoID, c(peakid, massgrp, hypothese[hyp, "ruleID"], ipeak[hypothese[hyp, "parent"]]))
}else{
annoID <- rbind(annoID, c(peakid, massgrp, hypothese[hyp, "ruleID"], NA))
}
}
}
}
derivativeIons <- getderivativeIons(annoID, annoGrp, rules, length(imz));
mSet$AnnotateObject$derivativeIons <- derivativeIons;
mSet$AnnotateObject$annoID <- annoID;
mSet$AnnotateObject$annoGrp <- annoGrp;
return(mSet)
}
}
getPeaklist <- function(mSet, intval="into") {
if (! intval %in% colnames(mSet$xcmsSet@peaks)) {
stop("unknown intensity value!")
}
#generate peaktable
#Check if xcmsSet contains only one sample
if(mSet$AnnotateObject$sample == 1 && length(rownames(mSet$xcmsSet@phenoData)) == 1){
#intval is here ignored since all intensity values are already contained
peaktable <- mSet$AnnotateObject$groupInfo;
}else {
#Case of xcmsSet with multiple samples
#Use groupInfo information and replace intensity values
peaktable <- mSet$AnnotateObject$groupInfo;
#get intensity values from xcmsSet
grpval <- groupval(mSet$xcmsSet, value=intval);
#get column range for replacement
grpval.ncol <- ncol(grpval)
start <- ncol(peaktable) - grpval.ncol +1;
ende <- start + grpval.ncol - 1;
peaktable[, start:ende] <- grpval;
}
#allocate variables for CAMERA output
adduct <- vector("character", nrow(mSet$AnnotateObject$groupInfo));
isotopes <- vector("character", nrow(mSet$AnnotateObject$groupInfo));
pcgroup <- vector("character", nrow(mSet$AnnotateObject$groupInfo));
#default polarity set to positive
polarity <- "+";
if(length(mSet$AnnotateObject$polarity) > 0){
if(mSet$AnnotateObject$polarity == "negative"){
polarity <- "-";
}
}
#First isotope informationen and adduct informationen
for(i in seq(along = isotopes)){
#check if adduct annotation is present for peak i
if(length(mSet$AnnotateObject$derivativeIons) > 0 && !(is.null(mSet$AnnotateObject$derivativeIons[[i]]))) {
#Check if we have more than one annotation for peak i
if(length(mSet$AnnotateObject$derivativeIons[[i]]) > 1) {
#combine ion species name and rounded mass hypophysis
names <- paste(mSet$AnnotateObject$derivativeIons[[i]][[1]]$name, signif(mSet$AnnotateObject$derivativeIons[[i]][[1]]$mass, 6));
for(ii in 2:length(mSet$AnnotateObject$derivativeIons[[i]])) {
names <- paste(names, mSet$AnnotateObject$derivativeIons[[i]][[ii]]$name, signif(mSet$AnnotateObject$derivativeIons[[i]][[ii]]$mass, 6));
}
#save name in vector adduct
adduct[i] <- names;
} else {
#Only one annotation
adduct[i] <- paste(mSet$AnnotateObject$derivativeIons[[i]][[1]]$name, signif(mSet$AnnotateObject$derivativeIons[[i]][[1]]$mass, 6));
}
} else {
#no annotation empty name
adduct[i] <- "";
}
#Check if we have isotope informationen about peak i
if(length(mSet$AnnotateObject$isotopes) > 0&& !is.null(mSet$AnnotateObject$isotopes[[i]])) {
num.iso <- mSet$AnnotateObject$isotopes[[i]]$iso;
#Which isotope peak is peak i?
if(num.iso == 0){
str.iso <- "[M]";
} else {
str.iso <- paste("[M+", num.iso, "]", sep="")
}
#Multiple charged?
if(mSet$AnnotateObject$isotopes[[i]]$charge > 1){
isotopes[i] <- paste("[", mSet$AnnotateObject$isotopes[[i]]$y, "]", str.iso, mSet$AnnotateObject$isotopes[[i]]$charge, polarity, sep="");
}else{
isotopes[i] <- paste("[", mSet$AnnotateObject$isotopes[[i]]$y, "]", str.iso, polarity, sep="");
}
} else {
#No isotope informationen available
isotopes[i] <- "";
}
}
#Have we more than one pseudospectrum?
if(length(mSet$AnnotateObject$pspectra) < 1){
pcgroup <- 0;
} else {
for(i in seq(along = mSet$AnnotateObject$pspectra)){
index <- mSet$AnnotateObject$pspectra[[i]];
pcgroup[index] <- i;
}
}
rownames(peaktable)<-NULL;#Bugfix for: In data.row.names(row.names, rowsi, i) :Â some row.names duplicated:
return(invisible(data.frame(peaktable, isotopes, adduct, pcgroup, stringsAsFactors=FALSE, row.names=NULL)));
}
sampclass <- function(object) {
if (ncol(object@phenoData) >0) {
if(any(colnames(object@phenoData)=="class")){
sclass <- object$class
## in any rate: transform class to a character vector
## and generate a new factor on that with the levels
## being in the order of the first occurrence of the
## elements (i.e. no alphanumeric ordering).
sclass <- as.character(sclass)
sclass <- factor(sclass, levels=unique(sclass))
return(sclass)
}
interaction(object@phenoData, drop=TRUE)
} else {
factor()
}
}
calcRules <- function (maxcharge=3, mol=3, nion=2, nnloss=1,
nnadd=1, nh=2, polarity=NULL, lib.loc = .libPaths(), multFragments=FALSE){
r <- new("ruleSet")
r <- setDefaultLists(r, lib.loc=lib.loc)
r <- readLists(r)
r <- setParams(r, maxcharge=maxcharge, mol=mol, nion=nion,
nnloss=nnloss, nnadd=nnadd, nh=nh, polarity=polarity, lib.loc = lib.loc)
if(multFragments){
r <- generateRules2(r)
}else{
r <- generateRules(r)
}
return(r@rules)
}
generateRules <- function (object) {
maxcharge=object@maxcharge
mol=object@mol
nion=object@nion
nnloss=object@nnloss
nnadd=object@nnadd
nh=object@nh
polarity=object@polarity
ionlist=object@ionlist
neutralloss=object@neutralloss
neutraladdition=object@neutraladdition
rules=object@rules
name<-c();
nmol<-c();
charge<-c();
massdiff<-c();
oidscore<-c();
quasi<-c();
ips<-c();
##Erzeuge Regeln
tmpname <- c();
tmpnmol <- c();
tmpcharge <- 0;
tmpmass <- 0;
tmpips <- 0;
## Molekülionen
if(polarity=="positive"){
## Wasserstoff, hard codiert
for(k in 1:mol){
if(k == 1){
str <- "";
tmpips <- 1.0;
}else{
str <- k;
tmpips <- 0.5;
};
name <- append(name, paste("[", str, "M+H]+", sep=""));
charge <- append(charge, 1);
massdiff <- append(massdiff, 1.007276);
nmol <- append(nmol, k);
if(k == 1) {
quasi <- append(quasi, 1);
} else {
quasi <- append(quasi, 0);
};
oidscore <- append(oidscore, 1);
ips <- append(ips, tmpips)
name <- append(name,paste("[",str,"M+2H]2+",sep=""));
charge<-append(charge,2);
massdiff<-append(massdiff,2.014552);
nmol<-append(nmol,k);quasi<-append(quasi,0);
oidscore<-append(oidscore,2);
ips<-append(ips,tmpips)
name<-append(name,paste("[",str,"M+3H]3+",sep=""));
charge<-append(charge,3);
massdiff<-append(massdiff,3.021828);
nmol<-append(nmol,k);
quasi<-append(quasi,0);
oidscore<-append(oidscore,3);
ips<-append(ips,tmpips)
oid<-3;
for(i in 1:nrow(ionlist)){
if(ionlist[i,2]<=0) {next;}
if(ionlist[i,2]==1){
name<-append(name,paste("[",str,"M+H+",ionlist[i,1],"]2+",sep=""));
}else{
name<-append(name,paste("[",str,"M+H+",ionlist[i,1],"]",ionlist[i,2]+1,"+",sep=""));
}
charge <- append(charge,ionlist[i,2]+1);
massdiff <- append(massdiff,ionlist[i,3]+1.007276);
nmol <- append(nmol,k);
quasi <- append(quasi,0);
oidscore <- append(oidscore,oid+i);
if(tmpips>0.75){
ips<-append(ips,0.5)
}else{
ips<-append(ips,tmpips);
}#Austausch
}
oid<-oid+nrow(ionlist);
coeff<-expand.grid(rep(list(0:nion),nrow(ionlist)))
if(length(list<-which(ionlist[,2]<=0))>0){
coeff[,list]<-0;
}
coeff<-unique(coeff);
coeff<-cbind(coeff,rep(0,nrow(coeff)));
coeff<-coeff[-1,]
tmp<-NULL;
for(i in 1:nrow(ionlist)){
if(ionlist[i,2]<=0)next;
## Austausch erstmal nur einen pro Ion
tmp<-rbind(tmp,t(apply(coeff,1,function(x) {x[i]<-x[i]+1;x[nrow(ionlist)+1]<-1;x})));
}
coeff<-unique(rbind(coeff,tmp));
i <- 1
while (i <= nrow(coeff)){
tmpname<-paste("[",str,"M",sep="");
tmpcharge<-0;tmpmass<-0;
for(ii in 1:(ncol(coeff)-1)){
if(coeff[i,ii]>0){
if(coeff[i,ii]>1){
tmpname<-paste(tmpname,"+",coeff[i,ii],ionlist[ii,1],sep="");
}else{
tmpname<-paste(tmpname,"+",ionlist[ii,1],sep="");
}
tmpcharge<-tmpcharge+coeff[i,ii]*ionlist[ii,2];
tmpmass<-tmpmass+coeff[i,ii]*ionlist[ii,3]
}
}
if(coeff[i,ncol(coeff)]>0){
## Austausch hat stattgefunden, einfach bsp 1
tmpname<-paste(tmpname,"-H",sep="");
tmpcharge<-tmpcharge-1;
tmpmass<-tmpmass-1.007276;
tmpips<-0.25;
}
if(tmpcharge>1){
tmpname<-paste(tmpname,"]",tmpcharge,"+",sep="")
}else{
tmpname<-paste(tmpname,"]+",sep="")
}
name<-append(name,tmpname)
charge<-append(charge,tmpcharge)
massdiff<-append(massdiff,tmpmass)
nmol <- append(nmol,k);
oidscore<-append(oidscore,oid+i)
if(sum(coeff[i,])==1&& k==1){
quasi <- append(quasi,1);
ips <-append(ips,tmpips);
}else{
quasi <- append(quasi,0);
if(tmpips>0.75){
ips <- append(ips,0.75);
}else{
ips <- append(ips,tmpips);
}
}
i <- i+1
}
}
oid<-max(oidscore);
## Create Neutral Addition
index<-which(quasi==1)
if (nrow(neutraladdition)>0) {
for(i in 1:nrow(neutraladdition)) {
if(length(index2<-which(ionlist[,2]>0))>0){
for(ii in 1:length(index2)){
if(ionlist[index2[ii],2] > 1){
name <- append(name,paste("[M+",ionlist[index2[ii],1],"+",neutraladdition[i,1],"]",abs(ionlist[index2[ii],2]),"+",sep=""));
}else{
name <- append(name,paste("[M+",ionlist[index2[ii],1],"+",neutraladdition[i,1],"]+",sep=""));
}
charge <- append(charge,ionlist[index2[ii],2]);
massdiff <- append(massdiff,neutraladdition[i,2]+ionlist[index2[ii],3]);
nmol <- append(nmol,1);
quasi <- append(quasi,0);
oidscore <- append(oidscore,oid+1);oid<-oid+1;
ips<-append(ips,0.5);
}
}
if(neutraladdition[i,1]=="NaCOOH"){next;}
name<-append(name,paste("[M+H+",neutraladdition[i,1],"]+",sep=""));
charge<-append(charge,+1);
massdiff<- append(massdiff,neutraladdition[i,2]+1.007276);
nmol<-append(nmol,1);
quasi<-append(quasi,0)
oidscore<-append(oidscore,oid+1);oid<-oid+1;
ips<-append(ips,0.5);
}
oid<-max(oidscore);
}
##Erzeuge Neutral loss
index<-which(quasi==1)
for(i in 1:nrow(neutralloss)){
for(ii in 1:maxcharge){
if(ii > 1){
name<-append(name,paste("[M+",ii,"H-",neutralloss[i,1],"]",ii,"+",sep=""));
}else {name<-append(name,paste("[M+H-",neutralloss[i,1],"]+",sep=""));}
charge<-append(charge,ii);
massdiff<- append(massdiff,-neutralloss[i,2]+1.007276*ii);
nmol<-append(nmol,1);
quasi<-append(quasi,0)
oidscore<-append(oidscore,oid+1);oid<-oid+1;
ips<-append(ips,0.25);
}
}
ruleset <- data.frame(name,nmol,charge,massdiff,oidscore,quasi,ips)
if(length(index<-which(ruleset[,"charge"]>maxcharge))>0){
ruleset<- ruleset[-index,];
}
}else if(polarity=="negative"){
## Wasserstoff, hard codiert
for(k in 1:mol){
if(k==1){str<-"";tmpips<-1.0;}else{str<-k;tmpips<-0.5};
name<-append(name,paste("[",str,"M-H]-",sep=""));
charge<-append(charge,-1);massdiff<-append(massdiff,-1.007276);nmol<-append(nmol,k);if(k==1){quasi<-append(quasi,1);}else{quasi<-append(quasi,0);};oidscore<-append(oidscore,1);ips<-append(ips,tmpips)
name<-append(name,paste("[",str,"M-2H]2-",sep=""));charge<-append(charge,-2);massdiff<-append(massdiff,-2.014552);nmol<-append(nmol,k);quasi<-append(quasi,0);oidscore<-append(oidscore,2);ips<-append(ips,tmpips)
name<-append(name,paste("[",str,"M-3H]3-",sep=""));charge<-append(charge,-3);massdiff<-append(massdiff,-3.021828);nmol<-append(nmol,k);quasi<-append(quasi,0);oidscore<-append(oidscore,3);ips<-append(ips,tmpips)
oid<-3;
for(i in 1:nrow(ionlist)){
if(ionlist[i,2]>=0){
if(ionlist[i,2] == 1){
name<-append(name,paste("[",str,"M-2H+",ionlist[i,1],"]-",sep=""));
charge <- append(charge,ionlist[i,2]-2);
massdiff<- append(massdiff,ionlist[i,3]-(2*1.007276));
nmol <- append(nmol,k);
quasi <- append(quasi,0);
oidscore<-append(oidscore,oid+i);
ips<-append(ips,0.25);
next;
} else {
if(ionlist[i,2] > maxcharge) {next;}
localCharge <- ionlist[i,2]+1
name<-append(name,paste("[",str,"M-",localCharge,"H+",ionlist[i,1],"]-",sep=""));
charge <- append(charge,ionlist[i,2]-localCharge);
massdiff<- append(massdiff,ionlist[i,3]-(localCharge*1.007276));
nmol <- append(nmol,k);
quasi <- append(quasi,0);
oidscore<-append(oidscore,oid+i);
ips<-append(ips,0.25);
next;
}
}
if(ionlist[i,2]== -1){
name<-append(name,paste("[",str,"M-H+",ionlist[i,1],"]2-",sep=""));
}else{
name<-append(name,paste("[",str,"M-H+",ionlist[i,1],"]",ionlist[i,2]+1,"-",sep=""));
}
charge <- append(charge,ionlist[i,2]-1);
massdiff<- append(massdiff,ionlist[i,3]-1.007276);
nmol <- append(nmol,k);
quasi <- append(quasi,0);
oidscore<-append(oidscore,oid+i);
ips<-append(ips,tmpips);
#Austausch
}
oid<-oid+nrow(ionlist);
coeff<-expand.grid(rep(list(0:nion),nrow(ionlist)))
if(length(list<-which(ionlist[,2]>=0))>0){
coeff[,list]<-0;
}
coeff<-unique(coeff);
coeff<-cbind(coeff,rep(0,nrow(coeff)));
coeff<-coeff[-1,] ## remove first row
i <- 1
while (i <= nrow(coeff)){
tmpname<-paste("[",str,"M",sep="");
tmpcharge<-0;tmpmass<-0;
for(ii in 1:(ncol(coeff)-1)){
if(coeff[i,ii]>0){
if(coeff[i,ii]>1){
tmpname<-paste(tmpname,"+",coeff[i,ii],ionlist[ii,1],sep="");
}else{
tmpname<-paste(tmpname,"+",ionlist[ii,1],sep="");
}
tmpcharge<-tmpcharge+coeff[i,ii]*ionlist[ii,2];
tmpmass<-tmpmass+coeff[i,ii]*ionlist[ii,3]
}
}
if(coeff[i,ncol(coeff)]>0){
#Austausch hat stattgefunden, einfach bsp 1
tmpname<-paste(tmpname,"-H",sep="");
tmpcharge<-tmpcharge-1;
tmpmass<-tmpmass-1.007276;
tmpips<-0.5;
}
if(tmpcharge< -1){
tmpname<-paste(tmpname,"]",abs(tmpcharge),"-",sep="")
}else{
tmpname<-paste(tmpname,"]-",sep="")
}
name<-append(name,tmpname)
charge<-append(charge,tmpcharge)
massdiff<-append(massdiff,tmpmass)
nmol <-append(nmol,k);
oidscore<-append(oidscore,oid+i)
if(sum(coeff[i,])==1&& k==1){
quasi <-append(quasi,1);
}else{
quasi <-append(quasi,0);
}
ips <-append(ips,tmpips);
i <- i+1
}
}
oid<-max(oidscore);
##Erzeuge Neutral Addition
index<-which(quasi==1)
for(i in 1:nrow(neutraladdition)){
if(length(index2<-which(ionlist[,2]<0))>0){
for(ii in 1:length(index2)){
if(ionlist[index2[ii],2]< -1){
name <- append(name,paste("[M+",ionlist[index2[ii],1],"+",neutraladdition[i,1],"]",abs(ionlist[index2[ii],2]),"-",sep=""));
}else{
name <- append(name,paste("[M+",ionlist[index2[ii],1],"+",neutraladdition[i,1],"]-",sep=""));
}
charge <- append(charge,ionlist[index2[ii],2]);
massdiff<- append(massdiff,neutraladdition[i,2]+ionlist[index2[ii],3]);
nmol <- append(nmol,1);
quasi <- append(quasi,0);
oidscore<-append(oidscore,oid+1);oid<-oid+1;
ips<-append(ips,0.5);
}
}
name<-append(name,paste("[M-H+",neutraladdition[i,1],"]-",sep=""));
charge<-append(charge,-1);
massdiff<- append(massdiff,neutraladdition[i,2]-1.007276);
nmol<-append(nmol,1);
quasi<-append(quasi,0)
oidscore<-append(oidscore,oid+1);oid<-oid+1;
ips<-append(ips,0.5);
}
oid<-max(oidscore);
##Erzeuge Neutral loss
index<-which(quasi==1)
for(i in 1:nrow(neutralloss)){
name<-append(name,paste("[M-H-",neutralloss[i,1],"]-",sep=""));
charge<-append(charge,-1);
massdiff<- append(massdiff,-neutralloss[i,2]-1.007276);
nmol<-append(nmol,1);
quasi<-append(quasi,0)
oidscore<-append(oidscore,oid+1);oid<-oid+1;
ips<-append(ips,0.25);
}
ruleset <- data.frame(name,nmol,charge,massdiff,oidscore,quasi,ips)
if(length(index<-which(ruleset[,"charge"]< -maxcharge))>0){
ruleset<- ruleset[-index,];
}
}else stop("Unknown error")
## Update object rules and return ruleset
object@rules=ruleset
object;
}
generateRules2 <- function (object) {
maxcharge=object@maxcharge
mol=object@mol
nion=object@nion
nnloss=object@nnloss
nnadd=object@nnadd
nh=object@nh
polarity=object@polarity
ionlist=object@ionlist
neutralloss=object@neutralloss
neutraladdition=object@neutraladdition
rules=object@rules
##Create Rules
ruleset <- matrix(nrow=0, ncol=8);
colnames(ruleset) <- c("name", "nmol","charge", "massdiff", "typ","mandatory",
"score", "parent")
tmpname <- c();
tmpcharge <- 0;
tmpmass <- 0;
tmpionparent <- NA;
massH <- 1.007276
##Positive Rule set
if(polarity == "positive"){
charge <- "+"
chargeValue <- 1
}else if(polarity == "negative"){
charge <- "-"
chargeValue <- -1
}else{
stop("Unknown polarity mode in rule set generation! Debug!\n")
}
#Hydrogen part is hard coded
for(k in 1:mol) {
if(k == 1){
#For M+H
str <- "";
tmpips <- 1.5;
quasi <- 1
}else{
#For xM+H
str <- k;
tmpips <- 0;
quasi <- 0
}
for(xh in seq.int(nh)){
if(xh == 1){
ruleset <- rbind(ruleset, cbind(paste("[", str, "M", charge, "H]", charge, sep=""), k, chargeValue,
massH*chargeValue, "A",
quasi, tmpips+0.5, tmpionparent));
} else {
ruleset <- rbind(ruleset, cbind(paste("[", str, "M", charge, xh, "H]", xh, charge, sep=""),
k,xh*chargeValue, massH*xh*chargeValue, "A", 0, 0.5, tmpionparent+1));
}
}
for(i in 1:nrow(ionlist)){
#Add change of kat- respectively anion against one hydrogen
if(ionlist[i, 2] > 0 & charge == "-"){
if(ionlist[i, 2] == 1){
sumcharge <- "";
hdiff <- 1;
}else{
sumcharge <- ionlist[i, 2];
hdiff <- ionlist[i,2]-1;
}
ruleset <- rbind(ruleset, cbind(paste("[", str, "M", charge,"H+", ionlist[i,1], "-", sumcharge,"H]", "",
charge, sep=""), k, ionlist[i, 2]*chargeValue,
ionlist[i, 3]-massH*(1+hdiff),
"A", 0, 0.25, tmpionparent));
}
#xM+H + additional Kation like Na or K
if(ionlist[i,2] <= 0 & chargeValue > 0) {
#Ions with negative charge, when polarity is positive
next;
} else if(ionlist[i, 2] >= 0 & chargeValue < 0){
#Ions with positive charge, when polarity is negative
next;
}
#Add Rule to Ruleset
if(abs(ionlist[i, 2]) == 1){
ruleset <- rbind(ruleset, cbind(paste("[", str, "M", charge,"H+", ionlist[i,1], "]2",
charge, sep=""), k, ionlist[i, 2]+1*chargeValue,
ionlist[i, 3]+massH*chargeValue, "A", 0, 0.25, tmpionparent));
}else{
ruleset <- rbind(ruleset, cbind(paste("[", str, "M", charge,"H+", ionlist[i, 1], "]",
ionlist[i, 2]+(1*chargeValue),
charge, sep=""), k, ionlist[i, 2]+1*chargeValue,
ionlist[i, 3]+massH*chargeValue, "A" ,0, 0.25, tmpionparent));
}
}#End for loop nrow(ionlist)
##Coeff - coefficient Matrix, for generating rules with
#combination of kat- or anionsexchange ions like [M-2H+Na] (M-H and change of H against Na)
coeff <- expand.grid(rep(list(0:nion), nrow(ionlist)))
if(chargeValue > 0){
index <- which(ionlist[, 2] <= 0);
}else{
index <- which(ionlist[, 2] >= 0);
}
if(length(index) > 0){
coeff[, index] <- 0;
}
tmp <- NULL;
for(i in 1:nrow(ionlist)){
if((chargeValue > 0 & ionlist[i, 2] <= 0) | (chargeValue < 0 & ionlist[i,2] >=0)){
next;
}
#Austausch erstmal nur einen pro Ion
tmp <- rbind(tmp, t(apply(coeff, 1, function(x) {
x[i] <- x[i]+1;
x[nrow(ionlist)+1] <- -1;
x }
)));
}
coeff <- cbind(coeff, rep(0, nrow(coeff)));
colnames(coeff)[4] <- "Var4"
colnames(tmp)[4] <- "Var4"
coeff <- unique(rbind(coeff, tmp));
for(i in 1:nrow(coeff)){
if(sum(coeff[i, 1:nrow(ionlist)]) > 2 | sum(coeff[i, 1:nrow(ionlist)]) < 1){
next;
}
tmpname <- paste("[",str,"M",sep="");
tmpcharge <- 0;
tmpmass <- 0;
for(ii in 1:(ncol(coeff))){
if(coeff[i,ii] > 0){
if(coeff[i,ii] > 1){
tmpname <- paste(tmpname, "+", coeff[i, ii], ionlist[ii, 1], sep="");
}else{
tmpname <- paste(tmpname, "+", ionlist[ii, 1], sep="");
}
tmpcharge <- tmpcharge + coeff[i, ii] * ionlist[ii, 2];
tmpmass <- tmpmass + coeff[i, ii] * ionlist[ii, 3];
} else if (coeff[i,ii] < 0){
tmpname <- paste(tmpname, "-H", sep="");
tmpcharge <- tmpcharge - 1;
tmpmass <- tmpmass - massH;
}
}
if(abs(tmpcharge) > 1){
tmpname <- paste(tmpname, "]", tmpcharge, charge, sep="")
}else{
tmpname <- paste(tmpname, "]", charge, sep="")
}
if(tmpcharge > maxcharge | tmpcharge == 0){
next;
}
if(sum(coeff[i, ]) == 1 && k == 1 && coeff[i, 4] >=0){
ruleset <- rbind(ruleset, cbind(tmpname, k, tmpcharge, tmpmass, "A", 1, 0.75, tmpionparent));
}else{
ruleset <- rbind(ruleset, cbind(tmpname, k, tmpcharge, tmpmass, "A", 0, 0.25, tmpionparent));
}
}#end for loop nrow(coeff)
}#end for loop k
# Create neutral addition to M+H from list
for(i in 1:nrow(neutraladdition)){
#Add neutral ion to only M+H
ruleset <- rbind(ruleset, cbind(paste("[M", charge, "H+", neutraladdition[i, 1], "]", charge, sep="") , 1, chargeValue,
neutraladdition[i, 2]+(massH*chargeValue), "A", 0, 0.25, 1));
}
## Add neutral loss from list to ruleset
for(i in 1:nrow(neutralloss)){
ruleset <- rbind(ruleset, cbind(paste("-", neutralloss[i, 1], sep=""), 1, 0,
-neutralloss[i, 2], "F", 0, 0.25, 1));
#Eliminate rules with charge > maxcharge
if(length(index <- which(ruleset[, "charge"] > maxcharge)) > 0){
ruleset <- ruleset[-index, ];
}
}
ruleset <- as.data.frame(ruleset, stringsAsFactors=FALSE)
class(ruleset$nmol) <- "numeric"
class(ruleset$charge) <- "numeric"
class(ruleset$massdiff) <- "numeric"
class(ruleset$mandatory) <- "numeric"
class(ruleset$score) <- "numeric"
class(ruleset$parent) <- "numeric"
object@rules=ruleset
object;
return(object);
}
setDefaultLists <- function(object, lib.loc=.libPaths()) {
##Read Tabellen
object@ionlistfile <- system.file('lists/ions.csv', package = "MetaboAnalystR",
lib.loc=lib.loc)[1]
if (!file.exists(object@ionlistfile)) stop('ions.csv not found.')
object@neutrallossfile <- system.file('lists/neutralloss.csv',
package = "MetaboAnalystR", lib.loc=lib.loc)[1]
if (!file.exists(object@neutrallossfile)) stop('neutralloss.csv not found.')
object@neutraladditionfile <- system.file('lists/neutraladdition.csv',
package = "MetaboAnalystR", lib.loc=lib.loc)[1]
if (!file.exists(object@neutraladditionfile)) stop('neutraladdition.csv not found.')
object
}
readLists <- function(object) {
object@ionlist <- read.table(object@ionlistfile, header=TRUE, dec=".", sep=",",
as.is=TRUE, stringsAsFactors = FALSE);
object@neutralloss <- read.table(object@neutrallossfile, header=TRUE, dec=".", sep=",",
as.is=TRUE, stringsAsFactors = FALSE);
object@neutraladdition <- read.table(object@neutraladditionfile,
header=TRUE, dec=".", sep=",",
as.is=TRUE, stringsAsFactors = FALSE);
object
}
setParams <- function (object, maxcharge=3, mol=3, nion=2, nnloss=1, nnadd=1, nh=2,
polarity=NULL, lib.loc=NULL) {
object@maxcharge=maxcharge
object@mol=mol
object@nion=nion
object@nnloss=nnloss
object@nnadd=nnadd
object@nh=nh
object@polarity=polarity
object@lib.loc=lib.loc
object
}
annotateGrp <- function(ipeak, imz, rules, mzabs, devppm, isotopes, quasimolion, rules.idx) {
#m/z vector for group i with peakindex ipeak
mz <- imz[ipeak];
naIdx <- which(!is.na(mz))
#Spectrum have only annotated isotope peaks, without monoisotopic peak
#Give error or warning?
if(length(na.omit(mz[naIdx])) < 1){
return(NULL);
}
ML <- massDiffMatrix(mz[naIdx], rules[rules.idx,]);
hypothese <- createHypothese(ML, rules[rules.idx, ], devppm, mzabs, naIdx);
#create hypotheses
if(is.null(nrow(hypothese)) || nrow(hypothese) < 2 ){
return(NULL);
}
#remove hypotheses, which violates via isotope annotation discovered ion charge
if(length(isotopes) > 0){
hypothese <- checkIsotopes(hypothese, isotopes, ipeak);
}
if(nrow(hypothese) < 2){
return(NULL);
}
#Test if hypothese grps include mandatory ions
#Filter Rules #2
if(length(quasimolion) > 0){
hypothese <- checkQuasimolion(hypothese, quasimolion);
}
if(nrow(hypothese) < 2){
return(NULL);
};
#Entferne Hypothesen, welche gegen OID-Score&Kausalität verstossen!
hypothese <- checkOidCausality(hypothese, rules[rules.idx, ]);
if(nrow(hypothese) < 2){
return(NULL);
};
#Prüfe IPS-Score
hypothese <- checkIps(hypothese)
if(nrow(hypothese) < 2){
return(NULL)
}
#We have hypotheses and want to add neutral losses
if("typ" %in% colnames(rules)){
hypothese <- addFragments(hypothese, rules, mz)
hypothese <- resolveFragmentConnections(hypothese)
}
return(hypothese);
}
massDiffMatrix <- function(m, rules){
#m - m/z vector
#rules - annotation rules
nRules <- nrow(rules);
DM <- matrix(NA, length(m), nRules)
for (i in seq_along(m)){
for (j in seq_len(nRules)){
DM[i, j] <- (abs(rules[j, "charge"] * m[i]) - rules[j, "massdiff"]) / rules[j, "nmol"] # ((z*m) - add) /n
}
}
return(DM)
}
createHypothese <- function(ML, rules, devppm, mzabs, naIdx){
ML.nrow <- nrow(ML);
ML.vec <- as.vector(ML);
max.value <- max(round(ML, 0));
hashmap <- vector(mode="list", length=max.value);
for(i in 1:length(ML)){
val <- trunc(ML[i],0);
if(val>1){
hashmap[[val]] <- c(hashmap[[val]],i);
}
}
if("ips" %in% colnames(rules)){
score <- "ips"
}else{
score <- "score"
}
hypothese <- matrix(NA,ncol=8,nrow=0);
colnames(hypothese) <- c("massID", "ruleID", "nmol", "charge", "mass", "score", "massgrp", "check");
massgrp <- 1;
for(i in seq(along=hashmap)){
if(is.null(hashmap[[i]])){
next;
}
candidates <- ML.vec[hashmap[[i]]];
candidates.index <- hashmap[[i]];
if(i != 1 && !is.null(hashmap[[i-1]]) && min(candidates) < i+(2*devppm*i+mzabs)){
index <- which(ML.vec[hashmap[[i-1]]]> i-(2*devppm*i+mzabs))
if(length(index)>0) {
candidates <- c(candidates, ML.vec[hashmap[[i-1]]][index]);
candidates.index <- c(candidates.index,hashmap[[i-1]][index]);
}
}
if(length(candidates) < 2){
next;
}
tol <- max(2*devppm*mean(candidates, na.rm=TRUE))+ mzabs;
result <- cutree(hclust(dist(candidates)), h=tol);
index <- which(table(result) >= 2);
if(length(index) == 0){
next;
}
m <- lapply(index, function(x) which(result == x));
for(ii in 1:length(m)){
ini.adducts <- candidates.index[m[[ii]]];
for( iii in 1:length(ini.adducts)){
adduct <- ini.adducts[iii] %/% ML.nrow +1;
mass <- ini.adducts[iii] %% ML.nrow;
if(mass == 0){
mass <- ML.nrow;
adduct <- adduct -1;
}
hypothese <- rbind(hypothese, c(naIdx[mass], adduct, rules[adduct, "nmol"], rules[adduct, "charge"], mean(candidates[m[[ii]]]), rules[adduct,score],massgrp ,1));
}
if(length(unique(hypothese[which(hypothese[, "massgrp"] == massgrp), "massID"])) == 1){
##only one mass annotated
hypothese <- hypothese[-(which(hypothese[,"massgrp"]==massgrp)),,drop=FALSE]
}else{
massgrp <- massgrp +1;
}
}
}
return(hypothese);
}
checkIsotopes <- function(hypothese, isotopes, ipeak){
for(hyp in 1:nrow(hypothese)){
peakid <- ipeak[hypothese[hyp, 1]];
if(!is.null(isotopes[[peakid]])){
#Isotope da
explainable <- FALSE;
if(isotopes[[peakid]]$charge == abs(hypothese[hyp, "charge"])){
explainable <- TRUE;
}
if(!explainable){
#delete Rule
hypothese[hyp,"check"]=0;
}
}
}
hypothese <- hypothese[which(hypothese[, "check"]==TRUE), ,drop=FALSE];
#check if hypothese grps annotate at least two different peaks
hypothese <- checkHypothese(hypothese)
return(hypothese)
}
checkHypothese <- function(hypothese){
if(is.null(nrow(hypothese))){
hypothese <- matrix(hypothese, byrow=F, ncol=8)
}
colnames(hypothese) <- c("massID", "ruleID", "nmol", "charge", "mass", "score", "massgrp", "check")
for(i in unique(hypothese[,"massgrp"])){
if(length(unique(hypothese[which(hypothese[, "massgrp"] == i), "massID"])) == 1){
##only one mass annotated
hypothese <- hypothese[-(which(hypothese[,"massgrp"]==i)), , drop=FALSE]
}
}
return(hypothese)
}
checkIps <- function(hypothese){
for(hyp in 1:nrow(hypothese)){
if(length(which(hypothese[, "massgrp"] == hypothese[hyp, "massgrp"])) < 2){
hypothese[hyp, "check"] = 0;
}
}
hypothese <- hypothese[which(hypothese[, "check"]==TRUE), ];
if(is.null(nrow(hypothese))) {
hypothese <- matrix(hypothese, byrow=F, ncol=9)
}
if(nrow(hypothese) < 1){
colnames(hypothese)<-c("massID", "ruleID", "nmol", "charge", "mass", "oidscore", "ips","massgrp", "check")
return(hypothese)
}
for(hyp in 1:nrow(hypothese)){
if(length(id <- which(hypothese[, "massID"] == hypothese[hyp, "massID"] & hypothese[, "check"] != 0)) > 1){
masses <- hypothese[id, "mass"]
nmasses <- sapply(masses, function(x) {
sum(hypothese[which(hypothese[, "mass"] == x), "score"])
})
masses <- masses[-which(nmasses == max(nmasses))];
if(length(masses) > 0){
hypothese[unlist(sapply(masses, function(x) {which(hypothese[, "mass"]==x)})), "check"]=0;
}
}
}
hypothese <- hypothese[which(hypothese[, "check"]==TRUE), ,drop=FALSE];
#check if hypothese grps annotate at least two different peaks
hypothese <- checkHypothese(hypothese)
return(hypothese)
}
checkOidCausality <- function(hypothese,rules){
#check every hypothese grp
for(hyp in unique(hypothese[,"massgrp"])){
hyp.nmol <- which(hypothese[, "massgrp"] == hyp & hypothese[, "nmol"] > 1)
for(hyp.nmol.idx in hyp.nmol){
if(length(indi <- which(hypothese[, "mass"] == hypothese[hyp.nmol.idx, "mass"] &
abs(hypothese[, "charge"]) == hypothese[, "nmol"])) > 1){
if(hyp.nmol.idx %in% indi){
#check if [M+H] [2M+2H]... annotate the same molecule
massdiff <- rules[hypothese[indi, "ruleID"], "massdiff"] /
rules[hypothese[indi, "ruleID"], "charge"]
if(length(indi_new <- which(duplicated(massdiff))) > 0){
hypothese[hyp.nmol.idx, "check"] <- 0;
}
}
}
}
}
hypothese <- hypothese[which(hypothese[, "check"] == TRUE), ,drop=FALSE];
#check if hypothese grps annotate at least two different peaks
hypothese <- checkHypothese(hypothese)
return(hypothese)
}
checkQuasimolion <- function(hypothese, quasimolion){
hypomass <- unique(hypothese[, "mass"])
for(mass in 1:length(hypomass)){
if(!any(quasimolion %in% hypothese[which(hypothese[, "mass"] == hypomass[mass]), "ruleID"])){
hypothese[which(hypothese[, "mass"] == hypomass[mass]), "check"] = 0;
}else if(is.null(nrow(hypothese[which(hypothese[, "mass"] == hypomass[mass]), ]))){
hypothese[which(hypothese[, "mass"] == hypomass[mass]), "check"] = 0;
}
}
hypothese <- hypothese[which(hypothese[, "check"]==TRUE), , drop=FALSE];
#check if hypothese grps annotate at least two different peaks
hypothese <- checkHypothese(hypothese)
return(hypothese)
}
getderivativeIons <- function(annoID, annoGrp, rules, npeaks){
#generate Vector length npeaks
derivativeIons <- vector("list", npeaks);
#intrinsic charge
#TODO: Not working at the moment
charge <- 0;
#check if we have annotations
if(nrow(annoID) < 1){
return(derivativeIons);
}
for(i in 1:nrow(annoID)){
peakid <- annoID[i, 1];
grpid <- annoID[i, 2];
ruleid <- annoID[i, 3];
if(is.null(derivativeIons[[peakid]])){
#Peak has no annotations so far
if(charge == 0 | rules[ruleid, "charge"] == charge){
derivativeIons[[peakid]][[1]] <- list( rule_id = ruleid,
charge = rules[ruleid, "charge"],
nmol = rules[ruleid, "nmol"],
name = paste(rules[ruleid, "name"]),
mass = annoGrp[grpid, 2])
}
} else {
#Peak has already an annotation
if(charge == 0 | rules[ruleid, "charge"] == charge){
derivativeIons[[peakid]][[(length(
derivativeIons[[peakid]])+1)]] <- list( rule_id = ruleid,
charge = rules[ruleid, "charge"],
nmol = rules[ruleid, "nmol"],
name=paste(rules[ruleid, "name"]),
mass=annoGrp[grpid, 2])
}
}
charge <- 0;
}
return(derivativeIons);
}
getIsotopeCluster <- function(object, number=NULL, value="maxo",
sampleIndex=NULL){
#check values
if(is.null(object)) {
stop("No xsa argument was given.\n");
}else if(!class(object)=="xsAnnotate"){
stop("Object parameter is no xsAnnotate object.\n");
}
value <- match.arg(value, c("maxo", "into", "intb"), several.ok=FALSE)
if(!is.null(number) & !is.numeric(number)){
stop("Number must be NULL or numeric");
}
if(!is.null(sampleIndex) & !all(is.numeric(sampleIndex))){
stop("Parameter sampleIndex must be NULL or numeric");
}
if(is.null(sampleIndex)){
nSamples <- 1;
} else if( all(sampleIndex <= length(object@xcmsSet@filepaths) & sampleIndex > 0)){
nSamples <- length(sampleIndex);
} else {
stop("All values in parameter sampleIndex must be lower equal
the number of samples and greater than 0.\n")
}
if(length(sampnames(object@xcmsSet)) > 1){ ## more than one sample
gvals <- groupval(object@xcmsSet, value=value);
groupmat <- object@groupInfo;
iso.matrix <- matrix(0, ncol=nSamples, nrow=length(object@isotopes));
if(is.null(sampleIndex)){
for(i in 1:length(object@pspectra)){
iso.matrix[object@pspectra[[i]],1] <- gvals[object@pspectra[[i]],object@psSamples[i]];
}
} else {
for(i in 1:length(object@pspectra)){
iso.matrix[object@pspectra[[i]], ] <- gvals[object@pspectra[[i]], sampleIndex]
}
}
peakmat <- cbind(groupmat[, "mz"], iso.matrix );
rownames(peakmat) <- NULL;
if(is.null(sampleIndex)){
colnames(peakmat) <- c("mz",value);
}else{
colnames(peakmat) <- c("mz", sampnames(object@xcmsSet)[sampleIndex]);
}
if(any(is.na(peakmat))){
print("Warning: peak table contains NA values. To remove apply fillpeaks on xcmsSet.");
}
} else if(length(sampnames(object@xcmsSet)) == 1){ ## only one sample was
peakmat <- object@groupInfo[, c("mz", value)];
} else {
stop("sampnames could not extracted from the xcmsSet.\n");
}
#collect isotopes
index <- which(!sapply(object@isotopes, is.null));
tmp.Matrix <- cbind(index, matrix(unlist(object@isotopes[index]), ncol=4, byrow=TRUE))
colnames(tmp.Matrix) <- c("Index","IsoCluster","Type","Charge","Val")
max.cluster <- max(tmp.Matrix[,"IsoCluster"])
max.type <- max(tmp.Matrix[,"Type"])
isotope.Matrix <- matrix(NA, nrow=max.cluster, ncol=(max.type+2));
invisible(apply(tmp.Matrix,1, function(x) {
isotope.Matrix[x["IsoCluster"],x["Type"]+2] <<- x["Index"];
isotope.Matrix[x["IsoCluster"],1] <<- x["Charge"];
}))
invisible(apply(isotope.Matrix,1, function(x) {
list(peaks=peakmat[na.omit(x[-1]),],charge=x[1])
}))
}
naOmit <- function(x) {
return (x[!is.na(x)]);
}
####### ------------ ======== Bottom of this kit ========= ------------ ######\
# --------------------------------------5.MSread--------------------------
read.MSdata <- function(files, pdata = NULL, msLevel. = NULL, centroided. = NA,
smoothed. = NA, cache. = 1L,
mode = c("inMemory", "onDisk")) {
mode <- match.arg(mode)
## o normalize the file path, i.e. replace relative path with absolute
## path. That fixes possible problems on Windows with SNOW parallel
## processing and also proteowizard problems on unis system with ~ paths.
files <- normalizePath(files)
suppressWarnings(.hasChroms <- MSnbase::hasChromatograms(files))
if (!length(files)) {
process <- new("MSnProcess",
processing = paste("No data loaded:", date()))
if (mode == "inMemory")
res <- new("MSnExp",
processingData = process)
else res <- new("OnDiskMSnExp",
processingData = process)
} else {
if (mode == "inMemory") {
if (is.null(msLevel.)) msLevel. <- 2L
res <- read.InMemMSd.data(files, pdata = pdata, msLevel. = msLevel.,
centroided. = centroided., smoothed. = smoothed., cache. = cache.)
} else { ## onDisk
res <- read.OnDiskMS.data(files = files, pdata = pdata,
msLevel. = msLevel., centroided. = centroided., smoothed. = smoothed.)
}
}
res
}
read.InMemMSd.data <- function(files, pdata, msLevel., centroided., smoothed., cache. = 1) {
MSnbase:::.testReadMSDataInput(environment())
if (msLevel. == 1) cache. <- 0
msLevel. <- as.integer(msLevel.)
## Creating environment with Spectra objects
assaydata <- new.env(parent = emptyenv())
ioncount <- c()
ioncounter <- 1
filenams <- filenums <- c()
fullhd2 <- fullhdorder <- c()
fullhdordercounter <- 1
.instrumentInfo <- list()
## List eventual limitations
if (MSnbase:::isCdfFile(files)) {
message("Polarity can not be extracted from netCDF files, please set ",
"manually the polarity with the 'polarity' method.")
}
## ## Idea:
## ## o initialize a featureData-data.frame,
## ## o for each file, extract header info and put that into
## featureData;
for (f in files) {
print(paste("Reading MS from",basename(f),"begin !"))
filen <- match(f, files)
filenums <- c(filenums, filen)
filenams <- c(filenams, f)
## issue #214: define backend based on file format.
msdata <- mzR::openMSfile(f,backend = NULL)
.instrumentInfo <- c(.instrumentInfo, list(mzR::instrumentInfo(msdata)))
fullhd <- mzR::header(msdata)
## Issue #325: get centroided information from file, but overwrite if
## specified with centroided. parameter.
if (!is.na(centroided.))
fullhd$centroided <- as.logical(centroided.)
spidx <- which(fullhd$msLevel == msLevel.)
## increase vectors as needed
ioncount <- c(ioncount, numeric(length(spidx)))
fullhdorder <- c(fullhdorder, numeric(length(spidx)))
if (msLevel. == 1) {
if (length(spidx) == 0)
stop("No MS1 spectra in file",f)
pb <- progress_bar$new(format = "Reading [:bar] :percent Time left: :eta",
total = length(spidx), clear = T, width= 75)
for (i in 1:length(spidx)) {
pb$tick();
j <- spidx[i]
hd <- fullhd[j, ]
## Fix missing polarity from netCDF
pol <- hd$polarity
if (length(pol) == 0)
pol <- NA
.p <- mzR::peaks(msdata, j)
sp <- new("Spectrum1",
rt = hd$retentionTime,
acquisitionNum = as.integer(hd$acquisitionNum),
scanIndex = as.integer(hd$seqNum),
tic = hd$totIonCurrent,
mz = .p[, 1],
intensity = .p[, 2],
fromFile = as.integer(filen),
centroided = as.logical(hd$centroided),
smoothed = as.logical(smoothed.),
polarity = as.integer(pol))
## peaksCount
ioncount[ioncounter] <- sum(.p[, 2])
ioncounter <- ioncounter + 1
.fname <-MSnbase:::formatFileSpectrumNames(fileIds=filen,
spectrumIds=i,
nSpectra=length(spidx),
nFiles=length(files))
assign(.fname, sp, assaydata)
fullhdorder[fullhdordercounter] <- .fname
fullhdordercounter <- fullhdordercounter + 1
}
} else { ## .msLevel != 1
if (length(spidx) == 0)
stop("No MS(n>1) spectra in file", f)
print(paste("Reading ", length(spidx), " MS", msLevel.,
" spectra from file ", basename(f)))
scanNums <- fullhd[fullhd$msLevel == msLevel., "precursorScanNum"]
if (length(scanNums) != length(spidx))
stop("Number of spectra and precursor scan number do not match!")
pb <- progress_bar$new(format = "Reading [:bar] :percent Time left: :eta",
total = length(spidx), clear = T, width= 75)
for (i in 1:length(spidx)) {
pb$tick();
j <- spidx[i]
hd <- fullhd[j, ]
.p <- mzR::peaks(msdata, j)
sp <- new("Spectrum2",
msLevel = as.integer(hd$msLevel),
merged = as.numeric(hd$mergedScan),
precScanNum = as.integer(scanNums[i]),
precursorMz = hd$precursorMZ,
precursorIntensity = hd$precursorIntensity,
precursorCharge = as.integer(hd$precursorCharge),
collisionEnergy = hd$collisionEnergy,
rt = hd$retentionTime,
acquisitionNum = as.integer(hd$acquisitionNum),
scanIndex = as.integer(hd$seqNum),
tic = hd$totIonCurrent,
mz = .p[, 1],
intensity = .p[, 2],
fromFile = as.integer(filen),
centroided = as.logical(hd$centroided),
smoothed = as.logical(smoothed.),
polarity = as.integer(hd$polarity))
## peaksCount
ioncount[ioncounter] <- sum(.p[, 2])
ioncounter <- ioncounter + 1
.fname <- MSnbase:::formatFileSpectrumNames(fileIds=filen,
spectrumIds=i,
nSpectra=length(spidx),
nFiles=length(files))
assign(.fname, sp, assaydata)
fullhdorder[fullhdordercounter] <- .fname
fullhdordercounter <- fullhdordercounter + 1
}
}
if (cache. >= 1)
fullhd2 <- rbind(fullhd2, fullhd[spidx, ])
gc()
mzR::close(msdata)
rm(msdata);
print(paste("This reading finished !"))
}
## cache level 2 yet implemented
cache. <- MSnbase:::testCacheArg(cache., maxCache = 2)
if (cache. >= 1) {
fl <- sapply(assaydata, function(x) x@fromFile)
featnms <- ls(assaydata) ## feature names in final MSnExp
fl <- fl[featnms] ## reorder file numbers
stopifnot(all(base::sort(featnms) == base::sort(fullhdorder)))
fullhdorder <- match(featnms, fullhdorder)
tmphd <- fullhd2[fullhdorder, ] ## reorder
ioncount <- ioncount[fullhdorder]
newhd <- data.frame(fileIdx = fl,
retention.time = tmphd$retentionTime,
precursor.mz = tmphd$precursorMZ,
precursor.intensity = tmphd$precursorIntensity,
charge = tmphd$precursorCharge,
peaks.count = tmphd$peaksCount,
tic = tmphd$totIonCurrent,
ionCount = ioncount,
ms.level = tmphd$msLevel,
acquisition.number = tmphd$acquisitionNum,
collision.energy = tmphd$collisionEnergy)
} else {
newhd <- NULL ## not used anyway
}
.cacheEnv <- MSnbase:::setCacheEnv(list("assaydata" = assaydata,
"hd" = newhd),
cache., lock = TRUE)
## CACHING AS BEEN SUPERSEDED BY THE OnDiskMSnExp IMPLEMENTATION
## if cache==2, do not lock assign msdata in .cacheEnv then lock
## it and do not close(msdata) above; rm(msdata) is OK
## Create 'MSnProcess' object
process <- new("MSnProcess",
processing = paste("Data loaded:", date()),
files = files,
smoothed = smoothed.)
## Create 'fdata' and 'pdata' objects
nms <- ls(assaydata)
if (is.null(pdata)) {
.pd <- data.frame(sampleNames = basename(files))
rownames(.pd) <- .pd$sampleNames
pdata <- new("AnnotatedDataFrame",
data = .pd)
}
fdata <- new("AnnotatedDataFrame",
data = data.frame(
spectrum = 1:length(nms),
row.names = nms))
fdata <- fdata[ls(assaydata)] ## reorder features
## expriment data slot
if (length(.instrumentInfo) > 1) {
cmp <- length(unique(sapply(.instrumentInfo, "[[", 1)))
if (cmp > 1)
message("According to the instrument information in the files,\n",
"the data has been acquired on different instruments!")
for (nm in names(.instrumentInfo[[1]]))
.instrumentInfo[[1]][[nm]] <- sapply(.instrumentInfo, "[[", nm)
}
expdata <- new("MIAPE",
instrumentManufacturer = .instrumentInfo[[1]]$manufacturer,
instrumentModel = .instrumentInfo[[1]]$model,
ionSource = .instrumentInfo[[1]]$ionisation,
analyser = as.character(.instrumentInfo[[1]]$analyzer),
detectorType = .instrumentInfo[[1]]$detector)
## Create and return 'MSnExp' object
toReturn <- new("MSnExp",
assayData = assaydata,
phenoData = pdata,
featureData = fdata,
processingData = process,
experimentData = expdata,
.cache = .cacheEnv)
return(toReturn)
}
read.OnDiskMS.data <- function(files, pdata, msLevel., centroided., smoothed.) {
MSnbase:::.testReadMSDataInput(environment())
stopifnot(is.logical(centroided.))
## Creating environment with Spectra objects
assaydata <- new.env(parent = emptyenv())
filenams <- filenums <- c()
fullhd2 <- fullhdorder <- c()
fullhdordercounter <- 1
.instrumentInfo <- list()
## List eventual limitations
if (MSnbase:::isCdfFile(files)) {
message("Polarity can not be extracted from netCDF files, please set ",
"manually the polarity with the 'polarity' method.")
}
## Idea:
## o initialize a featureData-data.frame,
featureDataList <- list()
## o for each file, extract header info and put that into featureData
pb <- progress_bar$new(format = "Reading [:bar] :percent Time left: :eta",
total = length(spidx), clear = T, width= 75)
for (f in files) {
pb$tick();
filen <- match(f, files)
filenums <- c(filenums, filen)
filenams <- c(filenams, f)
## issue #214: define backend based on file format.
msdata <- mzR::openMSfile(f,backend = NULL)
.instrumentInfo <- c(.instrumentInfo, list(mzR::instrumentInfo(msdata)))
fullhd <- mzR::header(msdata)
spidx <- seq_len(nrow(fullhd))
## Don't read the individual spectra, just define the names of
## the spectra.
fullhdorder <- c(fullhdorder,
MSnbase:::formatFileSpectrumNames(fileIds=filen,
spectrumIds=seq_along(spidx),
nSpectra=length(spidx),
nFiles=length(files)))
## Extract all Spectrum info from the header and put it into the featureData
fdData <- fullhd[spidx, , drop = FALSE]
## rename totIonCurrent and peaksCount, as detailed in
## https://github.com/lgatto/MSnbase/issues/105#issuecomment-229503816
names(fdData) <- sub("peaksCount", "originalPeaksCount", names(fdData))
## Add also:
## o fileIdx -> links to fileNames property
## o spIdx -> the index of the spectrum in the file.
fdData <- cbind(fileIdx = rep(filen, nrow(fdData)),
spIdx = spidx,
smoothed = rep(as.logical(smoothed.), nrow(fdData)),
fdData, stringsAsFactors = FALSE)
if (MSnbase:::isCdfFile(f)) {
## Add the polarity columns if missing in netCDF
if (!any(colnames(fdData) == "polarity"))
fdData <- cbind(fdData, polarity = rep(as.integer(NA),
nrow(fdData)))
}
## Order the fdData by acquisitionNum to force use of acquisitionNum
## as unique ID for the spectrum (issue #103). That way we can use
## the spIdx (is the index of the spectrum within the file) for
## subsetting and extracting.
if (!all(sort(fdData$acquisitionNum) == fdData$acquisitionNum))
warning(paste("Unexpected acquisition number order detected.",
"Please contact the maintainers or open an issue",
"on https://github.com/lgatto/MSnbase.",
sep = "\n")) ## see issue #160
fdData <- fdData[order(fdData$acquisitionNum), ]
featureDataList <- c(featureDataList, list(fdData))
## Fix for #151; would be nice if we could remove that at some point.
gc()
mzR::close(msdata)
rm(msdata)
}
## new in version 1.9.8
lockEnvironment(assaydata, bindings = TRUE)
.cacheEnv <- setCacheEnv(list("assaydata" = assaydata,
"hd" = NULL),
level = 0,
lock = TRUE)
## Create 'MSnProcess' object
process <- new("MSnProcess",
processing = paste0("Data loaded [", date(), "]"),
files = files,
smoothed = NA)
## Create 'fdata' and 'pdata' objects
if (is.null(pdata)) {
.pd <- data.frame(sampleNames = basename(files))
rownames(.pd) <- .pd$sampleNames
pdata <- new("AnnotatedDataFrame",
data = .pd)
}
## If we've got a featureDataList, use it
if (length(featureDataList) > 0) {
fdata <- do.call(rbind, featureDataList)
fdata <- cbind(fdata, spectrum = 1:nrow(fdata),
stringsAsFactors = FALSE)
## Setting rownames on the data.frame not on the AnnotatedDataFrame;
## did get strange errors otherwise.
rownames(fdata) <- fullhdorder
## Re-order them
fdata <- fdata[base::sort(fullhdorder), ]
fdata <- new("AnnotatedDataFrame", data = fdata)
## Re-order the features.
## fdata <- fdata[ls(assaydata), ]
} else fdata <- new("AnnotatedDataFrame")
## expriment data slot
if (length(.instrumentInfo) > 1) {
cmp <- length(unique(sapply(.instrumentInfo, "[[", 1)))
if (cmp > 1)
message("According to the instrument information in the files,\n",
"the data has been acquired on different instruments!")
for (nm in names(.instrumentInfo[[1]]))
.instrumentInfo[[1]][[nm]] <- sapply(.instrumentInfo, "[[", nm)
}
expdata <- new("MIAPE",
instrumentManufacturer = .instrumentInfo[[1]]$manufacturer,
instrumentModel = .instrumentInfo[[1]]$model,
ionSource = .instrumentInfo[[1]]$ionisation,
analyser = as.character(.instrumentInfo[[1]]$analyzer),
detectorType = .instrumentInfo[[1]]$detector)
## Create ProcessingStep if needed.
## Create the OnDiskMSnExp object.
res <- new("OnDiskMSnExp",
assayData = assaydata,
phenoData = pdata,
featureData = fdata,
processingData = process,
experimentData = expdata,
.cache = .cacheEnv)
if (!is.null(msLevel.)) {
msLevel. <- as.integer(msLevel.)
res <- filterMsLevel(res, msLevel.)
}
if (any(!is.na(centroided.))) {
if (length(centroided.) == 1) {
centroided(res) <- centroided.
} else {
for (i in seq_along(centroided.))
centroided(res, msLevel. = i) <- centroided.[i]
}
}
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.