R/bakedpi.R
In yamss: Tools for high-throughput metabolomics

Documented in bakedpi

## Steps
## 1. Read raw data
## 2. Background correction
## 3. Retention time alignment
## 4. Density estimation
## 5. Get cutoffs for density estimation
## 6. Get blobs
## 7. Get XICs
## 8. Quantify
## 9. Differential analysis

backgroundCorrection <- function(object, verbose = FALSE) {
    stopifnot(is(object, "CMSraw"))
    rawDT <- .rawDT(object)
    setkey(rawDT, mz, scan, sample)
    mzbreaks <- unique(c(seq(.minMZ(object), .maxMZ(object), by = 10), .maxMZ(object)))
    scanbreaks <- seq(1, .maxScan(object), 40)
    scanbreaks[length(scanbreaks)] <- .maxScan(object)
    if(verbose) {
        message("[backgroundCorrection] Get marginal intensities")
    }
    getScanWindowIntensDistsAtThisMZ <- function(m) {
        mzseq <- seq(as.integer(mzbreaks[m]*1e5), as.integer(mzbreaks[m+1]*1e5))
        DTmz <- rawDT[.(mzseq), nomatch = 0]
        getSampleSpecificIntensDistsAtThisScan <- function(rt) {
            DTmzscan <- DTmz[scan %in% scanbreaks[rt]:scanbreaks[rt+1]]
            lapply(.sampleNumber(object), function(s) {
                logintens <- log2(DTmzscan[sample==s, intensity] + 1)
                if (length(logintens) < 10)
                    return(NA)
                density(logintens)
            })
        }
        lapply(seq_len(length(scanbreaks)-1), getSampleSpecificIntensDistsAtThisScan)
    }
    ptime1 <- proc.time()
    densGrid <- lapply(seq_len(length(mzbreaks)-1), getScanWindowIntensDistsAtThisMZ)
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if(verbose) {
        message(sprintf("[backgroundCorrection] Get marginal intensities .. done in %.1f secs.", stime))
    }
    ## Estimate retention time window-specific background levels
    bgsd <- 0 # SD of normal distribution characterizing noise intensities
    r <- dnorm(bgsd)/dnorm(0)
    getBackgroundMeanAtThisMZScanSample <- function(m, rt, s) {
        dens <- densGrid[[m]][[rt]][[s]]
        if (class(dens)=="density") {
            indexFirstPeak <- which.max(diff(dens$y) < 0)
            densCutoff <- dens$y[indexFirstPeak]*r
            bgindex <- which.max(dens$y <= densCutoff & c(rep(FALSE, indexFirstPeak), rep(TRUE, length(dens$y)-indexFirstPeak)))
            return(dens$x[bgindex])
        }
        return(NA)
    }
    bgmeans <- lapply(seq_len(length(mzbreaks)-1), function(m) {
        lapply(seq_len(length(scanbreaks)-1), function(rt) {
            lapply(.sampleNumber(object), function(s) {
                getBackgroundMeanAtThisMZScanSample(m, rt, s)
            })
        })
    })
    if(verbose) {
        message("[backgroundCorrection] Get region-specific background trends")
    }
    ptime1 <- proc.time()
    getBackgroundMeanMatrixForSample <- function(s) {
        ## Loop over M/Z windowd. Then loop over scan windows.
        do.call(cbind, lapply(bgmeans, function(mzList) {
            sapply(mzList, function(scanList) { scanList[[s]] })
        }))
    }
    smoothBackgroundMeans <- function(bgmeanmatThisSample) {
        do.call(cbind, lapply(seq_len(ncol(bgmeanmatThisSample)), function(i) {
            ## Weight nearby M/Z windows more in the smoothing
            dists <- seq_len(ncol(bgmeanmatThisSample)) - i
            wts <- dnorm(dists/4)
            weightmat <- matrix(wts, nrow = nrow(bgmeanmatThisSample), ncol = ncol(bgmeanmatThisSample), byrow = TRUE)
            df <- data.frame(intens = as.numeric(bgmeanmatThisSample), 
                             scan = rep(head(scanbreaks, -1), times = ncol(bgmeanmatThisSample)), 
                             weight = as.numeric(weightmat))
            df <- df[complete.cases(df),]
            lofit <- loess(intens ~ scan, data = df, weights = weight, span = 0.1)
            predict(lofit, seq_len(.maxScan(object)))
        }))
    }
    ## For each sample, get the background mean trend as a function of scan
    ## for the different M/Z windows
    smooths <- lapply(.sampleNumber(object), function(s) {
        ## For bgmeanmatThisSample: rows = scans, cols = M/Z bins
        ## Each col is the background trend across scans for a particular M/Z region
        bgmeanmatThisSample <- getBackgroundMeanMatrixForSample(s)
        keepcols <- colSums(!is.na(bgmeanmatThisSample))!=0
        bgmeanmatThisSample <- bgmeanmatThisSample[,keepcols, drop = FALSE]
        ## Each col is a smoothed background trend across scans for a particular M/Z region
        bgmeanmatSmoothed <- smoothBackgroundMeans(bgmeanmatThisSample)
        mzbounds <- cbind(head(mzbreaks, -1), tail(mzbreaks, -1))
        mzbounds <- mzbounds[keepcols,,drop = FALSE]
        attr(bgmeanmatSmoothed, "mzbounds") <- mzbounds
        return(bgmeanmatSmoothed)
    })
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if(verbose) {
        message(sprintf("[backgroundCorrection] Get region-specific background trends .. done in %.1f secs.", stime))
    }

    ## Perform background correction
    setkey(rawDT, mz, sample)
    if(verbose) {
        message("[backgroundCorrection] Correct intensities")
    }
    ptime1 <- proc.time()
    bgcorrDT <- rbindlist(lapply(.sampleNumber(object), function(s) {
        bgmeanmatSmoothed <- smooths[[s]]
        mzbounds <- attr(bgmeanmatSmoothed, "mzbounds")
        rbindlist(lapply(seq_len(nrow(mzbounds)), function(i) {
            mzseq <- seq(as.integer(mzbounds[i,1]*1e5), as.integer(mzbounds[i,2]*1e5))
            bgtrend <- bgmeanmatSmoothed[,i]
            bgtrend[is.na(bgtrend)] <- 0
            rawDT[.(mzseq), nomatch = 0][sample==s][,bg := (2^bgtrend[scan])-1]
        }))
    }))
    bgcorrDT[, intensity := intensity - bg]
    bgcorrDT[, bg := NULL]
    bgcorrDT <- bgcorrDT[intensity > 0]
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if(verbose) {
        message(sprintf("[backgroundCorrection] Correct intensities .. done in %.1f secs.", stime))
    }
    out <- as(object, "CMSproc")
    .bgcorrDT(out) <- bgcorrDT
    return(out)
}

rtAlignment <- function(object, verbose = FALSE) {
    stopifnot(is(object, "CMSproc"))
    rawDT <- .rawDT(object)
    bgcorrDT <- .bgcorrDT(object)
    if(verbose) {
        message("[rtAlignment] Get rough M/Z regions to align")
    }
    ## Get density of M/Z values
    ## This density will be thresholded to yield M/Z regions
    mzdens <- density(bgcorrDT[,mz]/1e5, weights = bgcorrDT[,intensity]/sum(bgcorrDT[,intensity]), n = 2^ceiling(log2(nrow(bgcorrDT))), bw = 0.005)
    ## Get M/Z regions for a variety of quantile cutoffs
    qdens <- quantile(mzdens$y, seq(0.5,0.99,0.01))
    mzbounds <- lapply(qdens, function(cutoff) {
        ir <- whichAsIRanges(mzdens$y > cutoff)
        cbind(mzdens$x[start(ir)], mzdens$x[end(ir)])
    })
    ## Get 90th percentile of M/Z widths for each set of M/Z bounds
    p90 <- sapply(mzbounds, function(mat) {
        quantile(mat[,2] - mat[,1], 0.9)
    })
    ## What is the first cutoff index that for which the 90th percentile of M/Z widths
    ## is less than 0.05?
    ## i.e. We want the lowest cutoff such that the wide M/Z widths are not too wide
    wh <- which.max(p90 < 0.05)
    irmzr <- IRanges(start = as.integer(mzbounds[[wh]][,1]*1e5), end = as.integer(mzbounds[[wh]][,2]*1e5))
    if(verbose) {
        message("[rtAlignment] Get EICs for these regions")
    }
    ptime1 <- proc.time()
    eics <- getEICS(object, mzranges = irmzr)
    scans <- 1:.maxScan(object)
    imputeEIC <- function(eicmat) {
        ## Loop over each sample (columns) and interpolate
        do.call(cbind, lapply(seq_len(ncol(eicmat)), function(col) {
                           bool <- eicmat[,col] > 1e-6
                           if (sum(bool) < 2)
                               return(eicmat[,col])
                           approx(scans[bool], eicmat[bool,col], xout = scans, rule = 2)$y
                       }))
    }
    eicsImputed <- lapply(eics, imputeEIC)
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if(verbose) {
        message(sprintf("[rtAlignment] Get EICs for these regions .. done in %.1f secs.", stime))
        message("[rtAlignment] Find best shifts")
    }
    ptime1 <- proc.time()
    bgcorrDT[, scanorig := scan]
    rawDT[, scanorig := scan]
    shifts <- -20:20
    getCorrsByShift <- function(eicimpmat, refsamp, sampIndex) {
        sapply(shifts, function(shift) {
                if (shift < 0) {
                    x <- tail(eicimpmat[,sampIndex], shift)
                    ref <- head(eicimpmat[,refsamp], shift)
                } else if (shift==0) {
                    x <- eicimpmat[,sampIndex]
                    ref <- eicimpmat[,refsamp]
                } else {
                    x <- head(eicimpmat[,sampIndex], -shift)
                    ref <- tail(eicimpmat[,refsamp], -shift)
                }
                cor(x, ref)
            })
    }
    getSampleSpecificShiftsThisMZRegion <- function(i) {
        eicmat <- eics[[i]]
        eicimpmat <- eicsImputed[[i]]
        ## Set the reference sample as the one with the 
        ## largest total intensity in the EIC
        refsamp <- which.max(colSums(eicmat))
        bestShiftBySample <- sapply(.sampleNumber(object), function(s) {
            if (s==refsamp) {
                return(0)
            }
            corrShifts <- getCorrsByShift(eicimpmat, refsamp, s)
            if (sum(!is.na(corrShifts))==0) {
                return(0)
            }
            shifts[which.max(corrShifts)]
        })
        return(bestShiftBySample)
    }
    shiftsList <- lapply(seq_along(irmzr), getSampleSpecificShiftsThisMZRegion)
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if(verbose) {
        message(sprintf("[rtAlignment] Find best shifts .. done in %.1f secs.", stime))
        message("[rtAlignment] Remap scans")
    }
    ptime1 <- proc.time()
    setkey(bgcorrDT, mz, sample)
    setkey(rawDT, mz, sample)
    for (i in seq_along(irmzr)) {
        mzseq <- seq(start(irmzr[i]), end(irmzr[i]))
        for (s in .sampleNumber(object)) {
            rawDT[CJ(mzseq,s), shift := shiftsList[[i]][s]]
            bgcorrDT[CJ(mzseq,s), shift := shiftsList[[i]][s]]
        }
    }
    rawDT[, shift := ifelse(is.na(shift), 0, shift)]
    bgcorrDT[, shift := ifelse(is.na(shift), 0, shift)]
    rawDT[, scan := scan + shift]
    bgcorrDT[, scan := scan + shift]
    rawDT[, shift := NULL]
    bgcorrDT[, shift := NULL]
    rawDT <- rawDT[scan >= 1 & scan <= .maxScan(object)]
    bgcorrDT <- bgcorrDT[scan >= 1 & scan <= .maxScan(object)]
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if(verbose) {
        message(sprintf("[rtAlignment] Remap scans .. done in %.1f secs.", stime))
    }
    .rawDT(object) <- rawDT
    .bgcorrDT(object) <- bgcorrDT
    .rtAlign(object) <- TRUE
    return(object)
}

densityEstimation <- function(object, dgridstep = dgridstep, dbandwidth = dbandwidth, 
                              outfileDens, verbose = FALSE) {
    stopifnot(is(object, "CMSproc"))
    bgcorrDT <- object@bgcorrDT
    getDensityEstimateApprox <- function(bgcorrDT, bw = dbandwidth,
                                         gridstep = dgridstep, maxbws = 4, mzParams) {
        gridseqMz <- seq(mzParams$minMZ, mzParams$maxMZ, gridstep[1])
        gridseqScan <- seq(1, mzParams$maxScan, gridstep[2])
        ## Assign each point to a grid location
        bgcorrDT[, gmz := cut(mz/1e5, breaks = gridseqMz, labels = FALSE)]
        bgcorrDT[, gscan := cut(scan, breaks = gridseqScan, labels = FALSE, include.lowest = TRUE)]
        ## Number of grid steps to meet maxbws (density beyond is approx 0)
        ng <- maxbws*bw/gridstep
        ## Sort by M/Z grid location than scan grid location
        setkey(bgcorrDT, gmz, gscan)
        tabgmz <- table(factor(bgcorrDT[,gmz], levels = seq_along(gridseqMz)))
        if(verbose) {
            message("[getDensityEstimateApprox] Getting sparse matrix entries (M/Z)")
        }
        ptime1 <- proc.time()
        spdensmz <- lapply(seq_along(gridseqMz), function(i) {
            whg <- max(i - ng[1], 1):min(i + ng[1], length(gridseqMz))
            if (i==1) {
                start <- 1
            } else {
                start <- sum(tabgmz[seq_len(whg[1]-1)])+1
            }
            end <- (start+sum(tabgmz[whg])-1)
            if (end < start)
                return(matrix(1,0,2))
            at <- start:end
            d <- rep(dnorm((gridseqMz[i]-gridseqMz[whg])/bw[1]), times = tabgmz[whg])
            return(cbind(at,d))
        })
        ptime2 <- proc.time()
        stime <- (ptime2 - ptime1)[3]
        if(verbose) {
            message(sprintf("[getDensityEstimateApprox] Getting sparse matrix entries (M/Z) .. done in %.1f secs.", stime))
            message("[getDensityEstimateApprox] Constructing sparse matrix (M/Z)")
        }
        ptime1 <- proc.time()
        spmatmz <- sparseMatrix(
            i = do.call(c, lapply(seq_along(spdensmz), function(i) {
                               spdensmz[[i]][,1]
                           })),
            j = rep(seq_along(spdensmz), times = sapply(spdensmz, nrow)),
            x = do.call(c, lapply(seq_along(spdensmz), function(i) {
                               spdensmz[[i]][,2]
                           })),
            dims = c(nrow(bgcorrDT), length(spdensmz))
        )
        ptime2 <- proc.time()
        stime <- (ptime2 - ptime1)[3]
        if(verbose) {
            message(sprintf("[getDensityEstimateApprox] Constructing sparse matrix (M/Z) .. done in %.1f secs.", stime))
        }
        rm(spdensmz)
        if(verbose) {
            message("[getDensityEstimateApprox] Getting sparse matrix entries (scan) + computing density")
        }
        gscan <- bgcorrDT[,gscan]
        intens <- bgcorrDT[,intensity]
        intens <- intens - min(intens)
        dtgscan <- data.table(gscan = gscan, row = seq_along(gscan))
        setkey(dtgscan, gscan)
        denom <- nrow(bgcorrDT)*prod(bw)*sum(bgcorrDT[,intensity])
        getDensityEstimateSparseList <- function(gridseqScan, dtgscan, gscan, bgcorrDT, spmatmz) {
            scanChunkSize <- 20
            scanChunks <- seq(1, length(gridseqScan), scanChunkSize)
            scanChunks <- unique(c(scanChunks, length(gridseqScan)))
            densChunkList <- vector("list", length(scanChunks)-1)
            for (a in seq_len(length(scanChunks)-1)) {
                ch <- scanChunks[a]
                if (a==length(scanChunks)-1) {
                    chunkEnd <- length(gridseqScan)
                } else {
                    chunkEnd <- min(ch+scanChunkSize-1, .maxScan(object))
                }
                chunkSeq <- ch:chunkEnd
                spdensscan <- lapply(chunkSeq, function(i) {
                    whg <- max(i - ng[2], 1):min(i + ng[2], length(gridseqScan))
                    at <- dtgscan[.(whg), row, nomatch = 0]
                    ## Do intensity weighting here
                    d <- dnorm((gridseqScan[i]-gscan[at])/bw[2])*bgcorrDT[at,intensity]
                    cbind(at, d)
                })
                ## rep(1) instead of rep(i) because we're forming one column matrices
                ## otherwise dims won't be correct
                spmatscan <- sparseMatrix(
                        i = do.call(c, lapply(seq_along(spdensscan), function(i) {
                                           spdensscan[[i]][,1]
                                       })),
                        j = rep(seq_along(spdensscan), times = sapply(spdensscan, nrow)),
                        x = do.call(c, lapply(seq_along(spdensscan), function(i) {
                                           spdensscan[[i]][,2]
                                       })),
                        dims = c(nrow(bgcorrDT), length(spdensscan))
                    )
                dens <- crossprod(spmatmz, spmatscan)/denom
                densChunkList[[a]] <- dens
            }
            return(densChunkList)
        }
        ptime1 <- proc.time()
        dens <- getDensityEstimateSparseList(gridseqScan, dtgscan, gscan, bgcorrDT, spmatmz)
        ptime2 <- proc.time()
        stime <- (ptime2 - ptime1)[3]
        if(verbose) {
            message(sprintf("[getDensityEstimateApprox] Getting sparse matrix entries (scan) + computing density .. done in %.1f secs.", stime))
        }
        return(dens)
    }

    if(verbose) {
        message("[densityEstimation] Get density estimate")
    }
    ## First check to see if the density has been pre-computed
    if (!is.null(outfileDens)) {
        if (file.exists(outfileDens)) {
            if(verbose) {
                message("[getDensityEstimateApprox] Get density estimate .. loading")
            }
            load(outfileDens)
        } else {
            ptime1 <- proc.time()
            densList <- getDensityEstimateApprox(bgcorrDT = bgcorrDT, bw = dbandwidth,
                                                 gridstep = dgridstep, maxbws = 4, mzParams = object@mzParams)
            ptime2 <- proc.time()
            stime <- (ptime2 - ptime1)[3]
            if(verbose) {
                message(sprintf("[densityEstimation] Get density estimate .. done in %.1f secs.", stime))
                message("[densityEstimation] Saving density estimate")
            }
            save(densList, file = outfileDens)
        }
    } else {
        ptime1 <- proc.time()
        densList <- getDensityEstimateApprox(bgcorrDT = bgcorrDT, bw = dbandwidth,
                                             gridstep = dgridstep, maxbws = 4, mzParams = object@mzParams)
        ptime2 <- proc.time()
        stime <- (ptime2 - ptime1)[3]
        if(verbose) {
            message(sprintf("[densityEstimation] Get density estimate  .. done in %.1f secs.", stime))
        }
    }
    
    if(verbose) {
        message("[densityEstimation] Make density matrix")
    }
    ptime1 <- proc.time()
    dmat <- do.call(cbind, lapply(densList, as.matrix))
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if(verbose) {
        message(sprintf("[densityEstimation] Make density matrix .. done in %.1f secs.", stime))
    }
    rm(densList)
    rownames(dmat) <- seq(.minMZ(object), .maxMZ(object), length.out = nrow(dmat))
    colnames(dmat) <- seq(1, .maxScan(object), dgridstep[2])
    return(list(dmat = dmat))
}

bakedpi <- function(cmsRaw, dbandwidth = c(0.005, 10),
                    dgridstep = c(0.005, 1), outfileDens = NULL,
                    dortalign = FALSE, mzsubset = NULL, verbose = TRUE) {
    ## Check that bandwidth is >= gridstep for binning purposes
    stopifnot(sum(dbandwidth >= dgridstep)==2)
    .isArgumentTwoVector(dbandwidth)
    .isArgumentTwoVector(dgridstep)

    ## Set verbosity options
    subverbose <- max(as.integer(verbose) - 1L, 0)

    ## Subset if specified
    cmsRaw <- .subsetByMZ(cmsRaw, mzsubset)
    
    if(verbose) {
        message("[bakedpi] Background correction")
    }
    object <- backgroundCorrection(object = cmsRaw, verbose = subverbose)

    if (dortalign) {
        if(verbose) {
            message("[bakedpi] Retention time alignment")
        }
        object <- rtAlignment(object = object, verbose = subverbose)
    }

    if(verbose) {
        message("[bakedpi] Density estimation")
    }
    dmat <- densityEstimation(object = object, dbandwidth = dbandwidth,
                              dgridstep = dgridstep, outfileDens = outfileDens,
                              verbose = subverbose)$dmat
    .densityEstimate(object) <- dmat
    ## Compute and store density quantiles
    qs <- seq(0,1,0.001)
    .densityQuantiles(object) <- quantile(dmat[dmat!=0], qs)

    return(object)
}