R/PeakFilling.R

Defines functions PeakFilling

Documented in PeakFilling

#' Peak filling of any missed peaks
#'
#' This functions detects which samples (after grouping) are missing from every peak group and reanalyses the raw data to verify whether this peak is actually non-existent for this sample 
#'
#' @param Y.grouped Peaks groups (output from the 'PeakGrouper' function).
#' @param Y.spec The raw NMR spectra in matrix format.
#' @param max.index.shift Maximal shift in index between a filled peak and the group it belongs to.
#' @param window.width The width of the detection window for the wavelets. Because of the Fourier transform lengths of 512 ( window.width = 'small') of 1024 ( window.width = 'large') are preferable.
#' @param nCPU The amount of cpu's to be used for peak detection. If set to '-1' all available cores minus 1 will be used.
#' @param FilMethod A more robust method for peak filling has been implemented. This is now the default. The former method can be used by specifying FilMethod == "old" however this will be deprecated.
#'
#' @return Returns a data frame with grouped peaks and possibly extra peaks obtained from the raw data (these peaks have SNR = NA).
#'
#' @author Charlie Beirnaert, \email{charlie.beirnaert@@uantwerpen.be}
#'
#' @examples
#' subset <- GetWinedata.subset()
#' # to reduce the example time we only select spectra 1 & 2
#' subset.spectra = as.matrix(subset$Spectra)[1:2,] 
#' subset.ppm = as.numeric(subset$PPM)
#' 
#' test.peaks <- getWaveletPeaks(Y.spec=subset.spectra, 
#'                               X.ppm=subset.ppm,
#'                               nCPU = 1) # nCPU set to 1 for the vignette build
#'
#' test.grouped <- PeakGrouper(Y.peaks = test.peaks)
#'
#' test.filled <- PeakFilling(Y.grouped = test.grouped, 
#'                            Y.spec = subset.spectra,  
#'                            nCPU = 1) # nCPU set to 1 for the vignette build
#'
#' 
#' @export
#' 
#' @importFrom MassSpecWavelet tuneInPeakInfo
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom data.table rbindlist
#' @importFrom stats median na.omit
#' @importFrom foreach %dopar% foreach
#' @importFrom parallel makeCluster stopCluster
#' @importFrom doSNOW registerDoSNOW

#'
PeakFilling <- function(Y.grouped, Y.spec, max.index.shift = 10, window.width = "small", nCPU = -1, FilMethod = "new") {
    
    
    groups <- unique(Y.grouped$peakIndex)
    
    Samples <- sort(unique(Y.grouped$Sample))
    nSamples <- length(Samples)
    print(paste("There are", as.character(nSamples), "samples in the dataset."))
    
    if ("small" %in% window.width & !"large" %in% window.width | missing(window.width)) {
        FFTwindow <- 512
    } else if ("large" %in% window.width & !"small" %in% window.width) {
        FFTwindow <- 1024
    } else {
        warning("'window.width' is defined ambiguously or wrong, set to default small.")
        FFTwindow <- 512
    }
    
    if (nCPU == -1) {
        nCPU <- parallel::detectCores(all.tests = FALSE, logical = TRUE) - 1
    }
    if(nCPU > nSamples){
        nCPU <- nSamples
    }
    
    if(FilMethod == "old"){
        warning("The old peak filling method will be deprecated in the next versions.")
    }
    
    groups.to.fill <- rep(NA, length(groups))
    # determine which groups (peakIndex) do not contain a peak for every sample
    for (g in seq_along(groups)) {
        if (length(unique(Y.grouped$Sample[Y.grouped$peakIndex == groups[g]])) != nSamples) 
            groups.to.fill[g] <- groups[g]
    }
    groups.to.fill <- as.numeric(stats::na.omit(groups.to.fill))
    
    if (length(groups.to.fill) == 0) {
        print("No peak group filling performed. All samples are present in every group. If this is unexpected see whether setting the 'nSamples' parameter manually.")
        return(Y.grouped)
    } else {
        
        
        
        if(length(groups.to.fill) > nSamples){
            group.division = cut(seq(1,length(groups.to.fill)),breaks=nSamples,labels=FALSE)
        }else{
            group.division = rep(1,length(groups.to.fill))
        }
        
        nRuns = length(unique(group.division))
        
        cl <- parallel::makeCluster(nCPU)
        #doParallel::registerDoParallel(cl)
        doSNOW::registerDoSNOW(cl)
        FilledList <- list()
        Parcounter <- NULL
        pb <- txtProgressBar(max=nRuns, style=3)
        progress <- function(n) setTxtProgressBar(pb, n)
        opts <- list(progress=progress)
        
        if(FilMethod == "old"){
            FilledList <- foreach::foreach(Parcounter = 1:nRuns, .options.snow=opts, .inorder = FALSE, .packages = c("stats", "MassSpecWavelet")) %dopar% 
            {
                
                groups.to.fill.subset = groups.to.fill[group.division == Parcounter]
                Peaks.filled <- list()
                
                for (gg in seq_along(groups.to.fill.subset)) {
                    # for (gg in 1:length(28)){
                    grpdata <- Y.grouped[Y.grouped$peakIndex == groups.to.fill.subset[gg], ]
                    
                    Mn.ppm <- mean(grpdata$peakPPM)
                    Med.index <- stats::median(grpdata$peakIndex)
                    Med.scale <- stats::median(grpdata$peakScale)
                    
                    samples.to.fill <- Samples[!Samples %in% grpdata$Sample]
                    filled <- matrix(ncol = 6, nrow = length(samples.to.fill))
                    
                    for (k in seq_along(samples.to.fill)) {
                        
                        res <- tryCatch({
                            res.in.try <- MassSpecWavelet::tuneInPeakInfo(Y.spec[samples.to.fill[k], (Med.index - 
                                                                                                          floor((FFTwindow - 1)/2)):(Med.index + ceiling((FFTwindow - 1)/2))], peakIndex = FFTwindow/2, 
                                                                          peakScale = round(Med.scale)) 
                            # This rounding of Med.scale is very important as Massspecwavelet gives warnings in 
                            # the function tuneInPeakInfo -> getLocalMaximumCWT -> localMaximum
                        }, warning = function(warn) {
                            res.in.try <- rep(NA, 6)  # this will result in FWHM=-1 and resolving_power = -1 for this run
                            return(res.in.try)
                        }, error = function(err) {
                            res.in.try <- rep(NA, 6)  # this will result in FWHM=-1 and resolving_power = -1 for this run
                            return(res.in.try)
                        })  # end trycatch
                        
                        
                        res[sapply(res, is.null)] <- NA
                        # !important after sapply <- NA! : sometimes the $peakCenterIndex from tuneInPeakInfo returns 'numeric(0)', throw it away
                        if (length(res) != 6) {
                            res <- rep(NA, 6)
                        }
                        if(length(res) != length(unlist(res))){
                            res <- rep(NA, 6)
                        }
                        res <- unlist(res)
                        filled[k, ] <- res
                        
                    }
                    colnames(filled) <- c("peakIndex", "peakValue", "peakCenterIndex", "peakSNR", "peakScale", 
                                          "unProcessedPeak")
                    filled <- data.frame(filled)
                    filled$Sample <- samples.to.fill
                    filled$peakPPM <- Mn.ppm
                    filled$grp.index <- Med.index
                    
                    # delete the new peaks who's center index is shifted too far from the original center of the group
                    # (larger than +/- max.index.shift)
                    filled <- filled[filled$peakCenterIndex <= (FFTwindow/2 + max.index.shift) & filled$peakCenterIndex >= 
                                         (FFTwindow/2 - max.index.shift), , drop = F]
                    # delete the ones where tuneInPeakInfo returned 6 NA
                    filled <- filled[rowSums(is.na(filled[, 1:6])) != 6, , drop = F]
                    # delete the peaks where the wavelet search returned NaN for peakValue
                    filled <- filled[!is.nan(filled$peakValue), ]
                    
                    Peaks.filled[[gg]] <- filled
                    
                    # update progress bar
                    #utils::setTxtProgressBar(pb, gg)
                }
                
                
                
                Peaks.filled <- data.table::rbindlist(Peaks.filled)
                Peaks.filled <- data.frame(Peaks.filled$grp.index, Peaks.filled$peakPPM, Peaks.filled$peakValue, 
                                           Peaks.filled$peakSNR, Peaks.filled$peakScale, Peaks.filled$Sample)
                colnames(Peaks.filled) <- colnames(Y.grouped)
                return(Peaks.filled)   
            }
        } else{
            FilledList <- foreach::foreach(Parcounter = 1:nRuns, .options.snow=opts, .inorder = FALSE, .packages = c("stats", "MassSpecWavelet")) %dopar% 
            {
                
                groups.to.fill.subset = groups.to.fill[group.division == Parcounter]
                Peaks.filled <- list()
                
                for (gg in seq_along(groups.to.fill.subset)) {
                    grpdata <- Y.grouped[Y.grouped$peakIndex == groups.to.fill.subset[gg], 
                                         ]
                    Mn.ppm <- mean(grpdata$peakPPM)
                    Med.index <- stats::median(grpdata$peakIndex)
                    Med.scale <- stats::median(grpdata$peakScale)
                    samples.to.fill <- Samples[!Samples %in% grpdata$Sample]
                    filled <- matrix(ncol = 6, nrow = length(samples.to.fill))
                    
                    
                    fill_index_start <- Med.index - floor((FFTwindow - 1)/2)
                    fill_index_stop <- Med.index + ceiling((FFTwindow - 1)/2)
                    
                    
                    if(fill_index_start < 0 | fill_index_stop > ncol(Y.spec)){
                        index_seq <- seq(from = fill_index_start, to = fill_index_stop, by = 1)
                        index_seq <- index_seq[index_seq >= 1 & index_seq <= ncol(Y.spec)] 
                        
                        
                        fill_index_start <- index_seq[1]
                        fill_index_stop <- tail(index_seq,1)
                        peak_fill_location <- which(index_seq == Med.index)
                    }else{
                        peak_fill_location <- FFTwindow/2
                    }
                    
                    scales = seq(1, 16, 1)
                    for (k in seq_along(samples.to.fill)) {
                        res <- tryCatch({
                            Peaks_again = MassSpecWavelet::peakDetectionCWT(Y.spec[samples.to.fill[k], fill_index_start:fill_index_stop], 
                                                                            scales = scales, 
                                                                            SNR.Th = -1, nearbyPeak = TRUE)
                            PeaksInfo <- Peaks_again$majorPeakInfo
                            if( length(PeaksInfo$allPeakIndex > (peak_fill_location - max.index.shift) & PeaksInfo$allPeakIndex < (peak_fill_location + max.index.shift))>0){
                                peak_ind = as.numeric(which(PeaksInfo$allPeakIndex > (peak_fill_location - max.index.shift) & PeaksInfo$allPeakIndex < (peak_fill_location + max.index.shift)))
                                
                                peak_location = as.numeric(PeaksInfo$allPeakIndex[peak_ind])
                                peak_scale =  as.numeric(PeaksInfo$peakScale[peak_ind])
                            }
                            
                            res.in.try <- MassSpecWavelet::tuneInPeakInfo(Y.spec[samples.to.fill[k], fill_index_start:fill_index_stop], peakScale = peak_scale, peakIndex=peak_location)
                            
                            
                        }, warning = function(warn) {
                            res.in.try <- rep(NA, 6)
                            return(res.in.try)
                        }, error = function(err) {
                            res.in.try <- rep(NA, 6)
                            return(res.in.try)
                        })
                        res[sapply(res, is.null)] <- NA
                        if (length(res) != 6) {
                            res <- rep(NA, 6)
                        }
                        if (length(res) != length(unlist(res))) {
                            res <- rep(NA, 6)
                        }
                        res <- unlist(res)
                        filled[k, ] <- res
                    }
                    colnames(filled) <- c("peakIndex", "peakValue", 
                                          "peakCenterIndex", "peakSNR", "peakScale", 
                                          "unProcessedPeak")
                    filled <- data.frame(filled)
                    filled$Sample <- samples.to.fill
                    filled$peakPPM <- Mn.ppm
                    filled$grp.index <- Med.index
                    filled <- filled[rowSums(is.na(filled[, 1:6])) != 
                                         6, , drop = F]
                    filled <- filled[!is.na(filled$peakValue), ]
                    Peaks.filled[[gg]] <- filled
                }
                
                
                
                Peaks.filled <- data.table::rbindlist(Peaks.filled)
                Peaks.filled <- data.frame(Peaks.filled$grp.index, Peaks.filled$peakPPM, Peaks.filled$peakValue, 
                                           Peaks.filled$peakSNR, Peaks.filled$peakScale, Peaks.filled$Sample)
                colnames(Peaks.filled) <- colnames(Y.grouped)
                return(Peaks.filled)   
            }
        }
        
        
        
        
        
        close(pb)  # close progress bar
        parallel::stopCluster(cl)
        FilledList = data.table::rbindlist(FilledList)
        
        
        Y.filled <- rbind(Y.grouped, FilledList)
        Y.filled <- data.frame(Y.filled)
        return(Y.filled)
    }
}

Try the speaq package in your browser

Any scripts or data that you put into this service are public.

speaq documentation built on May 23, 2022, 5:06 p.m.