R/MS2_functions.R

#### Main processMS2 routine ####

#'@import igraph
#'@import methods
#'@rawNamespace import(plotly, except = c(groups, last_plot))
#'@importFrom dplyr distinct
#'
#'@title processMS2
#'@description processMS2 processes the MSMS files generated for a given IL and
#'  performs compound identification using an MSMS reference spectra database.
#'  Only the struct id and MS2files parameters are mandatory.
#'
#'@param struct The RHermesExp object to add the processed data.
#'@param id Numeric, the id of the inclusion list to process.
#'@param MS2files Character vector of the MS2 files address.
#'@param sstype How to "process" the MS2 scans. Defaults to assuming you have
#'  acquired in continuous MS2 mode (sstype = "regular"). You can also take the
#'  most intense scan (sstype = "onescan") or even all acquired scans (sstype =
#'  "all"). Beware that "all" takes substantially longer.
#'@param useDB Whether to use an MS2 DB to match the spectra.
#'@param referenceDB Character, where to find the MSMS database to query
#'  against.
#'@param mincos Numeric, between 0 and 1, the minimum spectral cosine between
#'  query and reference spectrum for a match to be reported (see the paper for
#'  more detail on this).
#'@param minhits Numeric, minimum number of matching fragments to calculate the
#'  similarity score. If there are fewer, the match is not considered.
#'@param method Character, the similarity algorithm to use. Defaults to
#'  "cosine", but it can be changed to other metrics included in the package
#'  Philentropy.
#'@return An RHermesExp object with the identifications set in the used MS2Exp
#'  slot.
#'@examples
#'if(FALSE){
#'processMS2(myHermes, 1, c('C:/myFolder/File1.mzML',
#'                          'C:/myFolder/File2.mzML'))
#'}
#'
#'@export
setGeneric("processMS2",
        function(struct, id, MS2files, sstype = "regular", useDB = FALSE,
                    referenceDB = "D:/sp_MassBankEU_20200316_203615.RData",
                    mincos = 0.8, minhits = 1, method = "cosine") {
    standardGeneric("processMS2")
})
#' @rdname processMS2
setMethod("processMS2",
            c("RHermesExp", "numeric", "character", "character", "ANY", "ANY"),
function(struct, id, MS2files, sstype = "regular", useDB = FALSE,
            referenceDB = "D:/sp_MassBankEU_20200316_203615.RData",
            mincos = 0.8, minhits = 1, method = "cosine") {
    ## MS2 Data Importation and sorting within IL
    validObject(struct)
    MS2Exp <- struct@data@MS2Exp[[id]]
    message(paste("Starting MS/MS data importation, merging and",
                    "sorting within the IL entries"))

    #Fills in the MS2data slot
    MS2Exp@MS2Data <- MSMSimporter(MS2Exp@IL, MS2files)
    idx <- vapply(MS2Exp@MS2Data, function(x) {return(length(x) != 0)},
                logical(1))  #Which IL entries are covered by at least 1 scan
    idx <- which(idx)
    if (length(idx) == length(MS2Exp@MS2Data)) {
        message("All IL entries were covered. Nice!")
    } else {
        message("A total of ", length(MS2Exp@MS2Data) - length(idx)," (",
                round((length(MS2Exp@MS2Data) - length(idx)) /
                        length(MS2Exp@MS2Data)*100, digits = 2),
                "%) entries were not covered in the experiment")
    }

    if (length(idx) == 0) {
        stop(paste("No entries were covered in the IL.",
            "Please check the MS2 file paths and the IL object are correct"))
    }

    ##Superspectra Generation
    message("Starting superspectra generation. This may take a while...")
    purifiedSpectra <- CliqueMSMS(MS2Exp, idx, sstype = sstype)
    if (!useDB) {
        message("No spectral matching was performed. Done!")
        purifiedSpectra$results <- rep(
            "No MS2 Spectral DB matching",
            times = nrow(purifiedSpectra)
        )
        MS2Exp@Ident <- list(MS2Features = purifiedSpectra,
                            DatabaseSpectra = list(),
                            MS2_correspondance = list())
        struct@data@MS2Exp[[id]] <- MS2Exp
        struct <- setTime(struct, paste("Processed MS2Exp object", id,
                                        "and generated superspectra but",
                                        "didn't perform matching"))
        return(struct)
    }

    ##Database Query
    # Catching possible load error
    tryCatch({
        obj <- readRDS(referenceDB)
        # Data.tables for faster indexing
        metaesp <- as.data.table(obj$df_spectra)
        metamet <- as.data.table(obj$df_metabolite)
        espmet <- as.data.table(obj$df_spectraMetabolite)
        list_fragments <- obj$list_fragments
        rm(obj)
    }, error = function(cond) {
        stop("The referenceDB input isn't valid")
    })

    metamet$REFformula <- unlist(metamet$REFformula)

    setkeyv(metaesp, c("ID_spectra"))
    setkeyv(espmet, c("ID_metabolite"))
    setkeyv(metamet, c("REFformula"))

    polarity <- ifelse(struct@metadata@ExpParam@ion == "+", 1, 0)

    # Retrieving spectra
    message("Retrieving MS2 spectra from the reference database")
    allf <- purifiedSpectra$anot %>% unlist() %>% unique()
    retrievedMSMS <- lapply(allf, function(f) {
        idmet <- metamet[.(f), ]$ID_metabolite
        if (is.na(idmet[1])) return(list())
        RES <- lapply(idmet, function(id) {
            idesp <- espmet[.(id), ]$ID_spectra
            metadata <- metaesp[.(idesp), ]
            which_polarity <- metadata$REFpolarity == polarity
            spec <- list_fragments[["spectra"]][
                        list_fragments[["ID_spectra"]] %in%
                            idesp[which_polarity]
                    ]
            if (length(spec) == 0) {return(spec)}
            energies <- apply(metadata[which_polarity, c("REFCE", "REFnature")],
                                1, function(x) {
                                    paste(x, collapse = "_")
                                }
                            )
            names(spec) <- energies
            return(spec)
        })
        novalidspec <- vapply(RES, function(x) {length(x) == 0}, logical(1))
        if (any(novalidspec)) {
            RES <- RES[!novalidspec]
        if (length(RES) == 0) {
            return(list())
        }
        names(RES) <- paste(rep(f, times = nrow(metamet[.(f),]) -
                                length(which(novalidspec))),
                            metamet[.(f), ]$ID_metabolite[!novalidspec],
                            unlist(lapply(metamet[.(f), ]$REFname,
                                        function(x) {x[[1]]}))[!novalidspec],
                            metamet[.(f),]$REFsmiles[!novalidspec], sep = "#")
        } else {
        names(RES) <- paste(rep(f, times = nrow(metamet[.(f),])),
                            metamet[.(f), ]$ID_metabolite,
                            lapply(metamet[.(f), ]$REFname,
                                function(x) {x[[1]]}
                            ) %>% unlist(),
                            metamet[.(f), ]$REFsmiles, sep = "#")
        }
        return(RES)
    })
    names(retrievedMSMS) <- allf

    message(paste("Calculating", method, "similarities"))

    corresponding <- lapply(purifiedSpectra$anot, function(x){
        which(allf %in% x)
    })
    cos_list <- mapply(function(entry, reference) {
        curspec <- retrievedMSMS[reference]
        n <- unlist(lapply(curspec, names))
        curspec <- unlist(curspec, recursive = FALSE, use.names = FALSE)
        names(curspec) <- n
        cos <- rapply(curspec, function(compound) {
            calculate_MS2_distance(entry, t(compound), minhits = minhits,
                                    method = method)
        }, classes = "matrix", how = "replace")
        cos[which(lapply(cos, length) != 0)]
    }, purifiedSpectra$ssdata, corresponding)

    purifiedSpectra$results <- lapply(cos_list, function(x) {
        if (length(x) == 0) {return("Missing reference spectra")}
        isValidhit <- any(rapply(x, function(cos) {cos > mincos},
                                "numeric", "unlist"))
        if (!isValidhit) {return("No significant hits")}

        # Extracting cosines for each superspectra
        withcosines <- which(vapply(x, function(x) {
            length(x) != 0
        }, logical(1)))

        #Formula entries that have a good match
        goodf <- which(vapply(x, function(f){
            any(unlist(f) > mincos)
        }, logical(1)))

        RES <- lapply(goodf, function(f) {
            #Detect which spectra have a hit for that formula
            fnames <- names(x[f])
            resdf <- data.frame(formula = fnames, cos = numeric(1),
                                id = numeric(1),
                                stringsAsFactors = FALSE)
            coslist <- x[[f]]
            resdf$id <- which.max(vapply(coslist, function(cos) {
                cos[1]
            }, numeric(1)))
            resdf$cos <- coslist[[resdf$id]]

            return(resdf)
        }) %>% do.call(rbind, .)
        return(RES)
    })
    MS2Exp@Ident <- list(MS2Features = purifiedSpectra,
                        DatabaseSpectra = retrievedMSMS,
                        MS2_correspondance = corresponding)
    message("Done!")
    struct@data@MS2Exp[[id]] <- MS2Exp
    struct <- setTime(struct, paste("Processed MS2Exp object", id,
                                ", generated superspectra and matched against",
                                "database", referenceDB))
    return(struct)
})


MSMSimporter <- function(IL, filelist) {
    ##Extract all MSMS file data (header and peaks)
    filesdata <- mapply(function(f, n) {
        hand <- mzR::openMSfile(f)
        pks <- mzR::peaks(hand)
        h <- mzR::header(hand)
        pks <- pks[which(h$msLevel == 2)]
        h <- h[which(h$msLevel == 2), ]
        h <- cbind(h, runnum = rep(n, times = nrow(h)))
        return(list(h, pks))
    }, filelist, seq_along(filelist))

    heads <- do.call(rbind, filesdata[seq(1, length(filesdata), 2)])
    heads$totIonCurrent <- heads$totIonCurrent * heads$injectionTime / 50

    peaks <- unlist(filesdata[seq(2, length(filesdata), 2)],
                    recursive = FALSE)
    ##Organize MSMS data into the different IL entries
    mzfilter <- IL@ILParam@filtermz
    RES <- apply(IL@IL[, c(1, 2, 3)], 1, function(entry) {
        idx <- between(heads$retentionTime, entry[1], entry[2]) &
            between(heads$precursorMZ, entry[3] - mzfilter, entry[3] +
                    mzfilter)
        if (!any(idx)) {
            return(list())
        }
        subh <- heads[idx, ]
        subpks <- peaks[idx]
        subpks <- do.call(rbind, lapply(seq_along(subpks), function(x) {
            s <- subpks[[x]]
            s <- cbind(s, rep(as.numeric(subh[x, "retentionTime"]),
                        times = nrow(s)),
                        rep(as.numeric(subh[x, "runnum"]), times = nrow(s)))
            #IT scaling, 50ms as reference
            s[,2] <- s[,2] * subh[x, "injectionTime"] / 50
            return(s)
        }))
        subpks <- as.data.frame(subpks)
        names(subpks) <- c("mz", "int", "rt", "ID")
        subpks <- filter(subpks, .data$int > 1)  #Remove zeros
        if (nrow(subpks) == 0) {
            return(list())
        }
        return(list(subh, subpks))
    })
    return(RES)
}

#### MS2 spectra processing functions ####
# First it parses each IL entry annotation to store the annotated
# formulas that correspond to each entry. Then, it reads the MSMS info and
# finds consistent mass traces by sorting all data points by mz and using
# a Centwave-like algorithm.
# The derived 'pure' spectra are called superspectra. A single IL entry
# can yield different superspectra. This function is NOT to be used by
# itself. It forms part of the MSMS processing workflow.

#' @import ggplot2
CliqueMSMS <- function(MS2Exp, idx, contaminant = 173.5, delta = 0.1,
                        sstype = "regular") {
    IL <- MS2Exp@IL
    MS2list <- MS2Exp@MS2Data[idx]
    ## Parsing formulas from annotation for every IL entry
    fs <- strsplit(IL@IL$entrynames[idx], split = "#") %>%
        lapply(., function(x) {
            lapply(x, function(y) {
                res <- strsplit(y, split = "$", fixed = TRUE)[[1]][[1]]
                return(sub(pattern = " ", replacement = "", x = res))
            }) %>% unlist() %>% unique()
        })

    #Main function
    if (sstype == "regular") {
        suppressWarnings(
            RES <- bplapply(seq_along(idx), generate_ss, BPPARAM = bpparam(),
                            MS2list, contaminant, delta, fs, idx, FALSE)
        )
        RES <- do.call(rbind, RES)
        RES <- purify_ss(RES, minint = 30000, minpks = 2)
    } else if(sstype == "onescan"){
        RES <-  lapply(seq_along(idx), wrapper_failsafe_ss, idx = idx,
                        MS2list = MS2list, fs = fs)
        RES <- do.call(rbind, RES)
    } else {
        RES <-  lapply(seq_along(idx), wrapper_allscan_ss, idx = idx,
                        MS2list = MS2list, fs = fs)
        RES <- do.call(rbind, RES)
    }
    RES$start <- as.numeric(RES$start)
    RES$end <- as.numeric(RES$end)
    RES$apex <- as.numeric(RES$apex)
    return(RES)
}

#'@import igraph
generate_ss <- function(curentry, MS2list, contaminant, delta, fs, idx,
                        to_plot) {
    header <- MS2list[[curentry]][[1]]
    data <- MS2list[[curentry]][[2]]
    ## Remove weak signals (<0.5% of the most intense point)
    data <- do.call(rbind, lapply(unique(data$rt), function(t) {
        maxi <- max(data[data$rt == t, "int"])
        return(data[data$rt == t & data$int > 0.005 * maxi, ])
    }))
    if (nrow(data) == 0) {return()}

    ## Remove known contaminant signals
    for (x in contaminant) {
        data <- data[!between(data$mz, contaminant - delta,
                                contaminant + delta), ]
    }
    if (nrow(data) == 0) {return()}
    # To match with cosineSim definition
    colnames(data)[colnames(data) == "int"] <- "rtiv"

    ## Find mass traces like CentWave
    rts <- unique(data$rt)
    soi <- list()
    avgmz <- c()
    mu <- 20
    pmin <- 5
    for (i in rts) {
        if (i == rts[1]) {
            soi <- split(data[data$rt == i, ], data$mz[data$rt == i])
            avgmz <- as.numeric(names(soi))
        } else {
            curdata <- data[data$rt == i, ]
            for (j in unique(curdata$mz)) {
                dist <- abs(avgmz - j)/j * 1e+06
                if (any(dist < mu)) {
                    id <- which.min(dist)
                    soi[[id]] <- rbind(soi[[id]], curdata[curdata$mz == j, ])
                    avgmz[[id]] <- mean(soi[[id]]$mz)
                } else {
                    soi <- c(soi, list(curdata[curdata$mz == j, ]))
                    avgmz <- c(avgmz, j)
                }
            }
        }
    }
    good <- vapply(soi, function(x) {
        nrow(x) > pmin
    }, logical(1))
    soi <- soi[good]
    avgmz <- avgmz[good]

    # Trying to salvage a spectra from suboptimal data
    if (length(soi) < 3) {
        return(failsafe_ss(data, header, idx[curentry], fs[curentry]))
    }

    ## Tidying the regions
    for (i in seq_along(avgmz)) {
        soi[[i]]$mz <- round(avgmz[i], 3)
        soi[[i]] <- soi[[i]][order(soi[[i]]$rt), ]
    }

    ## Centwave peak-picking
    soipks <- lapply(soi, function(x) {
        suppressWarnings(
            df <- xcms::peaksWithCentWave(
                int = x$rtiv, rt = x$rt, snthresh = 0,
                firstBaselineCheck = FALSE, prefilter = c(0, 0),
                peakwidth = c(5, 60)
            )
        )
        df <- as.data.frame(df)
        if (nrow(df) == 0) {
            df <- data.frame(rt = 0, rtmin = min(x$rt), rtmax = max(x$rt),
                                into = 0, intb = 0, maxo = max(x$rtiv), sn = 0)
            df$categ <- "Putative"
        } else {
            df$categ <- "Peak"
            # If a peak is found, consider the rest of fragment RT intervals as
            # putative regions
            dfinterv <- unlist(df[, c("rtmin", "rtmax")])
            times <- c(min(x$rt), dfinterv[seq(1, length(dfinterv), 2)],
                        dfinterv[seq(2, length(dfinterv), 2)], max(x$rt))
            iter <- seq(1, length(times), 2)
            maxo <- mapply(function(t1, t2) {
                max(x$rtiv[between(x$rt, t1, t2)], na.rm = TRUE)
            }, times[iter], times[iter + 1]) %>% unlist()
            df <- rbind(df, data.frame(rt = 0, rtmin = times[iter],
                                        rtmax = times[iter + 1], into = 0,
                                        intb = 0, maxo = maxo, sn = 0,
                                        categ = "Putative"))
        }
        df$mz <- rep(x$mz[1], nrow(df))
        return(df)
    })
    pks <- do.call(rbind, soipks)
    if (nrow(pks) == 0) return(failsafe_ss(data, header, idx[curentry],
                                fs[curentry]))


    ## Matching data to the peaks found
    soi <- do.call(rbind, soi)
    soi$peak <- 0
    for (i in seq_len(nrow(pks))) {
        soi$peak[soi$mz == pks$mz[i] & between(soi$rt, pks$rtmin[i],
                                                pks$rtmax[i])] <- i
    }
    ## Deconvolution based on previous peak-picking -- Centwave-Propagation
    results <- centwavePropag(pks, soi)
    pks <- results[[1]]
    soi <- results[[2]]
    if (nrow(pks) == 0) return(failsafe_ss(data, header, idx[curentry],
                                fs[curentry]))


    ## Calculate all similarities
    cos <- lapply(seq_len(nrow(pks)), function(x) {
        lapply(seq_len(nrow(pks)), function(y) {
            score <- pearsonSim(soi[soi$peak == x, ],
                                            soi[soi$peak == y, ])
            if (is.na(score)) {
                score <- 0
            }
            ifelse(score > 0.3, score, 0)
        })
    }) %>% unlist(.)
    cos <- matrix(cos, nrow = nrow(pks))

    net <- igraph::graph_from_adjacency_matrix(cos, mode = "undirected",
                                                weighted = TRUE, diag = FALSE)
    net <- igraph::simplify(net, remove.multiple = TRUE, remove.loops = TRUE)


    ## Network partitioning
    comp <- components(net)
    members <- comp$membership
    for (i in unique(comp$membership)) {
        vertices <- which(comp$membership == i)
        if (length(vertices) == 1) {next}
        current_net <- induced_subgraph(net, vertices)
        partitioning <- cluster_fast_greedy(current_net)
        dens <- edge_density(current_net)
        mod <- modularity(current_net, membership(partitioning))
        if (is.nan(dens)) {
            dens <- 0
        }
        if (dens < 0.5 & mod > 0.3) {
            # 1e3 as arbitrary number to avoid membership collisions
            members[vertices] <- (1000 * i + membership(partitioning))
        }
    }

    n_mem <- length(unique(members))
    ss <- data.frame(start = numeric(n_mem), end = numeric(n_mem),
                        apex = numeric(n_mem), ILentry = numeric(n_mem),
                        precmass = numeric(n_mem))
    ss$start <- lapply(unique(members), function(id) {
        min(pks$rtmin[which(members == id)])
    })
    ss$end <- lapply(unique(members), function(id) {
        max(pks$rtmax[which(members == id)])
    })
    ss$apex <- lapply(unique(members), function(id) {
        apex <- which.max(soi$rtiv[soi$peak %in% which(members == id)])
        soi$rt[which(soi$peak %in% which(members == id))[apex]]
    })
    ss$ssdata <- lapply(unique(members), function(id) {
        return(data.frame(mz = pks$mz[which(members == id)],
                            int = pks$maxo[which(members == id)]))
    })
    ss$precmass <- rep(header$precursorMZ[1], nrow(ss))
    if (to_plot) {
        return(list(net = net, members = members, data = data, soi = soi,
                    pks = pks, ss = ss))
    }
    ss$ILentry <- rep(idx[curentry], nrow(ss))
    ss$anot <- rep(fs[curentry], nrow(ss))
    return(ss)
}


reassign_and_check <- function(pks, soi) {
    if (nrow(soi) == 0) {
        return(list(data.frame(), soi))
    }
    ## Reassign scan points to peaks
    soi$peak <- 0
    for (i in seq_len(nrow(pks))) {
        soi$peak[soi$mz == pks$mz[i] &
                    between(soi$rt, pks$rtmin[i], pks$rtmax[i])] <- i
    }

    ## Check that in each peak there are at least 5 scans
    tokeep <- vapply(seq_len(nrow(pks)), function(x) {
        length(which(soi$peak == x)) > 5
    }, logical(1))
    pks <- pks[which(tokeep), ]
    soi <- soi[soi$peak %in% which(tokeep), ]
    if (nrow(soi) == 0) {return(list(pks, soi))}

    # Reassign again
    soi$peak <- 0
    for (i in seq_len(nrow(pks))) {
        soi$peak[soi$mz == pks$mz[i] &
                    between(soi$rt, pks$rtmin[i], pks$rtmax[i])] <- i
    }
    return(list(pks, soi))
}

centwavePropag <- function(pks, soi){
    repeat ({
        rows_to_add <- data.frame(rt = numeric(0), rtmin = numeric(0),
                                    rtmax = numeric(0), into = numeric(0),
                                    intb = numeric(0), maxo = numeric(0),
                                    sn = numeric(0), categ = character(0),
                                    mz = numeric(0))
        for (i in which(pks$categ == "Peak")) {
            cur_cos <- vapply(seq_len(nrow(pks)), function(y) {
                score <- cosineSim(soi[soi$peak == i, ], soi[soi$peak == y, ])
                if (is.na(score)) {score <- 0}
                return(score)
            }, numeric(1))
            #Putative peaks that match well with a confirmed Centwave peak
            toconvert <- which(cur_cos > 0.8 & pks$categ == "Putative")
            if (length(toconvert) != 0) {
                for (j in toconvert) {
                    dfinterv <- unlist(pks[i, c("rtmin", "rtmax")])
                    x <- soi[soi$peak == j, ]
                    #Define start-end pairs
                    times <- c(min(x$rt), max(dfinterv[1], min(x$rt)),
                                min(dfinterv[2], max(x$rt)), max(x$rt))
                    iter <- seq(1, length(times), 2)

                    maxo <- mapply(function(t1, t2) {
                        max(x$rtiv[between(x$rt, t1, t2)], na.rm = TRUE)
                    }, times[iter], times[iter + 1]) %>% unlist()

                    pks[j, c("rtmin", "rtmax")] <- c(times[2], times[3])
                    pks$maxo[j] <- max(x$rtiv[between(x$rt, times[2],
                                                        times[3])])
                    pks$categ[j] <- "Peak"
                    rows_to_add <- rbind(rows_to_add,
                                        data.frame(rt = 0, rtmin = times[iter],
                                                    rtmax = times[iter + 1],
                                                    into = 0, intb = 0,
                                                    maxo = maxo, sn = 0,
                                                    categ = "Putative",
                                                    mz = pks$mz[j]))
                }
            }
        }
        rows_to_add <- rows_to_add[rows_to_add$maxo != -Inf, ]
        pks <- rbind(pks, rows_to_add)

        res <- reassign_and_check(pks, soi)
        pks <- res[[1]]
        soi <- res[[2]]

        ##Split putative peaks with >5s gaps
        for (i in which(pks$categ == "Putative")) {
            cur <- soi[soi$peak == i, c("rt", "rtiv")]
            times <- diff(cur$rt)
            jumps <- times > 3
            if (any(jumps)) {
                jumps <- which(jumps)
                splits <- data.frame(rt = numeric(0), rtmin = numeric(0),
                                        rtmax = numeric(0), into = numeric(0),
                                        intb = numeric(0), maxo = numeric(0),
                                        sn = numeric(0), categ = character(0),
                                        mz = numeric(0))
                for (j in seq_along(jumps)) {
                    if (jumps[j] == jumps[1]) {
                        pks[i, c("rtmin", "rtmax")] <- c(cur$rt[1],
                                                            cur$rt[jumps[j]])
                        pks[i, "maxo"] <- max(cur$rtiv[seq_len(jumps[j])])
                    } else {
                        splits <- rbind(splits,
                                        data.frame(rt = 0,
                                                rtmin = cur$rt[jumps[j-1]+1],
                                                rtmax = cur$rt[jumps[j]],
                                                into = 0, intb = 0,
                                                maxo = max(cur$rtiv[
                                                    seq_len(jumps[j])]),
                                                sn = 0, categ = "Putative",
                                                mz = pks$mz[i]))
                    }
                }
                pks <- rbind(pks,
                            splits,
                            data.frame(rt = 0, rtmin = cur$rt[jumps[j] + 1],
                                        rtmax = max(cur$rt), into = 0, intb = 0,
                                        maxo = max(cur$rtiv[seq((jumps[j] + 1),
                                                                nrow(cur))]),
                                        sn = 0, categ = "Putative",
                                        mz = pks$mz[i]))
            }
        }
        res <- reassign_and_check(pks, soi)
        pks <- res[[1]]
        soi <- res[[2]]
        if (nrow(pks) == 0) {break}
        if (nrow(rows_to_add) == 0) {break}
    })
    return(list(pks, soi))
}

purify_ss <- function(sslist, minint = 30000, minpks = 2) {
    good_ss <- vapply(sslist$ssdata, function(data) {
        has_int <- any(data$int > minint)
        has_many_pks <- (nrow(data) >= minpks)
        return(has_int | has_many_pks)
    }, logical(1))
    sslist <- sslist[good_ss, ]
    return(sslist)
}

wrapper_failsafe_ss <- function(curentry, idx, MS2list, fs) {
    header <- MS2list[[curentry]][[1]]
    data <- MS2list[[curentry]][[2]]
    names(data)[2] <- "rtiv"

    # Select by TIC
    besttic_rt <- header$retentionTime[which.max(header$totIonCurrent)]
    data <- data[data$rt == besttic_rt, ]
    header <- header[which.max(header$totIonCurrent), ]
    return(failsafe_ss(data, header, idx[curentry], fs[curentry]))
}

failsafe_ss <- function(data, header, idx, fs) {
    maxt <- data$rt[which.max(data$rtiv)]
    ss <- data.frame(start = numeric(1), end = numeric(1), apex = numeric(1),
                        ILentry = numeric(1), precmass = numeric(1))
    ss$start <- min(data$rt)
    ss$end <- max(data$rt)
    data <- data[data$rt == maxt, c("mz", "rtiv")]
    data <- data[data$rtiv > 0.005 * max(data$rtiv), ]
    ss$ssdata <- list(data.frame(mz = data$mz, int = data$rtiv))
    ss$apex <- maxt
    ss$precmass <- header$precursorMZ[1]
    ss$ILentry <- idx
    ss$anot <- fs
    return(ss)
}

wrapper_allscan_ss <- function(curentry, idx, MS2list, fs) {
    header <- MS2list[[curentry]][[1]]
    data <- MS2list[[curentry]][[2]]
    names(data)[2] <- "rtiv"

    # Select by TIC
    lapply(seq_len(nrow(header)), function(ss){
        rt <- header$retentionTime[ss]
        data <- data[data$rt == rt, ]
        if(nrow(data) == 0){return()}
        header <- header[ss, ]
        return(failsafe_ss(data, header, idx[curentry], fs[curentry]))
    }) %>% rbindlist()
}


match_spec <- function(P, Q, mzdiff = 0.1, minint = 0.1, minhits = 1){
    if (nrow(P) == 0 | nrow(Q) == 0) {
        stop("Invalid pattern or query")
    }
    if (is.null(dim(Q))) {
        Q <- matrix(Q, ncol = 2, byrow = TRUE)
    }
    Q <- Q[Q[, 2] > minint, , drop = FALSE]
    patint <- c()
    qint <- c()
    allmz <- unique(P$mz)
    allmz <- sort(c(P$mz,Q[, 1]))
    toignore <- c()
    for (i in seq_along(allmz)) {
        if(!i%in%toignore){
            m <- allmz[i]
            dist1 <- abs(Q[, 1] - m)
            dist2 <- abs(P[, 1] - m)
            if( any(dist1 < mzdiff) | any(dist2 < mzdiff)   ){
                seqmz <- which(abs(allmz-m)<mzdiff)
                patint <- c(patint , sum(P[which(dist2 < mzdiff), 2]))
                qint <- c(qint, sum(Q[which(dist1 < mzdiff) , 2]))
                toignore <- c(toignore,seqmz)
            }
        }
    }

    if (length(patint) < minhits | length(qint) < minhits) {
        return()
    }
    return(list(patint, qint))
}
RogerGinBer/RHermes documentation built on Nov. 6, 2022, 11:34 a.m.