R/combine_feature_intensities.R

Defines functions combine_feature_intensities

Documented in combine_feature_intensities

#' Combine feature intensities of several spectra
#'
#' Load extracted information about features such as intensities, peak
#' detection and measured mz values and combine them with phenotype information
#' into a \code{\link[SummarizedExperiment]{RangedSummarizedExperiment-class}}
#' object. Features are removed if they have missing mz values or intensities
#' for all samples or constant intensity across all samples.
#'
#' @param feature.files Character vector. Names of files with feature
#' intensities as generated by \code{\link{extract_feature_intensity}}.
#' @param id.samples Character vector. Identifiers of each file. If NULL, ids
#' will be identified as basenames of files.
#' @param pheno data.frame. Information about each sample. Rownames need to
#' correspond to id.samples. If NULL, pheno will only contain ids.
#' @param cor.cutoff Numeric. Cutoff of the pair-wise absolute correlation
#' (default: 0.9).
#' @param verbose Logical. Print out additional information.
#'
#' @return
#' \code{\link[SummarizedExperiment]{RangedSummarizedExperiment-class}}
#' object.
#'
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @export
#'
#' @examples
#' data("info.features")
#' res.dir = tempdir()
#' mzml.files = dir(system.file("extdata",
#'                              package = "preprocessHighResMS"),
#'                  full.names = TRUE)
#'
#' sapply(mzml.files,
#'        extract_feature_intensity,
#'        scanrange = c(1, 2),
#'        info.features = info.features,
#'        ppm = 20,
#'        res.dir = res.dir)
#'
#' feature.files = dir(path = res.dir,
#'                     pattern = ".rds",
#'                     full.names = TRUE)
#' se.example = combine_feature_intensities(feature.files = feature.files,
#'                                          verbose = TRUE)


combine_feature_intensities <- function(feature.files,
                                        id.samples = NULL,
                                        pheno = NULL,
                                        cor.cutoff = 0.9,
                                        verbose = FALSE) {
    if (is.null(id.samples)) {
        ## use basename of files as ids
        id.samples = gsub(".rds", "", basename(feature.files))
    }
    if (length(id.samples) != length(feature.files)) {
        stop("feature.files and id.samples need to have the same length!")
    }
    if (is.null(pheno)) {
        pheno = data.frame(id = id.samples,
                           stringsAsFactors = FALSE)
        rownames(pheno) = pheno$id
    }
    if (!all(id.samples %in% rownames(pheno))) {
        stop("rownames of pheno need to contain all id.samples!")
    }

    ## save intensities, peak information and mz values as matrices
    int = peak.detection = mz = NULL
    for (f in feature.files) {
        temp = readRDS(f)

        int = cbind(int, temp$intensity)
        peak.detection = cbind(peak.detection, temp$peak.detected)
        mz = cbind(mz, temp$mz.meas)

        if (is.null(rownames(int))) {
            rownames(int) = rownames(peak.detection) = rownames(mz) = temp$id
        } else {
            if (!all.equal(rownames(int), temp$id)) {
                stop("different ids!")
            }
        }
    }
    names(id.samples) = NULL
    colnames(int) = colnames(peak.detection) = colnames(mz) = id.samples

    int.log2 = log2(int + 1)

    se = SummarizedExperiment(
        assays = list(
            intensity = int,
            intensity.log2 = int.log2,
            peak.detection = peak.detection,
            mz = mz
        ),
        colData = pheno[id.samples, , drop = FALSE]
    )
    if (verbose) {
        print(paste(nrow(se), "features identified across samples"))
    }

    ## remove features with missing mz values and intensities for all samples
    ## and features with constant intensity across all samples
    se = remove_features(
        se = se,
        assay = "mz",
        method = "missing",
        freq = 1
    )
    se = remove_features(
        se = se,
        assay = "intensity",
        method = "missing",
        freq = 1
    )
    if (verbose) {
        print(paste(
            nrow(se),
            "features after filtering based on missingness"
        ))
    }
    se = remove_features(
        se = se,
        assay = "intensity",
        method = "constant",
        freq = 1
    )
    if (verbose) {
        print(paste(
            nrow(se),
            "features after filtering based on constant intensities"
        ))
    }

    return(se)
}
szymczak-lab/preprocessHighResMS documentation built on Oct. 6, 2020, 12:50 a.m.