R/msdial_functions.R

Defines functions clear_duplicate_links extract_msdial_links import_msdial_file import_msdial_data get_samples_info

Documented in clear_duplicate_links extract_msdial_links get_samples_info import_msdial_data import_msdial_file

#' Get samples names and class from an MSDial file and generate column names.
#'
#' @eval recurrent_params("source")
#' @return A data.frame with the samples names, their class, filetype and corresponding column name.
get_samples_info <- function(source = "pos") {
    rows <- strsplit(readLines(get_project_file_path("normalized_ms_data", source = source), 5),
                     "\t",
                     fixed = TRUE)
    mapping <- data.frame(Sample   = rows[[5]],
                          Class    = rows[[1]],
                          Filetype = rows[[2]],
                          stringsAsFactors = FALSE)
    mapping <- mapping[!mapping$Class %in% c("", "Class", "NA"),]

    mapping$Script_class <- mapping$Class
    non_samples <- mapping$Filetype %in% c("Blank", "QC", "Standard")
    mapping[non_samples,]$Script_class <- mapping[non_samples,]$Filetype
    mapping$Script_class <- make.names(mapping$Script_class, unique = FALSE)
    mapping$Column_name <- increment_strings(make.names(mapping$Script_class, unique = TRUE),
                                             patterns = unique(mapping$Script_class))
    return(mapping)
}



#' Import all data generated by MSDial.
#'
#' @eval recurrent_params("filter_blk", "filter_blk_threshold", "filter_mz", "filter_rsd", "filter_rsd_threshold", "filter_blk_ghost_peaks", "filter_rmd", "filter_rmd_range", "threshold_mz")
#' @return A list of 4 data.frames: \code{peak_data} contains the main data about detected peaks in positive and negative modes,
#'                                  \code{links} contains MSDial links between peaks,
#'                                  \code{detected_adducts} contains the probabilities of detection of adducts based on MSDial information.,
#'                                  \code{samples} contains samples information (names, classes, column names).
import_msdial_data <- function(filter_blk,
                               filter_blk_threshold,
                               filter_mz,
                               filter_rsd,
                               filter_rsd_threshold,
                               filter_blk_ghost_peaks,
                               filter_rmd,
                               filter_rmd_range,
                               threshold_mz) {
    analysis_modes <- get("analysis_modes", envir = mscleanrCache)
    samples <- get_samples_info(source = get("analysis_modes", envir = mscleanrCache)[1])
    # If samples names are incorrect and we have another mode of analysis available
    if ((nrow(samples) == 0 | !("Blank" %in% samples$Filetype)) & length(get("analysis_modes", envir = mscleanrCache)) == 2) {
        samples <- get_samples_info(source = get("analysis_modes", envir = mscleanrCache)[2])
    }
    if (nrow(samples) == 0) stop_script("Couldn't extract samples information from MSDial files.")
    if (!"Blank" %in% samples$Filetype) stop_script("Can't process data without blanks.")

    if ("pos" %in% analysis_modes) {
        msdial_pos <- import_msdial_file("pos", samples)
        msdial_pos_links <- extract_msdial_links(msdial_pos[c("Alignment.ID", "id", "Adduct.type", "Post.curation.result")])
        if (filter_blk) height_pos <- import_msdial_file("pos", samples, normalized = FALSE)
    }
    if ("neg" %in% analysis_modes) {
        msdial_neg <- import_msdial_file("neg", samples)
        msdial_neg_links <- extract_msdial_links(msdial_neg[c("Alignment.ID", "id", "Adduct.type", "Post.curation.result")])
        if (filter_blk) height_neg <- import_msdial_file("neg", samples, normalized = FALSE)
    }
    if ("pos" %in% analysis_modes & "neg" %in% analysis_modes) {
        peak_data <- rbind(msdial_pos, msdial_neg)
        links     <- rbind(msdial_pos_links, msdial_neg_links)
        if (filter_blk) height_data <- rbind(height_pos, height_neg)
    } else {
        if ("pos" %in% analysis_modes) {
            peak_data <- msdial_pos
            links     <- msdial_pos_links
            if (filter_blk) height_data <- height_pos
        } else {
            peak_data <- msdial_neg
            links     <- msdial_neg_links
            if (filter_blk) height_data <- height_neg
        }
    }
    peak_data <- peak_data[peak_data$MS.MS.assigned != "False",]  # We work only on rows with MS/MS
    peak_data$MS.MS.assigned <- NULL
    links <- clear_duplicate_links(links)

    # filters
    if (filter_blk) {  # on Height data
        # ratio Blank
             if("QC"       %in% samples$Script_class) height_data$ratio_Blank <- height_data$avg_Blank / height_data$avg_QC
        else if("Standard" %in% samples$Script_class) height_data$ratio_Blank <- height_data$avg_Blank / height_data$avg_Standard
        else                                          height_data$ratio_Blank <- height_data$avg_Blank / height_data$avg_Samples

        peak_data <- merge(peak_data, height_data[, c("id", "ratio_Blank")], by = "id", all.x = TRUE)
        peak_data <- peak_data[!is.na(peak_data$ratio_Blank),]
        blank_peaks <- peak_data[peak_data$ratio_Blank >= filter_blk_threshold,]
        export_data(blank_peaks, "deleted_blk")
        peak_data <- peak_data[peak_data$ratio_Blank < filter_blk_threshold,]

        if (filter_blk_ghost_peaks & nrow(blank_peaks) > 0) {
            ghosts <- data.frame(mz = unique(blank_peaks$Average.Mz))
            ghosts$round_mz <- round(ghosts$mz, 3)
            peak_data$round_mz <- round(peak_data$Average.Mz, 3)
            ghosts <- merge(peak_data, ghosts, by = "round_mz")
            peak_data$round_mz <- NULL
            export_data(peak_data[peak_data$id %in% unique(ghosts$id),], "deleted_ghosts")
            peak_data <- peak_data[!peak_data$id %in% unique(ghosts$id),]
        }
    }
    if (filter_mz) {
        export_data(peak_data[grep("\\.[89]", peak_data$Average.Mz),], "deleted_mz")
        peak_data <- peak_data[grep("\\.[89]", peak_data$Average.Mz, invert = TRUE),]
    }
    if (filter_rsd) {
        for (class in unique(samples[samples$Script_class != "Blank",]$Script_class)) {
            peak_data[paste0("rsd_", class)] <- peak_data[paste0("sd_", class)] * 100 / peak_data[paste0("avg_", class)]
            if (nrow(is.na(peak_data[paste0("rsd_", class)])) > 0) {
                peak_data[is.na(peak_data[paste0("rsd_", class)]), paste0("rsd_", class)] <- 100
            }
        }
        peak_data$rsd_min <- apply(peak_data[startsWith(names(peak_data), "rsd_")], 1, min, na.rm = TRUE)
        export_data(peak_data[peak_data$rsd_min >= filter_rsd_threshold,], "deleted_rsd")
        peak_data <- peak_data[peak_data$rsd_min < filter_rsd_threshold,]
    }
    if (filter_rmd) {
        peak_data$rmd <- (peak_data$Average.Mz %% 1) / peak_data$Average.Mz * 1000000
        export_data(peak_data[peak_data$rmd < filter_rmd_range[1] | peak_data$rmd > filter_rmd_range[2],], "deleted_rmd")
        peak_data <- peak_data[peak_data$rmd >= filter_rmd_range[1] & peak_data$rmd <= filter_rmd_range[2],]
    }

    # Computing adducts weight
    # Strong hypothesis: [M+H]+ / [M-H]- still there (otherwise there is a problem)
    detected_adducts <- merge(links,
                              peak_data,
                              by.x = "id.1",
                              by.y = "id",
                              all.x = TRUE)[, c("source", "id.1", "id.2", "simple.nature", "Adduct.type.1", "Adduct.type.2")]
    detected_adducts <- detected_adducts[detected_adducts$simple.nature == "pol / adduct" & !is.na(detected_adducts$source),]

    # Case when no adducts are selected in MSDial
    if (nrow(detected_adducts) == 0) {
        detected_adducts <- data.frame(source = c("pos", "neg"),
                                       adduct = c("[M+H]+", "[M-H]-"),
                                       p      = c(1, 1))
    } else {
        detected_adducts <- rbind(data.frame(source = detected_adducts$source,
                                             id     = detected_adducts$id.1,
                                             adduct = as.character(detected_adducts$Adduct.type.1)),
                                  data.frame(source = detected_adducts$source,
                                             id     = detected_adducts$id.2,
                                             adduct = as.character(detected_adducts$Adduct.type.2)))

        if("pos" %in% analysis_modes & !"[M+H]+" %in% detected_adducts$adduct) stop_script("[M+H]+ not found.")
        if("neg" %in% analysis_modes & !"[M-H]-" %in% detected_adducts$adduct) stop_script("[M-H]- not found.")

        detected_adducts <- unique(detected_adducts)
        detected_adducts <- detected_adducts %>% dplyr::group_by(source, .data$adduct) %>% dplyr::summarise(n = dplyr::n())
        detected_adducts <- merge(detected_adducts,
                                  peak_data  %>% dplyr::group_by(source) %>% dplyr::summarise(n_source = dplyr::n()),
                                  by = "source")
        detected_adducts$p <- detected_adducts$n / detected_adducts$n_source
    }

    # identification data
    index.fill <- which(names(peak_data) == make.names("Fill %"))
    index.sn <- which(names(peak_data) == make.names("S/N average"))
    identification_cols <- names(peak_data)[(index.fill+1):(index.sn-1)] # add "Metabolite name" ?
    identified_peaks <- peak_data[!is.na(peak_data$Dot.product), c("id", identification_cols)]
    peak_data <- peak_data[!names(peak_data) %in% identification_cols]
    peak_data$identified_by_MSDial <- ifelse(peak_data$id %in% identified_peaks$id, TRUE, FALSE)

    return(list(peak_data        = peak_data,
                links            = links,
                identified_peaks = identified_peaks,
                detected_adducts = detected_adducts[, c("source", "adduct", "p")],
                samples          = samples))
}



#' Import an MS file generated by MSDial.
#'
#' Needs MSDial v4.00 or higher.
#'
#' @eval recurrent_params("source")
#' @param samples A data.frame containing the samples information.
#' @param normalized Whether to get the Normalized MS-DIAL file or the Height MS-DIAL file (used for blank filtering).
#' @return A data.frame containing MSDial MS data.
import_msdial_file <- function(source, samples, normalized = TRUE) {
    if (normalized) path <- get_project_file_path("normalized_ms_data", source = source)
    else            path <- get_project_file_path("height_ms_data", source = source)
    dial <- utils::read.csv(path, sep = "\t", skip = 4, na.strings = c("", "NA", "null"))

    # column names: order is samples, avg then std
    corr <- unique(samples[, c("Class", "Script_class")])
    corr$Class <- make.names(corr$Class, unique = FALSE)  # to replace special characters with dots
    # avg / std
    for (class in corr$Class) {
        names(dial)[names(dial) == class]               <- paste0("avg_", corr[corr$Class == class,]$Script_class)
        names(dial)[names(dial) == paste0(class, ".1")] <- paste0( "sd_", corr[corr$Class == class,]$Script_class)
    }
    # samples
    names(dial)[(ncol(dial) - length(samples$Sample) - 2*nrow(corr) + 1):(ncol(dial) - 2*nrow(corr))] <- samples$Column_name

    # average of all samples
    is_sample <- !samples$Script_class %in% c("QC", "Blank", "Standard")
    if(nrow(samples[is_sample,]) >= 1) {
        dial$avg_Samples <- rowMeans(data.frame(dial[samples[is_sample,]$Column_name]))
    }

    dial$Metabolite.name <- as.character(dial$Metabolite.name)
    dial$source <- as.factor(source)
    dial$id <- as.character(paste0(source, "_", dial$Alignment.ID))

    return(dial)
}



#' Extract links between elements identified by MSDial.
#'
#' @param msdial_data A 4-columns data.frame with for each peak the alignment id, the global id, the adduct ion name (column "Adduct.type" by default) and the links detected by MSDial (column "Post.curation.result" by default).
#' @return A 6-columns data.frame of links: id of 1st peak, id of 2nd peak, nature, adduct ion name for 1st peak, adduct ion name for 2nd peak, correlation.
extract_msdial_links <- function(msdial_data) {
    links <- msdial_data
    names(links)[names(links) == "Alignment.ID"] <- "Alignment.ID.1"
    names(links)[names(links) == "id"] <- "id.1"
    names(links)[names(links) == "Adduct.type"] <- "Adduct.type.1"
    links <- splitstackshape::cSplit(links, "Post.curation.result", sep = ";", direction = "long")  # returns a data.table
    # extracting _ids ou ids_
    links$Alignment.ID.2 <- stringr::str_extract(links$Post.curation.result, "_[0-9]+")
    links[is.na(links$Alignment.ID.2),]$Alignment.ID.2 <- stringr::str_extract(links[is.na(links$Alignment.ID.2),]$Post.curation.result,
                                                                               "[0-9]+_")
    links$Alignment.ID.2 <- as.numeric(gsub("_", "", links$Alignment.ID.2))
    links <- merge(links,
                   msdial_data[,c("Alignment.ID", "id", "Adduct.type")],
                   by.x = "Alignment.ID.2",
                   by.y = "Alignment.ID")
    names(links)[names(links) == "Adduct.type"] <- "Adduct.type.2"
    # cleaning (nature, double links, and column names)
    links$Post.curation.result <- gsub("_[0-9]+$", "", links$Post.curation.result)
    links$Post.curation.result <- gsub(" linked to [0-9]+_\\[", " \\[", links$Post.curation.result)
    names(links)[names(links) == "id"] <- "id.2"
    links$correlation <- 1
    links$simple.nature <- gsub("adduct .+", "pol / adduct", links$Post.curation.result)
    return(links[, c("id.1", "id.2", "simple.nature", "Adduct.type.1", "Adduct.type.2", "correlation")])
}



#' Clear duplicate undirected MSDial links.
#'
#' @param links A 6-columns data.frame of links: id of 1st peak, id of 2nd peak, nature, adduct ion name for 1st peak, adduct ion name for 2nd peak, correlation.
#' @return \code{links} without duplicate links.
clear_duplicate_links <- function(links) {
    links <- data.table::as.data.table(links)
    # swap Adduct.type
    links$tmp <- links$Adduct.type.1
    links[links$id.1 > links$id.2,]$Adduct.type.1 <- links[links$id.1 > links$id.2,]$Adduct.type.2
    links[links$id.1 > links$id.2,]$Adduct.type.2 <- links[links$id.1 > links$id.2,]$tmp
    # swap id
    links$tmp <- links$id.1
    links[links$tmp > links$id.2,]$id.1 <- links[links$tmp > links$id.2,]$id.2
    links[links$tmp > links$id.2,]$id.2 <- links[links$tmp > links$id.2,]$tmp
    links$tmp <- NULL
    # suppression doublons
    links <- links %>% dplyr::distinct()
    links <- links[!is.na(links$id.1) & !is.na(links$id.2),]
    if (nrow(links[is.nan(links$correlation),]) > 0) links[is.nan(links$correlation),]$correlation <- 1
    return(as.data.frame(links))
}
eMetaboHUB/MS-CleanR documentation built on Jan. 3, 2024, 8:55 p.m.