#' Get samples names and class from an MSDial file and generate column names.
#'
#' @eval recurrent_params("source")
#' @return A data.frame with the samples names, their class, filetype and corresponding column name.
get_samples_info <- function(source = "pos") {
rows <- strsplit(readLines(get_project_file_path("normalized_ms_data", source = source), 5),
"\t",
fixed = TRUE)
mapping <- data.frame(Sample = rows[[5]],
Class = rows[[1]],
Filetype = rows[[2]],
stringsAsFactors = FALSE)
mapping <- mapping[!mapping$Class %in% c("", "Class", "NA"),]
mapping$Script_class <- mapping$Class
non_samples <- mapping$Filetype %in% c("Blank", "QC", "Standard")
mapping[non_samples,]$Script_class <- mapping[non_samples,]$Filetype
mapping$Script_class <- make.names(mapping$Script_class, unique = FALSE)
mapping$Column_name <- increment_strings(make.names(mapping$Script_class, unique = TRUE),
patterns = unique(mapping$Script_class))
return(mapping)
}
#' Import all data generated by MSDial.
#'
#' @eval recurrent_params("filter_blk", "filter_blk_threshold", "filter_mz", "filter_rsd", "filter_rsd_threshold", "filter_blk_ghost_peaks", "filter_rmd", "filter_rmd_range", "threshold_mz")
#' @return A list of 4 data.frames: \code{peak_data} contains the main data about detected peaks in positive and negative modes,
#' \code{links} contains MSDial links between peaks,
#' \code{detected_adducts} contains the probabilities of detection of adducts based on MSDial information.,
#' \code{samples} contains samples information (names, classes, column names).
import_msdial_data <- function(filter_blk,
filter_blk_threshold,
filter_mz,
filter_rsd,
filter_rsd_threshold,
filter_blk_ghost_peaks,
filter_rmd,
filter_rmd_range,
threshold_mz) {
analysis_modes <- get("analysis_modes", envir = mscleanrCache)
samples <- get_samples_info(source = get("analysis_modes", envir = mscleanrCache)[1])
# If samples names are incorrect and we have another mode of analysis available
if ((nrow(samples) == 0 | !("Blank" %in% samples$Filetype)) & length(get("analysis_modes", envir = mscleanrCache)) == 2) {
samples <- get_samples_info(source = get("analysis_modes", envir = mscleanrCache)[2])
}
if (nrow(samples) == 0) stop_script("Couldn't extract samples information from MSDial files.")
if (!"Blank" %in% samples$Filetype) stop_script("Can't process data without blanks.")
if ("pos" %in% analysis_modes) {
msdial_pos <- import_msdial_file("pos", samples)
msdial_pos_links <- extract_msdial_links(msdial_pos[c("Alignment.ID", "id", "Adduct.type", "Post.curation.result")])
if (filter_blk) height_pos <- import_msdial_file("pos", samples, normalized = FALSE)
}
if ("neg" %in% analysis_modes) {
msdial_neg <- import_msdial_file("neg", samples)
msdial_neg_links <- extract_msdial_links(msdial_neg[c("Alignment.ID", "id", "Adduct.type", "Post.curation.result")])
if (filter_blk) height_neg <- import_msdial_file("neg", samples, normalized = FALSE)
}
if ("pos" %in% analysis_modes & "neg" %in% analysis_modes) {
peak_data <- rbind(msdial_pos, msdial_neg)
links <- rbind(msdial_pos_links, msdial_neg_links)
if (filter_blk) height_data <- rbind(height_pos, height_neg)
} else {
if ("pos" %in% analysis_modes) {
peak_data <- msdial_pos
links <- msdial_pos_links
if (filter_blk) height_data <- height_pos
} else {
peak_data <- msdial_neg
links <- msdial_neg_links
if (filter_blk) height_data <- height_neg
}
}
peak_data <- peak_data[peak_data$MS.MS.assigned != "False",] # We work only on rows with MS/MS
peak_data$MS.MS.assigned <- NULL
links <- clear_duplicate_links(links)
# filters
if (filter_blk) { # on Height data
# ratio Blank
if("QC" %in% samples$Script_class) height_data$ratio_Blank <- height_data$avg_Blank / height_data$avg_QC
else if("Standard" %in% samples$Script_class) height_data$ratio_Blank <- height_data$avg_Blank / height_data$avg_Standard
else height_data$ratio_Blank <- height_data$avg_Blank / height_data$avg_Samples
peak_data <- merge(peak_data, height_data[, c("id", "ratio_Blank")], by = "id", all.x = TRUE)
peak_data <- peak_data[!is.na(peak_data$ratio_Blank),]
blank_peaks <- peak_data[peak_data$ratio_Blank >= filter_blk_threshold,]
export_data(blank_peaks, "deleted_blk")
peak_data <- peak_data[peak_data$ratio_Blank < filter_blk_threshold,]
if (filter_blk_ghost_peaks & nrow(blank_peaks) > 0) {
ghosts <- data.frame(mz = unique(blank_peaks$Average.Mz))
ghosts$round_mz <- round(ghosts$mz, 3)
peak_data$round_mz <- round(peak_data$Average.Mz, 3)
ghosts <- merge(peak_data, ghosts, by = "round_mz")
peak_data$round_mz <- NULL
export_data(peak_data[peak_data$id %in% unique(ghosts$id),], "deleted_ghosts")
peak_data <- peak_data[!peak_data$id %in% unique(ghosts$id),]
}
}
if (filter_mz) {
export_data(peak_data[grep("\\.[89]", peak_data$Average.Mz),], "deleted_mz")
peak_data <- peak_data[grep("\\.[89]", peak_data$Average.Mz, invert = TRUE),]
}
if (filter_rsd) {
for (class in unique(samples[samples$Script_class != "Blank",]$Script_class)) {
peak_data[paste0("rsd_", class)] <- peak_data[paste0("sd_", class)] * 100 / peak_data[paste0("avg_", class)]
if (nrow(is.na(peak_data[paste0("rsd_", class)])) > 0) {
peak_data[is.na(peak_data[paste0("rsd_", class)]), paste0("rsd_", class)] <- 100
}
}
peak_data$rsd_min <- apply(peak_data[startsWith(names(peak_data), "rsd_")], 1, min, na.rm = TRUE)
export_data(peak_data[peak_data$rsd_min >= filter_rsd_threshold,], "deleted_rsd")
peak_data <- peak_data[peak_data$rsd_min < filter_rsd_threshold,]
}
if (filter_rmd) {
peak_data$rmd <- (peak_data$Average.Mz %% 1) / peak_data$Average.Mz * 1000000
export_data(peak_data[peak_data$rmd < filter_rmd_range[1] | peak_data$rmd > filter_rmd_range[2],], "deleted_rmd")
peak_data <- peak_data[peak_data$rmd >= filter_rmd_range[1] & peak_data$rmd <= filter_rmd_range[2],]
}
# Computing adducts weight
# Strong hypothesis: [M+H]+ / [M-H]- still there (otherwise there is a problem)
detected_adducts <- merge(links,
peak_data,
by.x = "id.1",
by.y = "id",
all.x = TRUE)[, c("source", "id.1", "id.2", "simple.nature", "Adduct.type.1", "Adduct.type.2")]
detected_adducts <- detected_adducts[detected_adducts$simple.nature == "pol / adduct" & !is.na(detected_adducts$source),]
# Case when no adducts are selected in MSDial
if (nrow(detected_adducts) == 0) {
detected_adducts <- data.frame(source = c("pos", "neg"),
adduct = c("[M+H]+", "[M-H]-"),
p = c(1, 1))
} else {
detected_adducts <- rbind(data.frame(source = detected_adducts$source,
id = detected_adducts$id.1,
adduct = as.character(detected_adducts$Adduct.type.1)),
data.frame(source = detected_adducts$source,
id = detected_adducts$id.2,
adduct = as.character(detected_adducts$Adduct.type.2)))
if("pos" %in% analysis_modes & !"[M+H]+" %in% detected_adducts$adduct) stop_script("[M+H]+ not found.")
if("neg" %in% analysis_modes & !"[M-H]-" %in% detected_adducts$adduct) stop_script("[M-H]- not found.")
detected_adducts <- unique(detected_adducts)
detected_adducts <- detected_adducts %>% dplyr::group_by(source, .data$adduct) %>% dplyr::summarise(n = dplyr::n())
detected_adducts <- merge(detected_adducts,
peak_data %>% dplyr::group_by(source) %>% dplyr::summarise(n_source = dplyr::n()),
by = "source")
detected_adducts$p <- detected_adducts$n / detected_adducts$n_source
}
# identification data
index.fill <- which(names(peak_data) == make.names("Fill %"))
index.sn <- which(names(peak_data) == make.names("S/N average"))
identification_cols <- names(peak_data)[(index.fill+1):(index.sn-1)] # add "Metabolite name" ?
identified_peaks <- peak_data[!is.na(peak_data$Dot.product), c("id", identification_cols)]
peak_data <- peak_data[!names(peak_data) %in% identification_cols]
peak_data$identified_by_MSDial <- ifelse(peak_data$id %in% identified_peaks$id, TRUE, FALSE)
return(list(peak_data = peak_data,
links = links,
identified_peaks = identified_peaks,
detected_adducts = detected_adducts[, c("source", "adduct", "p")],
samples = samples))
}
#' Import an MS file generated by MSDial.
#'
#' Needs MSDial v4.00 or higher.
#'
#' @eval recurrent_params("source")
#' @param samples A data.frame containing the samples information.
#' @param normalized Whether to get the Normalized MS-DIAL file or the Height MS-DIAL file (used for blank filtering).
#' @return A data.frame containing MSDial MS data.
import_msdial_file <- function(source, samples, normalized = TRUE) {
if (normalized) path <- get_project_file_path("normalized_ms_data", source = source)
else path <- get_project_file_path("height_ms_data", source = source)
dial <- utils::read.csv(path, sep = "\t", skip = 4, na.strings = c("", "NA", "null"))
# column names: order is samples, avg then std
corr <- unique(samples[, c("Class", "Script_class")])
corr$Class <- make.names(corr$Class, unique = FALSE) # to replace special characters with dots
# avg / std
for (class in corr$Class) {
names(dial)[names(dial) == class] <- paste0("avg_", corr[corr$Class == class,]$Script_class)
names(dial)[names(dial) == paste0(class, ".1")] <- paste0( "sd_", corr[corr$Class == class,]$Script_class)
}
# samples
names(dial)[(ncol(dial) - length(samples$Sample) - 2*nrow(corr) + 1):(ncol(dial) - 2*nrow(corr))] <- samples$Column_name
# average of all samples
is_sample <- !samples$Script_class %in% c("QC", "Blank", "Standard")
if(nrow(samples[is_sample,]) >= 1) {
dial$avg_Samples <- rowMeans(data.frame(dial[samples[is_sample,]$Column_name]))
}
dial$Metabolite.name <- as.character(dial$Metabolite.name)
dial$source <- as.factor(source)
dial$id <- as.character(paste0(source, "_", dial$Alignment.ID))
return(dial)
}
#' Extract links between elements identified by MSDial.
#'
#' @param msdial_data A 4-columns data.frame with for each peak the alignment id, the global id, the adduct ion name (column "Adduct.type" by default) and the links detected by MSDial (column "Post.curation.result" by default).
#' @return A 6-columns data.frame of links: id of 1st peak, id of 2nd peak, nature, adduct ion name for 1st peak, adduct ion name for 2nd peak, correlation.
extract_msdial_links <- function(msdial_data) {
links <- msdial_data
names(links)[names(links) == "Alignment.ID"] <- "Alignment.ID.1"
names(links)[names(links) == "id"] <- "id.1"
names(links)[names(links) == "Adduct.type"] <- "Adduct.type.1"
links <- splitstackshape::cSplit(links, "Post.curation.result", sep = ";", direction = "long") # returns a data.table
# extracting _ids ou ids_
links$Alignment.ID.2 <- stringr::str_extract(links$Post.curation.result, "_[0-9]+")
links[is.na(links$Alignment.ID.2),]$Alignment.ID.2 <- stringr::str_extract(links[is.na(links$Alignment.ID.2),]$Post.curation.result,
"[0-9]+_")
links$Alignment.ID.2 <- as.numeric(gsub("_", "", links$Alignment.ID.2))
links <- merge(links,
msdial_data[,c("Alignment.ID", "id", "Adduct.type")],
by.x = "Alignment.ID.2",
by.y = "Alignment.ID")
names(links)[names(links) == "Adduct.type"] <- "Adduct.type.2"
# cleaning (nature, double links, and column names)
links$Post.curation.result <- gsub("_[0-9]+$", "", links$Post.curation.result)
links$Post.curation.result <- gsub(" linked to [0-9]+_\\[", " \\[", links$Post.curation.result)
names(links)[names(links) == "id"] <- "id.2"
links$correlation <- 1
links$simple.nature <- gsub("adduct .+", "pol / adduct", links$Post.curation.result)
return(links[, c("id.1", "id.2", "simple.nature", "Adduct.type.1", "Adduct.type.2", "correlation")])
}
#' Clear duplicate undirected MSDial links.
#'
#' @param links A 6-columns data.frame of links: id of 1st peak, id of 2nd peak, nature, adduct ion name for 1st peak, adduct ion name for 2nd peak, correlation.
#' @return \code{links} without duplicate links.
clear_duplicate_links <- function(links) {
links <- data.table::as.data.table(links)
# swap Adduct.type
links$tmp <- links$Adduct.type.1
links[links$id.1 > links$id.2,]$Adduct.type.1 <- links[links$id.1 > links$id.2,]$Adduct.type.2
links[links$id.1 > links$id.2,]$Adduct.type.2 <- links[links$id.1 > links$id.2,]$tmp
# swap id
links$tmp <- links$id.1
links[links$tmp > links$id.2,]$id.1 <- links[links$tmp > links$id.2,]$id.2
links[links$tmp > links$id.2,]$id.2 <- links[links$tmp > links$id.2,]$tmp
links$tmp <- NULL
# suppression doublons
links <- links %>% dplyr::distinct()
links <- links[!is.na(links$id.1) & !is.na(links$id.2),]
if (nrow(links[is.nan(links$correlation),]) > 0) links[is.nan(links$correlation),]$correlation <- 1
return(as.data.frame(links))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.