R/msfinder_functions.R

Defines functions import_msfinder_base_file import_msfinder_structure_file import_msfinder_formula_file import_msfinder_data

Documented in import_msfinder_base_file import_msfinder_data import_msfinder_formula_file import_msfinder_structure_file

#' Import files generated by MSFinder, both formula and structure at the same time.
#'
#' The formula and structure tables are joined on their shared identification
#' columns (including `Formula`); formula candidates without any matching
#' structure are discarded. The columns `source`, `id` (source and alignment ID
#' pasted together) and `level` are then added to the result.
#'
#' @eval recurrent_params("source", "level")
#' @return MSFinder formula and structure data joined together, or `NULL` when
#'   either file could not be imported or when no row has a structure.
import_msfinder_data <- function(source, level) {
    msf_formula   <- import_msfinder_formula_file(source, level)
    msf_structure <- import_msfinder_structure_file(source, level)

    # `||` is the scalar, short-circuiting operator meant for `if` conditions
    if (is.null(msf_formula) || is.null(msf_structure)) return(NULL)

    global <- merge(msf_formula,
                    msf_structure,
                    by = c("Alignment.ID", "Title", "MS1.count", "MSMS.count", "PRECURSORMZ", "PRECURSORTYPE", "Formula"),
                    suffixes = c(".formula", ".structure"),
                    all.x = TRUE)
    # After the left join, formula rows without a matching structure have NA in Structure: drop them
    global <- global[!is.na(global$Structure),]

    if (nrow(global) == 0) return(NULL)

    global$source <- source
    global$id <- paste0(global$source, "_", global$Alignment.ID)
    global$level <- level

    if (level == "generic") {
        global$Links <- global$Databases.structure
    } else {
        # Extraction columns from Databases.structure column
        # (MSFinder concatenates all non-necessary columns in the column "Databases")
        other_columns <- extract_concatenated_data_from_column(data.frame(global$Databases.structure))
        global <- cbind(global, other_columns)
        # if Compound_level not found, creation of the column
        if (!"Compound_level" %in% names(global)) global$Compound_level <- NA
    }

    # The raw concatenated column is no longer needed once copied or expanded
    global$Databases.structure <- NULL
    # "-" is treated as a missing-value placeholder in every column
    global[global == "-"] <- NA

    return(global)
}



#' Import a formula file generated by MSFinder.
#'
#' Looks up the "Formula" file for the given source and level, imports it with
#' `import_msfinder_base_file()`, renames `Formula.rank` to `Formula`, drops the
#' rows without a formula and keeps a fixed set of columns.
#'
#' @eval recurrent_params("source", "level")
#' @return A data.frame containing MSFinder formula data, or `NULL` when the
#'   file path is `NA`.
import_msfinder_formula_file <- function(source, level) {
    formula_path <- get_project_file_path("msfinder_data",
                                          source = source,
                                          msfinder_info = "Formula",
                                          msfinder_lvl = level)
    if (is.na(formula_path)) {
        return(NULL)
    }

    id_columns     <- c("Alignment.ID", "Title", "MS1.count", "MSMS.count", "PRECURSORMZ", "PRECURSORTYPE")
    ranked_columns <- c("Theoretical.mass", "Mass.error", "Formula.score", "Databases")

    msf <- import_msfinder_base_file(formula_path, ranked_columns)

    # The ranked "Formula.rank" column holds the candidate formula itself
    names(msf)[names(msf) == "Formula.rank"] <- "Formula"

    has_formula <- !is.na(msf$Formula)
    msf <- msf[has_formula, ]

    msf[, c(id_columns, "rank", "Formula", ranked_columns)]
}



#' Import a structure file generated by MSFinder.
#'
#' Looks up the "Structure" file for the given source and level, imports it
#' with `import_msfinder_base_file()`, renames `Structure.rank` to `Structure`,
#' drops the rows without a structure and keeps a fixed set of columns.
#'
#' @eval recurrent_params("source", "level")
#' @return A data.frame containing MSFinder structure data, or `NULL` when the
#'   file path is `NA`.
import_msfinder_structure_file <- function(source, level) {
    structure_path <- get_project_file_path("msfinder_data",
                                            source = source,
                                            msfinder_info = "Structure",
                                            msfinder_lvl = level)
    if (is.na(structure_path)) {
        return(NULL)
    }

    id_columns     <- c("Alignment.ID", "Title", "MS1.count", "MSMS.count", "PRECURSORMZ", "PRECURSORTYPE")
    ranked_columns <- c("Total.score", "Databases", "Formula", "Ontology", "InChIKey", "SMILES")

    msf <- import_msfinder_base_file(structure_path, ranked_columns)

    # The ranked "Structure.rank" column holds the candidate structure itself
    names(msf)[names(msf) == "Structure.rank"] <- "Structure"

    has_structure <- !is.na(msf$Structure)
    msf <- msf[has_structure, ]

    msf[, c(id_columns, "rank", "Structure", ranked_columns)]
}



#' Generic function for importing a file generated by MSFinder
#'
#' Reads a tab-separated MSFinder export (empty strings and "-" are read as
#' `NA`), derives `Alignment.ID` from the `File.name` column, then reshapes the
#' ranked columns (`name.1`, `name.2`, ...) into one row per identification
#' columns and `rank`, with one column per un-suffixed variable name.
#'
#' @param filepath The path of the MSFinder file to import.
#' @param columns_to_increment A list of strings indicating which columns needs to have the final number in their name incremented.
#' @return A data.frame containing MSFinder data.
import_msfinder_base_file <- function(filepath, columns_to_increment) {
    finder_data <- utils::read.csv(filepath, sep = "\t", na.strings = c("", "-"))
    finder_data$File.path <- NULL

    # Alignment ID extraction
    finder_data <- splitstackshape::cSplit(finder_data, "File.name", sep = "_", direction = "wide")  # returns a data.table
    finder_data <- splitstackshape::cSplit(finder_data,
                                           names(finder_data)[grep("File.name", colnames(finder_data))[1]], # returns 1st column File.name_X
                                           sep = " ",
                                           direction = "wide")
    # The last column is the last piece produced by the second split
    finder_data$Alignment.ID <- finder_data[, ncol(finder_data), with = FALSE]
    # NOTE(review): finder_data is a data.table at this point; a single-index `[` selects columns
    # under data.frame semantics but rows under data.table semantics - confirm this really drops
    # the File.name* columns in the package's namespace.
    finder_data <- finder_data[-grep("File.name", names(finder_data))]  # delete all columns File.name*

    # Renaming ranked elements
    names(finder_data) <- increment_strings(names(finder_data), columns_to_increment)

    # Reformating file
    finder_data <- reshape2::melt(as.data.frame(finder_data),
                                  id.vars = c("Alignment.ID", "Title", "MS1.count", "MSMS.count", "PRECURSORMZ", "PRECURSORTYPE"))

    # Extract rank from columns names ("name.1", "othername.10", "name.100").
    # The whole trailing ".<digits>" suffix is parsed, so ranks of 100 and above are not
    # truncated to their last two digits and the rank always matches the suffix removed below.
    variable_names <- as.character(finder_data$variable)
    has_rank <- grepl("\\.\\d+$", variable_names)
    rank <- rep(NA_real_, length(variable_names))                                    # NA when no numeric suffix
    rank[has_rank] <- as.numeric(sub("^.*\\.(\\d+)$", "\\1", variable_names[has_rank]))  # (1, 10, 100)
    finder_data$rank <- rank
    finder_data$variable <- gsub('\\.\\d+$', '', variable_names)                     # ("name", "othername")

    finder_data <- reshape2::dcast(finder_data,
                                   Alignment.ID + Title + MS1.count + MSMS.count + PRECURSORMZ + PRECURSORTYPE + rank ~ variable)

    return(finder_data)
}
eMetaboHUB/MS-CleanR documentation built on Jan. 3, 2024, 8:55 p.m.