MS-CleanR: Cleans MS Data for Easier Analyses

Documented in launch_msfinder_annotation

#' Annotates peaks based on files extracted from MSFinder.
#'
#' @eval recurrent_params("compound_levels", "biosoc_levels", "score_only", "other_peaks_warning")
#' @param levels_scores A list of levels names and their corresponding multiplier to adapt final annotation scores.
#'
#' @section Architecture needed by launch_msfinder_annotation in the project_directory:
#' \itemize{
#'   \item pos
#'   \itemize{
#'     \item msf_X
#'     \describe{
#'       \item{Formula<...>.txt}{Formulas for possible identifications in the X biosoc level, exported from MSFinder}
#'       \item{Structure<...>.txt}{Structures for possible identifications in the X biosoc level, exported from MSFinder}
#'     }
#'   }
#'   \item neg
#'   \itemize{
#'     \item msf_X
#'     \describe{
#'       \item{Formula<...>.txt}{Formulas for possible identifications in the X biosoc level, exported from MSFinder}
#'       \item{Structure<...>.txt}{Structures for possible identifications in the X biosoc level, exported from MSFinder}
#'     }
#'   }
#'   \item intermediary_data
#'   \describe{
#'     \item{samples.csv}{Information on samples}
#'     \item{final_peaks_data.csv}{Peaks remaining after all filters have been applied}
#'     \item{links_ms-final.csv}{Links between peaks indicated by MSDial (global) and adducts/neutral losses detection (by cluster)}
#'   }
#' }
#'
#' @section Output files of launch_msfinder_annotation in the project directory:
#' \describe{
#'   \item{annotated_data-cleaned.csv}{Final auto-annotated data with only relevant peaks}
#' }
#' \itemize{
#'   \item intermediary_data
#'   \describe{
#'     \item{identifying_data.csv}{Union of MSDial and MSFinder data, with clusters info}
#'     \item{annotated_data.csv}{Final auto-annotated data, complete with all possibilities}
#'     \item{annotated_data-manual_check.csv}{Final auto-annotated data for clusters needing manual checking (only created in this case)}
#'   }
#' }
#'
#' @export
launch_msfinder_annotation <- function(compound_levels     = NULL,  # c() = NULL
                                       biosoc_levels       = c("generic"),
                                       levels_scores       = NULL,
                                       score_only          = FALSE,
                                       other_peaks_warning = FALSE) {

    check_input_parameters_launch_msfinder(biosoc_levels, compound_levels, levels_scores)
    check_architecture_for_launch_msfinder_annotation(biosoc_levels)
#    export_params(as.list(environment()))

    print_message("*** Treating ", get("analysis_directory", envir = mscleanrCache), " ***")
    if (score_only) {
        print_message("Annotating of MSFINDER scores only.")
    } else {
        print_message("Annotating with",
                      length(compound_levels),
                      "compound levels (",
                      paste(compound_levels, collapse = ", "),
                      ") and",
                      length(biosoc_levels),
                      "biosource levels (",
                      paste(biosoc_levels, collapse = ", "),
                      ").")
    }

    samples     <- import_data("samples")
    final_data  <- import_data("final_peaks_data")
    final_links <- import_data("links_ms_final")

    # Samples check
    for(s in samples$Column_name) {
        if(!(s %in% names(final_data))) stop_script("Sample '", s, "' not found in peaks data file.")
    }

    # Merging all MSFinder files
    msfinder_data <- data.frame()
    for(source in get("analysis_modes", envir = mscleanrCache)) {
        for(level in c(biosoc_levels, "generic")) {
            suppressWarnings(
                msf_tmp <- import_msfinder_data(source, level)
            )
            # Dealing with potentially missing columns
            if (!is.null(msf_tmp) & length(names(msfinder_data)) > 0) {
                main_names <- names(msfinder_data)
                tmp_names  <- names(msf_tmp)
                for (name in main_names) if (!name %in% tmp_names)  msf_tmp[name]       <- NA
                for (name in tmp_names)  if (!name %in% main_names) msfinder_data[name] <- NA
            }
            if (!is.null(msf_tmp)) {
                suppressWarnings(  # Column class mismatch for 'Classyfire_subclass'. Converting column to class 'character'.
                    msfinder_data <- gtools::smartbind(msfinder_data, msf_tmp)
                )
            }
        }
    }

    if(nrow(msfinder_data) == 0) stop_script("No usable MSFinder data found in ",
                                             get("analysis_directory", envir = mscleanrCache),
                                             ".")

    # Deleting duplicates present in several levels
    msfinder_data <- msfinder_data %>% dplyr::distinct(.data$id,
                                                       .data$Formula,
                                                       .data$Structure,
                                                       .data$SMILES,
                                                       .data$InChIKey,
                                                       .keep_all = TRUE)


    # Merging with MSDial data
    msd_main_cols <- c("cluster", "cluster.size", "source", "id", "Alignment.ID", "Average.Rt.min.",
                       "Average.Mz", "Adduct.type", "MS.MS.spectrum")
    msf_main_cols <- c("level", "rank.formula", "Formula", "rank.structure", "Structure", "Total.score")
    msf_other_cols <- names(msfinder_data)[!names(msfinder_data) %in% c(msf_main_cols,
                                                                        "source", "id", "Alignment.ID", "Databases.formula")]

    identifying_data <- merge(final_data, msfinder_data, by = c("id", "source", "Alignment.ID"), all.x = TRUE)
    identifying_data <- identifying_data[, c(msd_main_cols, msf_main_cols, msf_other_cols, samples$Column_name)]
    # sorting
    biosoc_levels <- biosoc_levels[biosoc_levels != "generic"]
    identifying_data$level <- ordered(identifying_data$level, levels = c(biosoc_levels, "generic"))
    identifying_data <- identifying_data[with(identifying_data, order(cluster, Alignment.ID, level, -xtfrm(Total.score))),]
    export_data(identifying_data, "identifying_data")


    # AUTOMATIC ANNOTATION
    identifying_data$Total.score        <- as.numeric(identifying_data$Total.score)
    identifying_data$rowid              <- 1:nrow(identifying_data)
    identifying_data$annotation         <- NA
    identifying_data$annotation_result  <- NA
    identifying_data$annotation_warning <- NA

    # Clusters with a pair [M+H]+ / [M-H]-
    print_message("*** Annotating clusters with [M+H]+ / [M-H]- couples ***")
    m_pairs <- final_links[final_links$Adduct.1 %in% c("[M+H]+", "[M-H]-")
                           & final_links$Adduct.2 %in% c("[M+H]+", "[M-H]-")
                           & final_links$Adduct.1 != final_links$Adduct.2,]
    m_pairs <- unique(m_pairs[c("CpdID.1", "CpdID.2", "cluster.1", "cluster.2", "Mass.1", "Mass.2")])
    m_pairs <- m_pairs[m_pairs$CpdID.1 %in% final_data$id & m_pairs$CpdID.2 %in% final_data$id,]
    m_pairs <- m_pairs[m_pairs$cluster.1 == m_pairs$cluster.2 & !is.na(m_pairs$cluster.1),]
    freq <- dplyr::count(m_pairs, .data$cluster.1)
    for(cluster_id in freq[freq$n == 1,]$cluster.1) {
        identifying_data <- annotate_cluster(identifying_data,
                                             cluster_id,
                                             couple_ids         = c(m_pairs[m_pairs$cluster.1 == cluster_id,]$CpdID.1,
                                                                    m_pairs[m_pairs$cluster.1 == cluster_id,]$CpdID.2),
                                             compound_levels     = compound_levels,
                                             biosoc_levels       = biosoc_levels,
                                             score_only          = score_only,
                                             other_peaks_warning = other_peaks_warning)
    }

    # Clusters with several pairs [M+H]+ / [M-H]-, we only consider the pair having the highest mass
    for(cluster_id in freq[freq$n > 1,]$cluster.1) {
        ids_1 <- unique(unlist(m_pairs[m_pairs$cluster.1 == cluster_id,]$CpdID.1))
        ids_2 <- unique(unlist(m_pairs[m_pairs$cluster.1 == cluster_id,]$CpdID.2))
        max_mass <- max(final_data[final_data$id %in% c(ids_1, ids_2),]$Average.Mz)
        couple_max <- m_pairs[m_pairs$cluster.1 == cluster_id & (m_pairs$Mass.1 == max_mass | m_pairs$Mass.2 == max_mass),]
        if(length(couple_max$CpdID.1) == 1) {
            identifying_data <- annotate_cluster(identifying_data,
                                                 cluster_id,
                                                 couple_ids          = c(couple_max$CpdID.1, couple_max$CpdID.2),
                                                 compound_levels     = compound_levels,
                                                 biosoc_levels       = biosoc_levels,
                                                 score_only          = score_only,
                                                 other_peaks_warning = other_peaks_warning)
        } else {
            # If several pairs with highest mass, we consider all peaks in these pairs
            identifying_data <- annotate_cluster(identifying_data,
                                                 cluster_id,
                                                 couple_ids          = c(ids_1, ids_2),
                                                 compound_levels     = compound_levels,
                                                 biosoc_levels       = biosoc_levels,
                                                 score_only          = score_only,
                                                 other_peaks_warning = other_peaks_warning)
        }
    }

    # Reste
    print_message("*** Annotating remaining clusters ***")
    for(cluster_id in levels(factor(identifying_data[is.na(identifying_data$annotation_result),]$cluster))) {
        identifying_data <- annotate_cluster(identifying_data,
                                             cluster_id,
                                             compound_levels     = compound_levels,
                                             biosoc_levels       = biosoc_levels,
                                             score_only          = score_only,
                                             other_peaks_warning = other_peaks_warning)
    }


    # Adapting scores
    identifying_data$Final.score <- as.numeric(identifying_data$Total.score)
    if (!score_only) {
        for (level in names(levels_scores)) identifying_data <- update_annotations_scores(identifying_data, level, levels_scores[[level]])
    }
    msf_main_cols <- c(msf_main_cols, "Final.score")  # adding Final.score
    identifying_data <- identifying_data[, c(c("cluster", "id", "annotation_result", "annotation", "annotation_warning"),
                                             msd_main_cols[!msd_main_cols %in% c("cluster", "id", "MS.MS.spectrum")],
                                             msf_main_cols,
                                             msf_other_cols,
                                             samples$Column_name,
                                             "MS.MS.spectrum"),]
    identifying_data <- identifying_data[with(identifying_data, order(cluster, Alignment.ID, -xtfrm(Final.score))),]
    export_data(identifying_data, "annotated_data", empty_na = TRUE)


    # Exporting final annotations
    if(nrow(identifying_data[!is.na(identifying_data$annotation_warning),]) > 0) {
        export_data(identifying_data[!is.na(identifying_data$annotation_warning),],
                    "annotated_data-manual_check",
                    empty_na = TRUE)
    }

    final_annotations <- identifying_data[which(identifying_data$annotation),]
    names(final_annotations)[names(final_annotations) == "id"] <- "selected_feature"
    export_data(final_annotations[!names(final_annotations) %in% c("annotation", "rank.formula", "rank.structure")],
                "annotated_data-cleaned",
                empty_na = TRUE)


    # Normalization of remaining peaks
    final_annotations[samples$Column_name] <- scale(final_annotations[samples$Column_name],
                                                    center=FALSE,
                                                    scale=colSums(final_annotations[samples$Column_name]))
    final_annotations[samples$Column_name] <- round(final_annotations[samples$Column_name] * 1000, 4)

    export_data(final_annotations[!names(final_annotations) %in% c("annotation", "rank.formula", "rank.structure")],
                "annotated_data-normalized",
                empty_na = TRUE)

}