R/clustering.R

Defines functions wrapperRunClustering visualizeClusters checkClusterability averageIntensities

Documented in averageIntensities checkClusterability visualizeClusters wrapperRunClustering

#' Average protein/peptide abundances for each condition studied
#'
#' @description Calculate the average of the abundances for each protein in
#' each condition for an ExpressionSet or MSnSet. Needs to have the array
#' expression data ordered in the same way as the phenotype data (columns of
#' the array data in the same order than the condition column in the phenotype
#' data).
#'
#' @param ESet_obj ExpressionSet object containing all the data
#'
#' @return a dataframe in wide format providing (in the case of 3 or more
#' conditions) the means of intensities for each protein/peptide
#' in each condition. If there are less than 3 conditions, an error message is
#' returned.
#'
#' @author Helene Borges
#'
#' @example examples/ex_averageIntensities.R
#'
#' @export
#'
#'
averageIntensities <- function(ESet_obj) {
    pkgs.require(c('Mfuzz', 'tidyr', "dplyr"))
    
    intensities <- Biobase::exprs(ESet_obj)
    sTab <- Biobase::pData(ESet_obj)
    sTab$Condition <- as.factor(sTab$Condition)
    intensities_t <- as.data.frame(t(intensities))
    intensities_t <- dplyr::bind_cols(
        intensities_t, 
        condition = sTab$Condition, 
        sample = rownames(intensities_t)
        )
    tbl_intensities <- dplyr::as_tibble(intensities_t, rownames = NA)
    longer_intensities <- tbl_intensities %>%
        tidyr::pivot_longer(-c(condition, sample), names_to = "feature", 
            values_to = "intensity")
    mean_intensities <- longer_intensities %>%
        dplyr::group_by(condition, feature) %>%
        dplyr::summarise(mean = mean(intensity))
    mean_intensities_wide <- tidyr::pivot_wider(mean_intensities,
        names_from = condition,
        values_from = mean
    )
    averaged_data <- as.data.frame(mean_intensities_wide)
    rownames(averaged_data) <- averaged_data[, 1]

    return(averaged_data)
}


#' @title xxx
#' 
#' @description The first step is to standardize the data (with the \code{Mfuzz}
#' package). Then the function checks that these data are clusterizable or not
#' (use of [diptest::dip.test()] to determine whether the distribution is
#' unimodal or multimodal). Finally, it determines the "optimal" k by
#' the Gap statistic approach.
#'
#' @param standards a matrix or dataframe containing only the standardized
#' mean intensities returned by the function [standardiseMeanIntensities()]
#'
#' @param b Parameter B of the function [gap_cluster()]
#'
#' @return a list of 2 elements:
#' * dip_test: the result of the clusterability of the data
#' * gap_cluster: the gap statistic obtained with the function
#' [cluster::clusGap()].
#'
#' @author Helene Borges
#'
#' @example examples/ex_checkClusterability.R
#'
#' @export
#'
#'
checkClusterability <- function(standards, b = 500) {
    pkgs.require(c("diptest", 'cluster', "methods"))
    

    if (methods::is(standards, "data.frame")) {
        # if there are columns with something other than numeric values
        if (!is.numeric(standards)) {
            # then we only keep the columns with numeric values
            standards <- dplyr::select_if(standards, is.numeric)
        }
        # we transform into a matrix to be usable by dip.test
        standards <- as.matrix(standards)
    } else if (!methods::is(standards, "matrix") || 
            !methods::is(standards, "data.frame")) {
        stop("Input must be a matrix or a dataframe")
    }
    # check the clusterability of the data
    dip_res <- diptest::dip.test(x = standards)
    #print("dip test done")
    # d.power = 2 corresponds to the Tibshirani criteria. B = 500 is to have
    # stable results from one simulation to another.
    gap_cluster <- cluster::clusGap(standards, 
        FUNcluster = stats::kmeans, 
        nstart = 20, 
        K.max = 10, 
        d.power = 2, 
        B = b)

    return(list(
        "dip_test" = dip_res,
        "gap_cluster" = gap_cluster
    ))
}

#' @title Visualize the clusters according to pvalue thresholds
#'
#' @param dat the standardize data returned by the function 
#' [checkClusterability()]
#'
#' @param clust_model the clustering model obtained with dat.
#'
#' @param adjusted_pValues vector of the adjusted pvalues obtained for each
#' protein with a 1-way ANOVA (for example obtained with the function
#' [wrapperClassic1wayAnova()]).
#'
#' @param FDR_th the thresholds of FDR pvalues for the coloring of the profiles.
#' The default (NULL) creates 4 thresholds: 0.001, 0.005, 0.01, 0.05
#' For the sake of readability, a maximum of 4 values can be specified.
#'
#' @param ttl title for the plot.
#'
#' @param subttl subtitle for the plot.
#'
#' @return a ggplot object
#'
#' @author Helene Borges
#'
#' @example examples/ex_visualizeClusters.R
#' @export
#'
visualizeClusters <- function(dat,
                              clust_model,
                              adjusted_pValues,
                              FDR_th = NULL,
                              ttl = "",
                              subttl = "") {

    
    pkgs.require(c("ggplot2", 'stringr', "methods", "dplyr", "reshape2"))
    

    if (is.null(FDR_th)) {
        FDR_th <- c(0.001, 0.005, 0.01, 0.05)
    } else if (length(FDR_th) > 4) {
        message("Too many FDR thresholds provided. Please do not exceed 
            4 values")
        return(NULL)
    }

    str_try <- stringr::str_glue("<{FDR_th}")
    str_max <- stringr::str_glue(">{max(FDR_th)}")

    dat$FDR_threshold <- cut(adjusted_pValues, 
        breaks = c(-Inf, FDR_th, Inf), 
        labels = c(str_try, str_max)
        )
    desc_th <- FDR_th[order(FDR_th, decreasing = TRUE)]
    str_desc <- stringr::str_glue("<{desc_th}")

    dat$FDR_threshold <- factor(dat$FDR_threshold,
        levels = c(str_max, str_desc)
    )

    dat$adjusted_pvalues <- adjusted_pValues
    if (methods::is(clust_model, "factor")) {
        dat$cluster <- clust_model
    } else if (methods::is(clust_model, "kmeans")) {
        dat$cluster <- as.factor(clust_model$cluster)
    } else if (methods::is(clust_model, "APResult")) {
        dat$cluster <- NA
        for (k in seq_len(length(clust_model@clusters))) {
            dat$cluster[clust_model@clusters[[k]]] <- k
        }
    } else {
        message("Wrong model. Currently, only the Kmeans and the Affinity
                Propagation models are supported.")
        return(NULL)
    }

    melted <- reshape2::melt(dat,
        id.vars = c("feature", "cluster", "adjusted_pvalues", "FDR_threshold"),
        value.name = "Intensity",
        variable.name = "Condition"
    )

    palette2use <- c("#DEDEDE", "#FFC125", "#FF8C00", "#CD3700", "#8B0000")
    names(palette2use) <- levels(melted$FDR_threshold)

    colScale <- ggplot2::scale_colour_manual(
        name = "FDR threshold",
        values = palette2use
    )


    melted <- dplyr::arrange(melted, dplyr::desc(adjusted_pvalues))
    return(ggplot2::ggplot(
        data = melted,
        ggplot2::aes(x = Condition, 
            y = Intensity, 
            col = FDR_threshold)) +
        ggplot2::geom_line(ggplot2::aes(group = feature)) +
        ggplot2::facet_wrap(~cluster) +
        colScale +
        ggplot2::xlab("Condition") +
        ggplot2::ylab("Standardized averaged intensity") +
        ggplot2::labs(title = ttl, subtitle = subttl) +
        ggplot2::theme(
            plot.title = ggplot2::element_text(size = 15, face = "bold"),
            plot.subtitle = ggplot2::element_text(size = 12),
            panel.spacing = ggplot2::unit(1.5, "lines"),
            panel.background = ggplot2::element_rect(fill = NA),
            panel.grid.major = ggplot2::element_line(
                linetype = "solid",
                size = 0.25,
                colour = "#f6f6f6"
            ),
            panel.grid.minor = ggplot2::element_blank()
        ))
}



#' @title clustering pipeline of protein/peptide abundance profiles.
#'
#' @description This function does all of the steps necessary to obtain a
#' clustering model and its graph from average abundances of proteins/peptides.
#'  It is possible to carry out either a kmeans model or an affinity
#'  propagation model. See details for exact steps.
#'
#' @param obj ExpressionSet or MSnSet object.
#'
#' @param clustering_method character string. Three possible values are
#' "kmeans", "affinityProp" and "affinityPropReduced. See the details section
#' for more explanation.
#'
#' @param conditions_order vector specifying the order of the Condition factor
#' levels in the phenotype data. Default value is NULL, which means that it is
#' the order of the condition present in the phenotype data of "obj" which is
#' taken to create the profiles.
#'
#' @param k_clusters integer or NULL. Number of clusters to run the kmeans
#' algorithm. If `clustering_method` is set to "kmeans" and this parameter is
#' set to NULL, then a kmeans model will be realized with an optimal number of
#' clusters `k` estimated by the Gap statistic method. Ignored for the Affinity
#'  propagation model.
#'
#' @param adjusted_pvals vector of adjusted pvalues returned by the
#' [wrapperClassic1wayAnova()]
#'
#' @param ttl the title for the final plot
#'
#' @param subttl the subtitle for the final plot
#'
#' @param FDR_thresholds vector containing the different threshold
#' values to be used to color the profiles according to their adjusted pvalue.
#' The default value (NULL) generates 4 thresholds: [0.001, 0.005, 0.01, 0.05].
#'  Thus, there will be 5 intervals therefore 5 colors: the pvalues <0.001,
#'  those between 0.001 and 0.005, those between 0.005 and 0.01, those between
#'  0.01 and 0.05, and those> 0.05. The highest given value will be considered
#'  as the threshold of insignificance, the profiles having a pvalue> this
#'  threshold value will then be colored in gray.
#'
#' @details The first step consists in averaging the abundances of
#' proteins/peptides according to the different conditions defined in the
#' phenotype data of the expressionSet / MSnSet. Then we standardize the data
#' if there are more than 2 conditions. If the user asks to realize a kmeans
#' model without specifying the desired number of
#' clusters (`clustering_method =" kmeans "` and `k_clusters = NULL`), the
#' function checks data's clusterability and estimates a number of clusters k
#' using the gap statistic method. It is advise however to specify a k for the
#' kmeans, because the gap stat gives the smallest possible k, whereas in
#' biology a small number of clusters can turn out to be uninformative.
#' If you want to run a kmeans but you don't know what number of clusters to
#' give, you can let the pipeline run the first time without specifying
#' `k_clusters`, in order to view the profiles the first time and choose by the
#' following is a more appropriate value of k.
#' If it is assumed that the data can be structured with a large number of
#' clusters, it is recommended to use the affinity propagation model instead.
#' This method simultaneously considers all the data as exemplary potentials,
#' unlike hard clustering (kmeans) which initializes with a number k of points
#' taken at random. The "affinityProp" model will use a q parameter set to NA,
#' meaning that exemplar preferences are set to the median of non-Inf values
#' in the similarity matrix (set q to 0.5 will be the same). The
#' "affinityPropReduced" model will use a q set to 0, meaning that exemplar
#' preferences are set to the sample quantile with threshold 0 of non-Inf
#' values. This should lead to a smaller number of final clusters.
#'
#' @author Helene Borges
#'
#' @return a list of 2 elements: "model" is the clustering model, "ggplot" is
#' the ggplot of profiles clustering.
#'
#' @references
#' Tibshirani, R., Walther, G. and Hastie, T. (2001). Estimating the number of
#' data clusters via the Gap statistic.
#' *Journal of the Royal Statistical Society* B, 63, 411–423.
#'
#' Frey, B. J. and Dueck, D. (2007) Clustering by passing messages between
#' data points. *Science* 315, 972-976.
#' DOI: \href{https://science.sciencemag.org/content/315/5814/972}{
#' 10.1126/science.1136800}
#'
#' @example examples/ex_wrapperRunClustering.R
#' @export
#'
#'
wrapperRunClustering <- function(
    obj,
    clustering_method,
    conditions_order = NULL,
    k_clusters = NULL,
    adjusted_pvals,
    ttl = "",
    subttl = "",
    FDR_thresholds = NULL
    ) {
    
    pkgs.require(c("stats", 'cluster', "purrr", "apcluster", 
                   "forcats", 'Mfuzz', 'dplyr', 'stringr'))
    
   

    res <- list("model" = NULL, "ggplot" = NULL)
    # reorder conditions if requested
    if (!is.null(conditions_order)) {
        .cond <- obj@phenoData@data$Condition
        .levels <- levels(forcats::as_factor(.cond))

        # check that given levels are correct in number...
        if (length(conditions_order) == length(.levels)) {
            # ...and have same labels
            if (all(conditions_order %in% .levels)) {
                obj@phenoData@data$Condition <- forcats::fct_relevel(.cond, 
                    conditions_order)
            } else {
                valid_labels <- str_c(.levels, collapse = ", ")
                message(stringr::str_glue("Wrong labels given. The valid 
                    labels are {valid_labels}"))
                return(NULL)
            }
        } else {
            y <- length(conditions_order)
            n <- length(.levels)
            message(stringr::str_glue("Wrong number of given levels. There are 
                currently {n} levels for Condition. You provided {y} levels."))
            return(NULL)
        }
    }

    averaged_means <- averageIntensities(obj)
    averaged_means <- stats::na.omit(averaged_means)

    sTab <- Biobase::pData(obj)
    only_means <- dplyr::select_if(averaged_means, is.numeric)
    only_features <- dplyr::select_if(averaged_means, is.character)
    if (length(levels(as.factor(sTab$Condition))) == 2) {
        #print("there are only two conditions")
        means <- purrr::map(purrr::array_branch(as.matrix(only_means), 1), mean)
        centered <- only_means - unlist(means)
        centered_means <- dplyr::bind_cols(
            feature = dplyr::as_tibble(only_features), 
            dplyr::as_tibble(centered)
            )

        difference <- only_means[, 1] - only_means[, 2]
        clusters <- as.data.frame(difference) %>%
            dplyr::mutate(cluster = dplyr::if_else(difference > 0, 1, 2))

        res$model <- as.factor(clusters$cluster)

        res$ggplot <- visualizeClusters(
            dat = centered_means,
            clust_model = res$model,
            adjusted_pValues = adjusted_pvals,
            FDR_th = FDR_thresholds,
            ttl = ttl,
            subttl = subttl
        )
    } else if (length(levels(as.factor(sTab$Condition))) > 2) {
        eset <- Biobase::ExpressionSet(assayData = as.matrix(only_means))
        eset_s <- Mfuzz::standardise(eset)
        standards <- eset_s@assayData$exprs

        standardized_means <- dplyr::bind_cols(
            feature = dplyr::as_tibble(only_features), 
            dplyr::as_tibble(standards)
            )
        names(standardized_means) <- c("feature", 
            names(dplyr::as_tibble(standards)))
        if (clustering_method == "affinityProp") {
            res$model <- apcluster::apcluster(apcluster::negDistMat(r = 2), 
                standards)
        } else if (clustering_method == "affinityPropReduced") {
            #print("running affinity propagation reduced algorithm")
            res$model <- apcluster::apcluster(apcluster::negDistMat(r = 2), 
                standards, q = 0)
        } else if (clustering_method == "kmeans") {
            if (is.null(k_clusters)) {
                res <- list("model" = NULL, "dip" = NULL, "ggplot" = NULL)
                checked_means <- checkClusterability(standards)
                res$dip <- checked_means$dip_test
                best_k <- cluster::maxSE(checked_means$gap_cluster$Tab[, "gap"],
                    checked_means$gap_cluster$Tab[, "SE.sim"],
                    method = "Tibs2001SEmax"
                )
                res$model <- stats::kmeans(standards, 
                    centers = best_k, 
                    nstart = 25)
            } else if (k_clusters > 1) {
                best_k <- k_clusters
                res$model <- stats::kmeans(standards, 
                    centers = best_k, 
                    nstart = 25)
            } else { # corresponds to the case k = 1 so no need for clustering
                one_cluster <- as.factor(rep_len(1, nrow(standardized_means)))
                res$ggplot <- visualizeClusters(
                    dat = standardized_means,
                    clust_model = one_cluster,
                    adjusted_pValues = adjusted_pvals,
                    FDR_th = FDR_thresholds,
                    ttl = ttl,
                    subttl = subttl
                )
                return(res)
            }
        } else {
            message("Wrong method given. Valid names are affinityProp, 
                affinityPropReduced and kmeans.")
            return(NULL)
        }
        #print("generating gglot...")
        res$ggplot <- visualizeClusters(
            dat = standardized_means,
            clust_model = res$model,
            adjusted_pValues = adjusted_pvals,
            FDR_th = FDR_thresholds,
            ttl = ttl,
            subttl = subttl
        )
    }

    return(res)
}
prostarproteomics/DAPAR documentation built on March 28, 2024, 4:44 a.m.