R/cluster_param_selection.R

Defines functions PamParamSelection AgglomParamSelection ClusterParamSelection

Documented in AgglomParamSelection ClusterParamSelection PamParamSelection

#' PamParamSelection
#' @description Runs PAM with differing numbers of partitions and returns
#'  validation metrics.
#' @param dataset A transcriptomics dataset. Preferably filtered first.
#'  First columns should be gene names. All other columns should be expression
#'  levels. Not needed if an argument to distance is given.
#' @param distance A distance matrix. If a distance matrix has already been
#' created (such as by using the\code{\link{DistanceGen}} function), the matrix
#'  can be passed to this function to save time. If a distance matrix is not
#'  provided then it will be generated by the function.
#' @param k A numeric vector giving the number of clusters to be evaluated.
#' @param metric The distance metric to be used to calculate the distances
#'  between genes. See parallelDist::parDist for all accepted arguments. Also
#'  allows the option of 'abs.correlation'. Not used if a distance matrix is
#'  provided.
#' @param scale Logical. If TRUE then each gene will be scaled
#' @param nthreads The number of threads to be used for parallel computations.
#' Warning! Increasing this value will massively increase RAM usage.
#' If NULL then the maximum number of threads available will be used.
#' @examples
#' k.options <- seq(2,10)
#' pam.validation <- PamParamSelection(Laurasmappings, k=k.options, nthreads = 2)
#' @export
PamParamSelection <- function(dataset = NULL, distance = NULL, k = c(2, 5, 10),
                              metric = "euclidean", scale = TRUE, nthreads = 4){
    i <- NULL

    if (is.null(dataset) == TRUE & is.null(distance) == TRUE) {
        # Check that either a dataset or a distance matrix has been provided
        stop("Either a transcriptomics dataset or a distance matrix needs to be provided!")
    }


    if (is.null(nthreads) == TRUE) {
        # Set the threads to maximum if NULL is given as argument for nthreads
        nthreads <- parallel::detectCores()
    }

    # Load the dopar binary operator from foreach package
    `%dopar%` <- foreach::`%dopar%`

    if (is.null(distance) == TRUE) {
        dataset.sc <- CircadianTools::GeneScale(dataset, scale = scale)
        # Calculate distance matrix
        distance <- CircadianTools::DistanceGen(dataset = dataset.sc,
             metric = metric, nthreads = nthreads)
    }

    cl <- parallel::makeForkCluster(nthreads)  # Create cluster for parallelism
    doParallel::registerDoParallel(cl)


    result.df <- foreach::foreach(i = k, .combine = rbind) %dopar% {
        clust <- cluster::pam(distance, k = i)  # Run PAM
        # Calculate Dunn index
        dunn <- clValid::dunn(distance, clust$cluster)
        # Calculate connectivity
        connect <- (clValid::connectivity(distance, clust$cluster))
        # Calculate silhouette / silhouette width
        silhoutte_values <- cluster::silhouette(clust)
        silhouette <- mean(silhoutte_values[, 3])

        data.frame(i, dunn, connect, silhouette, "PAM")  # Make row of result.df
    }

    parallel::stopCluster(cl)
    # Define column headings for result.df
    colnames(result.df) <- c("k", "Dunn", "Connectivity", "Silhouette",
                             "Method")
    return(result.df)
}


#' AgglomParamSelection
#' @description Runs agglomerative hierarchical clustering with differing
#'  numbers of partitions and returns validation metrics.
#' @param dataset A transcriptomics dataset. Preferably filtered first. First
#'  columns should be gene names. All other columns should be expression levels.
#'  Not needed if an argument to distance is given.
#' @param distance A distance matrix. If a distance matrix has already been
#'  created (such as by using the \code{\link{DistanceGen}} function), the
#'  matrix can be passed to this function to save time. If a distance matrix is
#'  not provided then it will be generated by the function.
#' @param k A numeric vector giving the number of clusters to be evaluated.
#' @param metric The distance metric to be used to calculate the distances
#'  between genes. See parallelDist::parDist for all accepted arguments. Also
#'  allows the option of 'abs.correlation'. Not used if a distance matrix is
#'  provided.
#' @param scale Logical. If TRUE then each gene will be scaled
#' @param nthreads The number of threads to be used for parallel computations.
#' If NULL then the maximum number of threads available will be used.
#' @examples
#' k.options <- seq(2,10)
#' hclust.validation <- AgglomParamSelection(Laurasmappings, k=k.options,
#'                                           nthreads = 2)
#' @export
AgglomParamSelection <- function(dataset = NULL, distance = NULL,
                        k = c(2, 5, 10), scale = TRUE,  metric = "euclidean",
                        nthreads = 4) {
    i <- NULL

    if (is.null(dataset) == TRUE & is.null(distance) == TRUE) {
        # Check that either a dataset or a distance matrix has been provided
        stop("Either a transcriptomics dataset or a distance matrix needs to be provided!")
    }

    if (is.null(nthreads) == TRUE) {
        # Set the threads to maximum if none is specified
        nthreads <- parallel::detectCores()
    }

    # Load the dopar binary operator from foreach package
    `%dopar%` <- foreach::`%dopar%`

    if (is.null(distance) == TRUE) {
        # Center / scale each gene
        dataset.sc <- CircadianTools::GeneScale(dataset, scale = scale)
        distance <- DistanceGen(dataset = dataset.sc, metric = metric,
                                nthreads = nthreads)
    }

    clust <- stats::hclust(distance)  # Run hclustering

    cl <- parallel::makeForkCluster(nthreads)  # Create cluster for parallelism
    doParallel::registerDoParallel(cl)

    result.df <- foreach::foreach(i = k, .combine = rbind) %dopar% {
        cluster <- stats::cutree(clust, k = i)  # Cut tree
        # Calculate Dunn index
        dunn <- clValid::dunn(distance, cluster)
        # Calculate connectivity
        connect <- (clValid::connectivity(distance, cluster))
        # Calculate silhouette / silhouette width
        silhoutte_values <- cluster::silhouette(cluster, distance)
        silhouette <- mean(silhoutte_values[, 3])

        # Make row of result.df
        data.frame(i, dunn, connect, silhouette, "Agglomerative")

    }
    parallel::stopCluster(cl)
    # Define column headings for result.df
    colnames(result.df) <- c("k", "Dunn", "Connectivity", "Silhouette",
                             "Method")
    return(result.df)
}

#' ClusterParamSelection
#' @description Calculates validation metrics for different clustering methods
#'  and different numbers of partitions. The validation metrics are plotted.
#' @param dataset A transcriptomics dataset. Preferably filtered first. First
#'  columns should be gene names. All other columns should be expression levels.
#' @param distance A distance matrix. If a distance matrix has already been
#' created (such as by using the \code{\link{DistanceGen}} function), the matrix
#'  can be passed to this function to save time. If a distance matrix is not
#'  provided then it will be generated by the function.
#' @param k A numeric vector giving the number of clusters to be evaluated.
#' @param method The clustering method(s) to be used. Multiple methods can be
#'  considered by providing a character vector. Currently accepts 'pam',
#'  'agglom' and 'diana'.
#' @param metric The distance metric to be used to calculate the distances
#' between genes. See parallelDist::parDist for all accepted arguments. Also
#' allows the option of 'abs.correlation'. Not used if a distance matrix is
#' provided.
#' @param nthreads The number of threads to be used for parallel computations.
#' If NULL then the maximum number of threads available will be used.
#' @param save.plot Logical. If TRUE then saves the plots generated.
#' @param save.df Logical. If TRUE then saves the validation metric results as a
#'  .csv file.
#' @param scale Logical. If TRUE then each gene will be scaled.
#' @param path The directory to be used for saving plots and the validation
#' metric results to. Uses the name of the dataset object appended with
#' '_validation' if this argument is not specified.
#' @examples
#' k.options <- seq(2,10)
#' hclust.validation <- ClusterParamSelection(Laurasmappings, k=k.options,
#'                                            nthreads = 2)
#' @export
ClusterParamSelection <- function(dataset = NULL, distance = NULL,
                        k = c(2, 5, 10), method = c("pam", "agglom", "diana"),
                        metric = "euclidean", scale = TRUE, nthreads = 2,
                        save.plot = TRUE, save.df = TRUE, path = NULL) {
    Dunn <- NULL
    Method <- NULL
    Connectivity <- NULL
    Silhouette <- NULL

    if (is.null(dataset) == TRUE & is.null(distance) == TRUE) {
        # Check that either a dataset or a distance matrix has been provided
        stop("Either a transcriptomics dataset or a distance matrix needs to be provided!")
    }

    if (is.null(path) == TRUE) {
   # If a filename isn't specified then the name of the dataframe object is used
        path <- deparse(substitute(dataset))
        # Add _validation to directory
        path <- paste(path, "_validation", sep = "")
    }

    if (dir.exists(path) == FALSE) {
        dir.create(path)  # Create directory if it doesn't already exist
    }

    # Initialize dataframe to hold validation results
    validation.df <- data.frame()

    if (is.null(distance) == TRUE) {
        dataset.sc <- CircadianTools::GeneScale(dataset = dataset,
                                                scale = scale)
        distance <- CircadianTools::DistanceGen(dataset = dataset.sc,
                                        metric = metric, nthreads = nthreads)
    }


    if ("pam" %in% method == TRUE) {
        pam.results <- CircadianTools::PamParamSelection(distance = distance,
                                                k = k, nthreads = nthreads)
        # Add pam validation results if pam is specified in methods
        validation.df <- rbind(validation.df, pam.results)
    }

    if ("agglom" %in% method == TRUE) {
        hclust.results <- CircadianTools::AgglomParamSelection(distance =
                                distance, k = k, nthreads = nthreads)
        # Add agglom validation results if hclust is specified in methods
        validation.df <- rbind(validation.df, hclust.results)
    }

    if ("diana" %in% method == TRUE) {
        diana.results <- CircadianTools::DianaParamSelection(distance =
                              distance, k = k, nthreads = nthreads)
        # Add diana validation results if diana is specified in methods
        validation.df <- rbind(validation.df, diana.results)
    }

    # Vector of colours used in package
    colours.vector <- c("#008dd5", "#ffa630", "#ba1200", "#840032", "#412d6b")
    # Select number of colours required
    colours.vector <- colours.vector[1:length(method)]

    #### Dunn ####
    p <- ggplot2::ggplot(ggplot2::aes(x = k, y = Dunn, color = Method),
                         data = validation.df) + ggplot2::geom_line(size = 1.2)
    p <- p + ggplot2::theme_bw() + ggplot2::ggtitle("Dun Index Plot")
    p <- p + ggplot2::scale_colour_manual(values = colours.vector)
    p <- p + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 1))
    print(p)

    if (save.plot == TRUE) {
        ggplot2::ggsave("dunn_index.png", plot = p, path = path, width = 10,
                        height = 4.5, units = "in")  # Save the plot
    }


    #### Connectivity ####
    p <- ggplot2::ggplot(ggplot2::aes(x = k, y = Connectivity, color = Method),
                         data = validation.df) + ggplot2::geom_line(size = 1.2)
    p <- p + ggplot2::theme_bw() + ggplot2::ggtitle("Connectivity Plot")
    p <- p + ggplot2::scale_colour_manual(values = colours.vector)
    p <- p + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 1))
    print(p)

    if (save.plot == TRUE) {
        ggplot2::ggsave("connectivity_plot.png", plot = p, path = path,
                        width = 10, height = 4.5, units = "in")  # Save the plot
    }


    #### Silhouette ####
    p <- ggplot2::ggplot(ggplot2::aes(x = k, y = Silhouette, color = Method),
                         data = validation.df) + ggplot2::geom_line(size = 1.2)
    p <- p + ggplot2::theme_bw() + ggplot2::ggtitle("Silhouette Plot")
    p <- p + ggplot2::scale_colour_manual(values = colours.vector)
    p <- p + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 1))
    print(p)

    if (save.plot == TRUE) {
        ggplot2::ggsave("silhouette_plot.png", plot = p, path = path,
                        width = 10, height = 4.5, units = "in")  # Save the plot
    }

    if (save.df == TRUE) {
        # Save the validation results as .csv file
        df.filename <- paste(path, "/results.csv", sep = "")
        utils::write.csv(validation.df, file = df.filename, row.names = FALSE)
    }

    return(validation.df)
}
nathansam/CircadianTools documentation built on Dec. 26, 2019, 11:30 a.m.