R/CC_without_robScore_functions.R

Defines functions multiview_cluster_gen multiview_pam_gen multiview_kmeans_gen multi_cluster_gen multi_pam_gen multi_kmeans_gen

Documented in multi_cluster_gen multi_kmeans_gen multi_pam_gen multiview_cluster_gen multiview_kmeans_gen multiview_pam_gen

# Generation mechanisms without robustness score
# Functions accept a specific range of hyperparameters (specifically number of clusters)
# and perform several clusterings. The output, that is a matrix of clusterings, can then
# be fed to a consensus function to obtain the final clustering.

# ---------------------- Multi-Method Generation -----------------------

#' Multiple K-means generation
#'
#' @param X input data Nsample x Nfeatures
#' @param rep number of repeats
#' @param range.k vector of minimum and maximum values for k \code{c(min, max)}
#' @param method method for the choice of k at each repeat \code{c("random", "silhouette")}
#'
#' @return matrix of clusterings Nsample x Nrepeat
#'
#' @details
#' At each repeat, k is selected randomly or based on the best silhouette width from a discrete uniform distribution between range.k[1] and range.k[2].
#' Then k-means clustering is applied and result is returned.
#'
#' @examples
#' X = gaussian_clusters()$X
#' Clusters = multi_kmeans_gen(X)
#'
multi_kmeans_gen = function(X, rep = 10, range.k = c(2,5), method = "random"){

  assertthat::assert_that(rep > 0)
  assertthat::assert_that(range.k[1] > 1)
  assertthat::assert_that(range.k[2] > range.k[1])
  assertthat::assert_that(method %in% c("silhouette", "random"))

  Kmin = range.k[1]
  Kmax = range.k[2]
  distX = stats::dist(X)

  Clusters = matrix(0, nrow(X), rep)
  for (i in 1:rep){

    if (method == "silhouette"){
      Sil = rep(0, Kmax)
      for (k in Kmin:Kmax){
        clusters = stats::kmeans(X, k)$cluster
        sil = cluster::silhouette(clusters, distX)
        Sil[k] = mean(sil[,"sil_width"])
      }
      Kopt = which.max(Sil)

    }
    else if (method == "random"){
      Kopt = sample(Kmin:Kmax, 1)
    }
    else
      stop("err")

    Clusters[,i] = stats::kmeans(X, Kopt)$cluster
    # print(Kopt)
  }

  return(Clusters)
}


#' Multiple PAM (K-medoids) generation
#'
#' @param X input data Nsample x Nfeatures or distance matrix.
#' @param rep number of repeats
#' @param range.k vector of minimum and maximum values for k \code{c(min, max)}
#' @param is.distance binary balue indicating if the input \code{X} is distance
#' @param method method for the choice of k at each repeat \code{c("random", "silhouette")}
#'
#' @return matrix of clusterings Nsample x Nrepeat
#'
#' @details
#' At each repeat, k is selected randomly or based on the best silhouette width from a discrete uniform distribution between range.k[1] and range.k[2].
#' Then PAM clustering is applied and result is returned.
#'
#' @examples
#' X = gaussian_clusters()$X
#' Clusters = multi_pam_gen(X)
#'
multi_pam_gen = function(X, rep = 10, range.k = c(2,5), is.distance = FALSE, method = "random"){

  assertthat::assert_that(rep > 0)
  assertthat::assert_that(range.k[1] > 1)
  assertthat::assert_that(range.k[2] > range.k[1])
  assertthat::assert_that(method %in% c("silhouette", "random"))

  Kmin = range.k[1]
  Kmax = range.k[2]

  Clusters = matrix(0, nrow(X), rep)
  for (i in 1:rep){

    if (method == "silhouette"){
      Sil = rep(0, Kmax)
      for (k in Kmin:Kmax){
        pam_result = cluster::pam(X, k = k, diss = is.distance)
        clusters = pam_result$clustering
        Sil[k] = pam_result$silinfo$avg.width
      }
      Kopt = which.max(Sil)

    }
    else if (method == "random"){
      Kopt = sample(Kmin:Kmax, 1)
    }
    else
      stop("err")

    Clusters[,i] = cluster::pam(X, k = Kopt, diss = is.distance, cluster.only = TRUE)
    # print(Kopt)
  }

  return(Clusters)
}

#' Multiple cluster generation
#'
#' @param X input data Nsample x Nfeatures or a distance matrix
#' @param func custom function that accepts \code{X} and a parameter that return a vector of clusterings.
#' \code{cluster_func <- function(X, param)}
#' @param rep number of repeats
#' @param param vector of parameters
#' @param method method for the choice of k at each repeat \code{c("random", "silhouette")}
#'
#' @return matrix of clusterings Nsample x Nrepeat
#'
#' @details
#' At each repeat, k is selected randomly or based on the best silhouette width from a discrete uniform distribution between range.k[1] and range.k[2].
#' Then clustering is applied and result is returned.
#'
#' @examples
#' X = gaussian_clusters()$X
#' cluster_func = function(X, k){return(stats::kmeans(X, k)$cluster)}
#' Clusters = multi_cluster_gen(X, cluster_func, param = c(2,3))
#'
#'
multi_cluster_gen = function(X, func, rep = 10, param, method = "random"){

  assertthat::assert_that(rep > 0)
  assertthat::assert_that(method %in% c("silhouette", "random"))

  distX = stats::dist(X)

  Clusters = matrix(0, nrow(X), rep)
  for (i in 1:rep){

    if (method == "silhouette"){

      Sil = rep(0, length(param))
      for (k in param){
        clusters = func(X, k)
        sil = cluster::silhouette(clusters, distX)
        Sil[k] = mean(sil[,"sil_width"])
      }
      Kopt = which.max(Sil)

    }
    else if (method == "random"){
      Kopt = sample(param, 1)
    }
    else
      stop("err")

    Clusters[,i] = func(X, Kopt)
    # print(Kopt)
  }

  return(Clusters)
}


# ---------------------- Multi-Data CC -----------------------

#' Multiview K-means generation
#'
#' @param X List of input data matrices of Sample x feature. The length of \code{X} is equal to Nviews
#' @param rep number of repeats
#' @param range.k vector of minimum and maximum values for k \code{c(min, max)}
#' @param method method for the choice of k at each repeat \code{c("random", "silhouette")}
#'
#' @return matrix of clusterings Nsample x (Nrepeat x Nviews)
#'
#' @details
#' At each repeat, k is selected randomly or based on the best silhouette width from a discrete uniform distribution between range.k[1] and range.k[2].
#' Then k-means clustering is applied and result is returned.
#'
#' @examples
#' data = multiview_clusters (n = c(40,40,40), hidden.dim = 2, observed.dim = c(2,2,2),
#' sd.max = .1, sd.noise = 0, hidden.r.range = c(.5,1))
#' X_observation = data[["observation"]]
#' Clusters = multiview_kmeans_gen(X_observation)
#'
multiview_kmeans_gen = function(X, rep = 10, range.k = c(2,5), method = "random"){

  assertthat::assert_that(is.list(X))
  assertthat::assert_that(range.k[1] > 1)
  assertthat::assert_that(range.k[2] > range.k[1])

  Kmin = range.k[1]
  Kmax = range.k[2]
  Nview = length(X)

  Clusters = c()
  for (i in 1:Nview){
    X_i = X[[i]]
    cl = multi_kmeans_gen(X_i, rep = rep, range.k = range.k, method = method)
    Clusters = cbind(Clusters, cl)
  }

  return(Clusters)
}

#' Multiview PAM (K-medoids) generation
#'
#' @param X List of input data matrices of Sample x feature or distance matrices.
#' The length of \code{X} is equal to Nviews
#' @param rep number of repeats
#' @param range.k vector of minimum and maximum values for k \code{c(min, max)}
#' @param is.distance binary balue indicating if the input \code{X} is distance
#' @param method method for the choice of k at each repeat \code{c("random", "silhouette")}
#' @param sample.set  vector of samples the clustering is being applied on. can be names or indices.
#' if \code{sample.set} is \code{NA}, it considers all the datasets have the same samples with the same order
#'
#' @return matrix of clusterings Nsample x (Nrepeat x Nviews)
#'
#' @details
#' At each repeat, k is selected randomly or based on the best silhouette width from a discrete uniform distribution between range.k[1] and range.k[2].
#' Then PAM clustering is applied and result is returned.
#'
#' @examples
#' data = multiview_clusters (n = c(40,40,40), hidden.dim = 2, observed.dim = c(2,2,2),
#' sd.max = .1, sd.noise = 0, hidden.r.range = c(.5,1))
#' X_observation = data[["observation"]]
#' Clusters = multiview_pam_gen(X_observation)
#'
multiview_pam_gen = function(X, rep = 10, range.k = c(2,5), is.distance = FALSE, method = "random", sample.set = NA){

  assertthat::assert_that(is.list(X))
  assertthat::assert_that(range.k[1] > 1)
  assertthat::assert_that(range.k[2] > range.k[1])

  if (!is.na(sample.set)[1] & is.null(colnames(X[[1]])))
    stop("err")

  Kmin = range.k[1]
  Kmax = range.k[2]
  Nview = length(X)

  Clusters = c()
  for (i in 1:Nview){
    X_i = X[[i]]

    if (!is.na(sample.set)[1]){

      IntersectedSamples = intersect(sample.set, colnames(X_i))

      if (is.distance)
        X_i = X_i[IntersectedSamples,IntersectedSamples]
      else
        X_i = X_i[,IntersectedSamples]

      for (r in 1:rep){
        cl = multi_pam_gen(X_i, rep = 1, range.k = range.k, is.distance = is.distance, method = method)
        names(cl) = IntersectedSamples
        clusters = rep(0, length(sample.set))
        names(clusters) = sample.set
        clusters[names(cl)] = cl
        Clusters = cbind(Clusters, clusters)
      }

    }else{
      cl = multi_pam_gen(X_i, rep = rep, range.k = range.k, is.distance = is.distance, method = method)
      Clusters = cbind(Clusters, cl)
    }
  }
  return(Clusters)
}

#' Multiview cluster generation
#'
#' @param X List of input data matrices of Sample x feature or distance matrices.
#' The length of \code{X} is equal to Nviews
#' @param func custom function that accepts \code{X} and a parameter that return a vector of clusterings.
#' \code{cluster_func <- function(X, param)}
#' @param rep number of repeats
#' @param param vector of parameters
#' @param is.distance binary balue indicating if the input \code{X[i]} is distance
#' @param sample.set  vector of samples the clustering is being applied on. can be names or indices.
#' if \code{sample.set} is \code{NA}, it considers all the datasets have the same samples with the same order
#'
#' @return matrix of clusterings Nsample x (Nrepeat x Nviews)
#'
#' @details
#' At each repeat, k is selected randomly or based on the best silhouette width from a discrete uniform distribution between range.k[1] and range.k[2].
#' Then clustering is applied and result is returned.
#'
#' @examples
#' data = multiview_clusters (n = c(40,40,40), hidden.dim = 2, observed.dim = c(2,2,2),
#' sd.max = .1, sd.noise = 0, hidden.r.range = c(.5,1))
#' X_observation = data[["observation"]]
#' cluster_func = function(X,rep,param){return(multi_kmeans_gen(X,rep=rep,range.k=param))}
#' Clusters = multiview_cluster_gen(X_observation, func = cluster_func, rep = 10, param = c(2,4))
#'
multiview_cluster_gen = function(X, func, rep = 10, param, is.distance = FALSE, sample.set = NA){

  assertthat::assert_that(is.list(X))

  if (!is.na(sample.set)[1] & is.null(colnames(X[[1]])))
    stop("err")

  Nview = length(X)

  Clusters = c()
  for (i in 1:Nview){
    X_i = X[[i]]

    if (!is.na(sample.set)[1]){

      IntersectedSamples = intersect(sample.set, colnames(X_i))

      if (is.distance)
        X_i = X_i[IntersectedSamples,IntersectedSamples]
      else
        X_i = X_i[,IntersectedSamples]

      for (r in 1:rep){
        cl = func(X_i, rep = 1, param=param)
        names(cl) = IntersectedSamples
        clusters = rep(0, length(sample.set))
        names(clusters) = sample.set
        clusters[names(cl)] = cl
        Clusters = cbind(Clusters, clusters)
      }

    }else{
      cl = func(X_i, rep = rep, param=param)
      Clusters = cbind(Clusters, cl)
    }
  }
  return(Clusters)
}

Try the ConsensusClustering package in your browser

Any scripts or data that you put into this service are public.

ConsensusClustering documentation built on Sept. 11, 2024, 6:38 p.m.