R/clustering.R

Defines functions clustInd_spc clustInd_spc_aux clustInd_kkmeans clustInd_kkmeans_aux clustInd_kmeans clustInd_kmeans_aux kmeans_mahal clustInd_hierarch clustInd_hierarch_aux

Documented in clustInd_hierarch clustInd_kkmeans clustInd_kmeans clustInd_spc

#' Perform hierarchical clustering for a given combination of indices,
#' method and distance
#'
#' @param ind_data Dataframe containing indices applied to the original data and
#' its first and second derivatives.
#' @param vars vector with a combinations of indices in \code{ind_data}.
#' @param method The aglomerative method to be used in hierarchical clustering.
#' @param dist The distance method to be used.
#' @param n_cluster Number of clusters to create.
#' @param true_labels Vector of true labels for validation
#' (if it is not known true_labels is set to NULL)
#'
#' @return A \code{list} containing clustering results and execution time.
#'
#' @noRd
clustInd_hierarch_aux <- function(ind_data, vars, method = "single",
                                  dist = "euclidean", n_cluster = 2,
                                  true_labels = NULL) {
  # Check if input is a data frame
  if (!is.data.frame(ind_data)) {
    stop("Input 'ind_data' must be a data frame.")
  }

  # Check if variables exist in the data frame
  if (!all(vars %in% names(ind_data))) {
    stop("Invalid variable name.")
  }

  if (!all(method %in% c("single", "complete", "average", "centroid", "ward.D2"))) {
    stop("Invalid method name.")
  }

  if (!all(dist %in% c("euclidean", "manhattan"))) {
    stop("Invalid distance name.")
  }

  t0 <- Sys.time()

  # Perform hierarchical clustering
  d <- stats::dist(ind_data[, vars], method = dist)
  met <- stats::hclust(d, method = method)
  clus <- stats::cutree(met, k = n_cluster)

  t1 <- Sys.time()

  # Calculate execution time
  et <- data.frame(difftime(t1, t0, "secs"))

  if (is.null(true_labels)) {
    res <- list("cluster" = clus, "time" = as.numeric(et))
  } else {
    valid <- clustering_validation(true_labels, clus)
    res <- list("cluster" = clus, "valid" = valid, "time" = as.numeric(et))
  }

  return(res)
}

#' Perform hierarchical clustering for a different combinations of indices,
#' method and distance
#'
#' @param ind_data Dataframe containing indices applied to the original data and
#' its first and second derivatives. See \link{generate_indices}.
#' @param vars_combinations \code{list} containing one or more combinations of indices in
#' \code{ind_data}. If it is non-named, the names of the variables are set to
#' vars1, ..., varsk, where k is the number of elements in \code{vars_combinations}.
#' @param method_list \code{list} of clustering methods.
#' @param dist_vector \code{list} of distance metrics.
#' @param n_cluster number of clusters to generate.
#' @param true_labels Vector of true labels for validation
#' (if it is not known true_labels is set to NULL)
#' @param n_cores Number of cores to do parallel computation. 1 by default,
#' which mean no parallel execution.
#'
#' @return A \code{list} containing hierarchical clustering results
#' for each configuration.
#'
#' @examples
#' vars1 <- c("dtaEI", "dtaMEI")
#' vars2 <- c("dtaHI", "dtaMHI")
#' data <- ehymet::sim_model_ex1()
#' data_ind <- generate_indices(data)
#' clustInd_hierarch(data_ind, list(vars1, vars2))
#'
#' @export
clustInd_hierarch <- function(ind_data, vars_combinations,
                              method_list = c(
                                "single", "complete", "average",
                                "centroid", "ward.D2"
                              ),
                              dist_vector = c("euclidean", "manhattan"),
                              n_cluster = 2, true_labels = NULL,
                              n_cores = 1) {
  # Check if input is a data frame
  if (!is.data.frame(ind_data)) {
    stop("input 'ind_data' must be a data frame.", call. = FALSE)
  }

  if (!is.list(vars_combinations)) {
    stop("input 'vars_combinations' must be a list.", call. = FALSE)
  }

  n_cores <- check_n_cores(n_cores)

  # Check for correct vars combinations
  vars_combinations_to_remove <- check_vars_combinations(vars_combinations, ind_data)

  if (length(vars_combinations_to_remove)) {
    vars_combinations <- vars_combinations[-vars_combinations_to_remove]
  }

  # Check if indices, methods and distances lists are provided
  if (!is.character(method_list) ||
    !is.character(dist_vector) || length(vars_combinations) == 0 ||
    length(method_list) == 0 || length(dist_vector) == 0) {
    stop("invalid 'method_list' or 'dist_vector'. Both must be non-empty character vectors.", call. = FALSE)
  }

  if (is.null(names(vars_combinations))) {
    names(vars_combinations) <- paste0("vars", seq_along(vars_combinations))
  }

  # Generate all the possible combinations of indices, methods and distances
  parameter_combinations <- expand.grid(
    vars = names(vars_combinations),
    method = method_list, distance = dist_vector
  )

  tl_null <- is.null(true_labels)

  n_comb <- nrow(parameter_combinations)

  result <- parallel::mclapply(1:n_comb, function(i) {
    vars <- vars_combinations[[parameter_combinations$vars[i]]]
    met <- parameter_combinations$method[i]
    dist <- parameter_combinations$distance[i]

    clustInd_hierarch_aux(ind_data, vars, met, dist, n_cluster, true_labels)
  }, mc.cores = n_cores)

  result_name <- get_result_names("hierarch", parameter_combinations, vars_combinations)
  names(result) <- result_name

  return(result)
}


#' k-means clustering with Mahalanobis distance
#'
#' @param ind_data Dataframe containing indices applied to the original data and
#' its first and second derivatives.
#' @param n_cluster Number of clusters to create.
#'
#' @return k-means clustering with Mahalanobis distance output
#'
#' @noRd
kmeans_mahal <- function(ind_data, n_cluster) {
  # Check if input is numeric matrix or array
  if (!(is.numeric(ind_data) || is.matrix(ind_data) || is.array(ind_data) ||
    is.data.frame(ind_data))) {
    stop("Input must be a numeric matrix, array or data frame.")
  }

  # Convert data to matrix
  if (!is.matrix(ind_data)) {
    data_matrix <- as.matrix(ind_data)
  } else {
    data_matrix <- ind_data
  }

  # Cholesky transformation
  c <- chol(stats::var(data_matrix))
  data_transform <- data_matrix %*% solve(c)

  # vector containing the clustering partition
  km <- stats::kmeans(data_transform,
    centers = n_cluster, iter.max = 1000,
    nstart = 100
  )$cluster
  return(km)
}

#' Perform kmeans clustering for a given combination of indices and distance
#'
#' @param ind_data Dataframe containing indices applied to the original data and
#' its first and second derivatives. See \link{generate_indices}.
#' @param vars vector with a combinations of indices in \code{ind_data}
#' @param dist The distance method to be used.
#' @param n_cluster Number of clusters to create.
#' @param true_labels Vector of true labels for validation.
#' (if it is not known true_labels is set to NULL)
#'
#' @return A list containing clustering results and execution time.
#'
#' @noRd
clustInd_kmeans_aux <- function(ind_data, vars, dist = "euclidean",
                                n_cluster = 2, true_labels = NULL) {
  # Check if input is a data frame
  if (!is.data.frame(ind_data)) {
    stop("Input 'ind_data' must be a data frame.")
  }

  # Check if variables exist in the data frame
  if (!all(vars %in% names(ind_data))) {
    stop("Invalid variable names.")
  }

  # Check if the distance given can be used
  if (!dist %in% c("euclidean", "mahalanobis")) {
    stop("Invalid distance.")
  }

  t0 <- Sys.time()

  if (dist == "euclidean") {
    clus <- stats::kmeans(ind_data[, vars],
      centers = n_cluster,
      iter.max = 1000, nstart = 100
    )$cluster
  } else {
    clus <- kmeans_mahal(ind_data[, vars], n_cluster)
  }
  t1 <- Sys.time()
  t <- data.frame(difftime(t1, t0, "secs"))

  if (is.null(true_labels)) {
    res <- list("cluster" = clus, "time" = as.numeric(t))
  } else {
    valid <- clustering_validation(true_labels, clus)
    res <- list("cluster" = clus, "valid" = valid, "time" = as.numeric(t))
  }
  return(res)
}

#' K-means clustering with indices
#'
#' Perform k-means clustering for a different combinations of indices and
#' distances.
#'
#' @param ind_data Dataframe containing indices applied to the original data and
#' its first and second derivatives. See \link{generate_indices}.
#' @param vars_combinations \code{list} containing one or more combinations of indices in
#' \code{ind_data}. If it is non-named, the names of the variables are set to
#' vars1, ..., varsk, where k is the number of elements in \code{vars_combinations}.
#' @param dist_vector Atomic vector of distance metrics. The possible values are,
#' "euclidean", "mahalanobis" or both.
#' @param n_cluster Number of clusters to create.
#' @param true_labels Vector of true labels for validation.
#' (if it is not known true_labels is set to NULL)
#' @param n_cores Number of cores to do parallel computation. 1 by default,
#' which mean no parallel execution.
#' @return A list containing hierarchical clustering results
#' for each configuration
#'
#' @return A list containing kmeans clustering results for each configuration
#'
#' @examples
#' vars1 <- c("dtaEI", "dtaMEI")
#' vars2 <- c("dtaHI", "dtaMHI")
#' data <- ehymet::sim_model_ex1()
#' data_ind <- generate_indices(data)
#' clustInd_kmeans(data_ind, list(vars1, vars2))
#'
#' @export
clustInd_kmeans <- function(ind_data, vars_combinations,
                            dist_vector = c("euclidean", "mahalanobis"),
                            n_cluster = 2, true_labels = NULL,
                            n_cores = 1) {
  # Check if input is a data frame
  if (!is.data.frame(ind_data)) {
    stop("Input 'ind_data' must be a data frame.", call. = FALSE)
  }

  if (!is.list(vars_combinations)) {
    stop("Input 'vars_combinations' must be a list.", call. = FALSE)
  }

  n_cores <- check_n_cores(n_cores)

  DIST_VECTOR <- c("euclidean", "mahalanobis")
  check_list_parameter(dist_vector, DIST_VECTOR, "dist")

  # Check for correct vars combinations
  vars_combinations_to_remove <- check_vars_combinations(vars_combinations, ind_data)

  if (length(vars_combinations_to_remove)) {
    vars_combinations <- vars_combinations[-vars_combinations_to_remove]
  }

  # Check if indices, methods and distances lists are provided
  if (length(vars_combinations) == 0 || length(dist_vector) == 0) {
    stop("Invalid 'vars_combinations' or 'dist_vector'. Both must be non-empty
         character vectors.", call. = FALSE)
  }

  if (is.null(names(vars_combinations))) {
    names(vars_combinations) <- paste0("vars", seq_along(vars_combinations))
  }

  # Generate all the possible combinations of indices, methods and distances
  parameter_combinations <- expand.grid(
    vars = names(vars_combinations),
    distance = dist_vector
  )

  tl_null <- is.null(true_labels)

  n_comb <- nrow(parameter_combinations)

  result <- parallel::mclapply(1:n_comb, function(i) {
    vars <- vars_combinations[[parameter_combinations$vars[i]]]
    dist <- parameter_combinations$distance[i]

    clustInd_kmeans_aux(
      ind_data = ind_data, vars = vars, dist = dist,
      n_cluster = n_cluster, true_labels = true_labels
    )
  }, mc.cores = n_cores)

  result_name <- get_result_names("kmeans", parameter_combinations, vars_combinations)
  names(result) <- result_name

  return(result)
}

#' Perform kernel kmeans clustering for a given combination of indices
#' and distance.
#'
#' @param ind_data Dataframe containing indices applied to the original data and
#' its first and second derivatives. See \link{generate_indices}.
#' @param vars vector with a combinations of indices in \code{ind_data}
#' @param kernel The kernel method to be used
#' @param n_cluster Number of clusters to create
#' @param true_labels Vector of true labels for validation
#' (if it is not known true_labels is set to NULL)
#'
#' @return A list containing clustering results and execution time.
#'
#' @noRd
clustInd_kkmeans_aux <- function(ind_data, vars, kernel = "rbfdot",
                                 n_cluster = 2, true_labels = NULL, ...) {
  # Check if input is a data frame
  if (!is.data.frame(ind_data)) {
    stop("Input 'ind_data' must be a data frame.")
  }

  # Check if variables exist in the data frame
  if (!all(vars %in% names(ind_data))) {
    stop("Invalid variable names.")
  }

  # Check if the kernel given can be used
  if (!kernel %in% c("rbfdot", "polydot")) {
    stop("Invalid kernel.")
  }


  t0 <- Sys.time()
  kkmeans_out <- kernlab::kkmeans(as.matrix(ind_data[, vars]),
    centers = n_cluster, kernel = kernel, ...
  )
  clus <- kkmeans_out@.Data
  t1 <- Sys.time()
  t <- data.frame(difftime(t1, t0, "secs"))

  if (is.null(true_labels)) {
    res <- list("cluster" = clus, "time" = as.numeric(t))
  } else {
    valid <- clustering_validation(true_labels, clus)
    res <- list("cluster" = clus, "valid" = valid, "time" = as.numeric(t))
  }
  return(res)
}

#' Kernel k-means clustering using indices
#'
#' Perform kernel kmeans clustering for a different combinations of indices
#' and kernel
#'
#' @param ind_data Dataframe containing indices applied to the original data and
#' its first and second derivatives. See \link{generate_indices}.
#' @param vars_combinations \code{list} containing one or more combinations of indices in
#' \code{ind_data}. If it is non-named, the names of the variables are set to
#' vars1, ..., varsk, where k is the number of elements in \code{vars_combinations}.
#' @param kernel_list List of kernels
#' @param n_cluster Number of clusters to create
#' @param true_labels Vector of true labels for validation
#' (if it is not known true_labels is set to NULL)
#' @param n_cores Number of cores to do parallel computation. 1 by default,
#' which mean no parallel execution.
#'
#' @return A \code{list} containing kernel-kmeans clustering results for each configuration.
#'
#' @examples
#' vars1 <- c("dtaEI", "dtaMEI")
#' vars2 <- c("dtaHI", "dtaMHI")
#' data <- ehymet::sim_model_ex1()
#' data_ind <- generate_indices(data)
#' clustInd_kkmeans(data_ind, list(vars1, vars2))
#'
#' @export
clustInd_kkmeans <- function(ind_data, vars_combinations,
                             kernel_list = c("rbfdot", "polydot"),
                             n_cluster = 2, true_labels = NULL,
                             n_cores = 1) {
  # Check if input is a data frame
  if (!is.data.frame(ind_data)) {
    stop("Input 'ind_data' must be a data frame.", call. = FALSE)
  }

  if (!is.list(vars_combinations)) {
    stop("Input 'vars_combinations' must be a list.", call. = FALSE)
  }

  n_cores <- check_n_cores(n_cores)

  # Check for correct vars combinations
  vars_combinations_to_remove <- check_vars_combinations(vars_combinations, ind_data)

  if (length(vars_combinations_to_remove)) {
    vars_combinations <- vars_combinations[-vars_combinations_to_remove]
  }

  # Check if indices, and kernel lists are provided
  if (!is.character(kernel_list) || length(vars_combinations) == 0 || length(kernel_list) == 0) {
    stop("Invalid 'kernel_list' or 'vars_combinations'. Both must be non-empty
         character vectors.", call. = FALSE)
  }

  if (is.null(names(vars_combinations))) {
    names(vars_combinations) <- paste0("vars", seq_along(vars_combinations))
  }

  # Generate all the possible combinations of indices, methods and distances
  parameter_combinations <- expand.grid(
    vars = names(vars_combinations),
    kernel = kernel_list
  )

  tl_null <- is.null(true_labels)

  n_comb <- nrow(parameter_combinations)

  result <- parallel::mclapply(1:n_comb, function(i) {
    vars <- vars_combinations[[parameter_combinations$vars[i]]]
    kern <- as.character(parameter_combinations$kernel[i])

    clustInd_kkmeans_aux(ind_data, vars, kern, n_cluster, true_labels)
  }, mc.cores = n_cores)

  result_name <- get_result_names("kkmeans", parameter_combinations, vars_combinations)
  names(result) <- result_name

  return(result)
}

#' Perform spectral clustering for a given combination of indices and kernel
#'
#' @param ind_data Dataframe containing indices applied to the original data and
#' its first and second derivatives. See \link{generate_indices}.
#' @param vars vector with a combinations of indices in \code{ind_data}
#' @param kernel The kernel method to be used
#' @param n_cluster Number of clusters to create
#' @param true_labels Vector of true labels for validation
#' (if it is not known true_labels is set to NULL)
#'
#' @return A list containing clustering results and execution time.
#'
#' @noRd
clustInd_spc_aux <- function(ind_data, vars, kernel = "rbfdot", n_cluster = 2,
                             true_labels = NULL, ...) {
  # Check if input is a data frame
  if (!is.data.frame(ind_data)) {
    stop("Input 'ind_data' must be a data frame.", call. = FALSE)
  }

  # Check if variables exist in the data frame
  if (!all(vars %in% names(ind_data))) {
    stop("Invalid variable names.", call. = FALSE)
  }

  t0 <- Sys.time()

  # create target variable
  nrow_ind_data <- nrow(ind_data)
  specc_output <- kernlab::specc(as.matrix(ind_data[, vars]),
    centers = n_cluster, kernel = kernel, ...
  )
  clus <- specc_output@.Data

  t1 <- Sys.time()
  t <- data.frame(difftime(t1, t0, "secs"))

  if (is.null(true_labels)) {
    res <- list("cluster" = clus, "time" = as.numeric(t))
  } else {
    valid <- clustering_validation(true_labels, clus)
    res <- list("cluster" = clus, "valid" = valid, "time" = as.numeric(t))
  }
  return(res)
}

#' Spectral clustering using indices
#'
#' Perform spectral clustering for a different combinations of indices
#' and kernels
#'
#' @param ind_data Dataframe containing indices applied to the original data and
#' its first and second derivatives. See \link{generate_indices}.
#' @param vars_combinations \code{list} containing one or more combinations of indices in
#' \code{ind_data}. If it is non-named, the names of the variables are set to
#' vars1, ..., varsk, where k is the number of elements in \code{vars_combinations}.
#' @param kernel_list List of kernels
#' @param n_cluster Number of clusters to create
#' @param true_labels Vector of true labels for validation
#' (if it is not known true_labels is set to NULL)
#' @param n_cores Number of cores to do parallel computation. 1 by default,
#' which mean no parallel execution.
#'
#' @return A list containing kkmeans clustering results for each configuration
#'
#' @examples
#' vars1 <- c("dtaEI", "dtaMEI")
#' vars2 <- c("dtaHI", "dtaMHI")
#' data <- ehymet::sim_model_ex1()
#' data_ind <- generate_indices(data)
#' clustInd_spc(data_ind, list(vars1, vars2))
#'
#' @export
clustInd_spc <- function(ind_data, vars_combinations,
                         kernel_list = c("rbfdot", "polydot"),
                         n_cluster = 2, true_labels = NULL,
                         n_cores = 1) {
  # Check if input is a data frame
  if (!is.data.frame(ind_data)) {
    stop("Input 'ind_data' must be a data frame.", call. = FALSE)
  }

  if (!is.list(vars_combinations)) {
    stop("Input 'vars_combinations' must be a data frame.", call. = FALSE)
  }

  n_cores <- check_n_cores(n_cores)

  # Check for correct vars combinations
  vars_combinations_to_remove <- check_vars_combinations(vars_combinations, ind_data)

  if (length(vars_combinations_to_remove)) {
    vars_combinations <- vars_combinations[-vars_combinations_to_remove]
  }

  # Check if indices, methods and distances lists are provided
  if (!is.character(kernel_list) || length(vars_combinations) == 0 ||
    length(kernel_list) == 0) {
    stop("Invalid 'kernel_list' or 'vars_combinations'. Both must be non-empty
         character vectors.", call. = FALSE)
  }

  if (is.null(names(vars_combinations))) {
    names(vars_combinations) <- paste0("vars", seq_along(vars_combinations))
  }

  # Generate all the possible combinations of indices, methods and distances
  parameter_combinations <- expand.grid(
    vars = names(vars_combinations),
    kernel = kernel_list
  )

  tl_null <- is.null(true_labels)

  n_comb <- nrow(parameter_combinations)

  result <- parallel::mclapply(1:n_comb, function(i) {
    vars <- vars_combinations[[parameter_combinations$vars[i]]]
    kern <- as.character(parameter_combinations$kernel[i])

    clustInd_spc_aux(ind_data, vars, kern, n_cluster, true_labels)
  }, mc.cores = n_cores)

  result_name <- get_result_names("spc", parameter_combinations, vars_combinations)
  names(result) <- result_name

  return(result)
}

Try the ehymet package in your browser

Any scripts or data that you put into this service are public.

ehymet documentation built on June 22, 2024, 10:50 a.m.