tests/replica/replica_hierarchical.R

find_centers <- function(indices, distances) {
  ISCC_GR_NUM_TO_CHECK <- 100L
  step <- max(2L, length(indices) %/% ISCC_GR_NUM_TO_CHECK)
  to_check <- indices[seq(1, min(step * ISCC_GR_NUM_TO_CHECK, length(indices)), step)]
  checked <- to_check
  best <- NULL

  while (length(to_check) > 0) {
    check_dist <- distances[to_check, indices, drop = FALSE]
    candidate <- arrayInd(which.max(t(check_dist)), dim(t(check_dist)))
    cand_tmp <- as.integer(to_check[candidate[1,2]])
    candidate[1,2] <- as.integer(indices[candidate[1,1]])
    candidate[1,1] <- cand_tmp
    if (is.null(best) || (distances[candidate] > distances[best])) {
      best <- candidate
    }
    to_check <- setdiff(indices[unique(apply(check_dist, 1, which.max))], checked)
    checked <- c(checked, to_check)
  }

  best
}


break_cluster <- function(indices,
                          distances,
                          size_constraint,
                          batch_assign) {
  centers <- as.integer(find_centers(indices, distances))
  clusters <- list(centers[1], centers[2])
  unassigned <- setdiff(indices, centers)

  if (sort(distances[centers[1], unassigned])[size_constraint - 1] >=
      sort(distances[centers[2], unassigned])[size_constraint - 1]) {
    clusters[[1]] <- c(clusters[[1]], unassigned[order(distances[centers[1], unassigned])[1:(size_constraint - 1)]])
    unassigned <- setdiff(unassigned, clusters[[1]])
    clusters[[2]] <- c(clusters[[2]], unassigned[order(distances[centers[2], unassigned])[1:(size_constraint - 1)]])
    unassigned <- setdiff(unassigned, clusters[[2]])
  } else {
    clusters[[2]] <- c(clusters[[2]], unassigned[order(distances[centers[2], unassigned])[1:(size_constraint - 1)]])
    unassigned <- setdiff(unassigned, clusters[[2]])
    clusters[[1]] <- c(clusters[[1]], unassigned[order(distances[centers[1], unassigned])[1:(size_constraint - 1)]])
    unassigned <- setdiff(unassigned, clusters[[1]])
  }

  while (length(unassigned) > 0) {
    if (batch_assign) {
      if (length(unassigned) < size_constraint) {
        size_constraint <- length(unassigned)
      }
      if (sort(distances[centers[1], unassigned])[size_constraint] <=
          sort(distances[centers[2], unassigned])[size_constraint]) {
        clusters[[1]] <- c(clusters[[1]], unassigned[order(distances[centers[1], unassigned])[1:size_constraint]])
        unassigned <- setdiff(unassigned, clusters[[1]])
      } else {
        clusters[[2]] <- c(clusters[[2]], unassigned[order(distances[centers[2], unassigned])[1:size_constraint]])
        unassigned <- setdiff(unassigned, clusters[[2]])
      }
    } else {
      if (sort(distances[centers[1], unassigned])[1] <=
          sort(distances[centers[2], unassigned])[1]) {
        clusters[[1]] <- c(clusters[[1]], unassigned[order(distances[centers[1], unassigned])[1]])
        unassigned <- setdiff(unassigned, clusters[[1]])
      } else {
        clusters[[2]] <- c(clusters[[2]], unassigned[order(distances[centers[2], unassigned])[1]])
        unassigned <- setdiff(unassigned, clusters[[2]])
      }
    }
  }

  clusters[[2]] <- rev(clusters[[2]])

  clusters
}


run_hierarchical <- function (num_data_points,
                              cluster_queue,
                              distances,
                              size_constraint,
                              batch_assign) {
  cl_labels <- rep(NA, num_data_points)
  current_label <- 0L
  cluster_queue <- rev(cluster_queue)

  while(length(cluster_queue) > 0) {
    if (length(cluster_queue[[1]]) < 2 * size_constraint) {
      cl_labels[cluster_queue[[1]]] <- current_label
      current_label <- current_label + 1L
      cluster_queue <- cluster_queue[-1]
    } else {
      new_clusters <- break_cluster(cluster_queue[[1]], distances, size_constraint, batch_assign)
      cluster_queue[[1]] <- new_clusters[[1]]
      cluster_queue <- c(new_clusters[2], cluster_queue)
    }
  }

  cl_labels
}


replica_hierarchical_clustering <- function(distances,
                                            size_constraint,
                                            batch_assign = TRUE,
                                            existing_clustering = NULL) {
  ensure_distances(distances)
  num_data_points <- length(distances)
  size_constraint <- coerce_size_constraint(size_constraint, num_data_points)
  ensure_indicators(batch_assign, 1L)
  if (!is.null(existing_clustering)) {
    ensure_scclust(existing_clustering, num_data_points)
  }

  if (!is.null(existing_clustering)) {
    cluster_queue <- lapply(sort(unique(existing_clustering)), function(x) { rev(which(existing_clustering == x)) })
  } else {
    cluster_queue <- list(1:num_data_points)
  }

  new_labels <- run_hierarchical(num_data_points,
                                 cluster_queue,
                                 as.matrix(distances),
                                 size_constraint,
                                 batch_assign)

  make_scclust(new_labels,
               length(unique(new_labels)),
               attr(distances, "ids", exact = TRUE))
}
fsavje/Rscclust documentation built on Jan. 5, 2024, 2:31 a.m.