tests/replica/replica_nng.R

assign_by_seed <- function(seeds, nng) {
  cl_label <- rep(as.integer(NA), ncol(nng))
  current_label <- 0

  invisible(lapply(seeds, function(i) {
    cl_label[nng[, i]] <<- current_label
    current_label <<- current_label + 1
  }))

  cl_label
}


match_n_assign <- function(cl_label,
                           match,
                           unassigned,
                           radius,
                           distances) {
  if (!is.null(radius)) {
    unassigned[unassigned] <- (apply(distances[unassigned, match, drop = FALSE], 1, sort)[1, ] <= radius)
  }
  if (sum(unassigned) > 0) {
    cl_label[unassigned] <- cl_label[match[apply(distances[unassigned, match, drop = FALSE], 1, order)[1, ]]]
  }
  cl_label
}


any_neighbor_or_closest_assigned <- function(unassigned_method,
                                             unassigned,
                                             nng,
                                             assigned,
                                             radius,
                                             cl_label,
                                             distances) {
  if (unassigned_method == "any_neighbor") {
    unassigned <- apply(nng, 2, any) & unassigned
  }
  if (any(unassigned)) {
    cl_label <- match_n_assign(cl_label, which(assigned), unassigned, radius, distances)
  }
  cl_label
}


assign_unassigned <- function(distances,
                              cl_label,
                              seeds,
                              nng,
                              unassigned_method,
                              radius,
                              primary_data_points,
                              secondary_unassigned_method,
                              secondary_radius) {
  assigned <- !is.na(cl_label)
  unassigned <- !assigned
  if (!is.null(primary_data_points)) {
    unassigned <- unassigned & primary_data_points
  }

  if (any(unassigned)) {
    if (unassigned_method == "ignore") {
      # nothing
    } else if (unassigned_method == "any_neighbor" || unassigned_method == "closest_assigned") {
      cl_label <- any_neighbor_or_closest_assigned(unassigned_method,
                                                   unassigned,
                                                   nng,
                                                   assigned,
                                                   radius,
                                                   cl_label,
                                                   distances)
    } else if (unassigned_method == "closest_seed") {
      cl_label <- match_n_assign(cl_label, seeds, unassigned, radius, distances)
    } else {
      stop("Unknown options.")
    }
  }

  unassigned <- is.na(cl_label)
  if (any(unassigned)) {
    if (secondary_unassigned_method == "ignore") {
      # nothing
    } else if (secondary_unassigned_method == "closest_assigned") {
      cl_label <- match_n_assign(cl_label, which(assigned), unassigned, secondary_radius, distances)
    } else if (secondary_unassigned_method == "closest_seed") {
      cl_label <- match_n_assign(cl_label, seeds, unassigned, secondary_radius, distances)
    } else {
      stop("Unknown options.")
    }
  }

  cl_label
}


est_average_seed_dist <- function(distances,
                                  nng,
                                  seeds) {
  if (length(seeds) > 1000) stop("Too many seeds for replica.")
  diag(nng) <- FALSE
  mean(unlist(lapply(seeds, function(i) {
    mean(distances[i, nng[, i]])
  })))
}


replica_nng_clustering <- function(distances,
                                   size_constraint,
                                   seed_method = "exclusion_updating",
                                   unassigned_method = "closest_seed",
                                   radius = NULL,
                                   primary_data_points = NULL,
                                   secondary_unassigned_method = "ignore",
                                   secondary_radius = NULL) {
  ensure_distances(distances)
  num_data_points <- length(distances)
  size_constraint <- coerce_size_constraint(size_constraint, num_data_points)
  seed_method <- coerce_args(seed_method, all_seed_methods)
  unassigned_method <- coerce_args(unassigned_method,
                                   c("ignore",
                                     "any_neighbor",
                                     "closest_assigned",
                                     "closest_seed",
                                     "estimated_radius_closest_seed"))
  radius <- coerce_radius(radius)
  if (is.null(primary_data_points)) {
    secondary_unassigned_method <- "ignore"
  } else {
    ensure_indicators(primary_data_points, num_data_points, TRUE)
  }
  secondary_unassigned_method <- coerce_args(secondary_unassigned_method,
                                             c("ignore",
                                               "closest_assigned",
                                               "closest_seed",
                                               "estimated_radius_closest_seed"))
  secondary_radius <- coerce_radius(secondary_radius)

  distances <- as.matrix(distances)
  nng <- get_simple_nng(distances,
                        size_constraint,
                        radius,
                        primary_data_points)
  seeds <- findseeds(nng, seed_method)
  cl_label <- assign_by_seed(seeds, nng)

  if (unassigned_method == "estimated_radius_closest_seed") {
    unassigned_method <- "closest_seed"
    radius <- est_average_seed_dist(distances, nng, seeds)
  }
  if (secondary_unassigned_method == "estimated_radius_closest_seed") {
    secondary_unassigned_method <- "closest_seed"
    secondary_radius <- est_average_seed_dist(distances, nng, seeds)
  }

  cl_label <- assign_unassigned(distances,
                                cl_label,
                                seeds,
                                nng,
                                unassigned_method,
                                radius,
                                primary_data_points,
                                secondary_unassigned_method,
                                secondary_radius)

  make_scclust(as.integer(cl_label),
               length(unique(cl_label[!is.na(cl_label)])),
               attr(distances, "ids", exact = TRUE))
}



replica_nng_clustering_types <- function(distances,
                                         type_labels,
                                         type_size_constraints,
                                         total_size_constraint = NULL,
                                         seed_method = "exclusion_updating",
                                         unassigned_method = "closest_seed",
                                         radius = NULL,
                                         primary_data_points = NULL,
                                         secondary_unassigned_method = "ignore",
                                         secondary_radius = NULL) {
  ensure_distances(distances)
  num_data_points <- length(distances)
  type_labels <- coerce_type_labels(type_labels, num_data_points)
  type_size_constraints <- coerce_type_constraints(type_size_constraints)
  type_size_constraints <- make_type_size_constraints(type_size_constraints,
                                                      type_labels)
  total_size_constraint <- coerce_total_size_constraint(total_size_constraint,
                                                        type_size_constraints,
                                                        num_data_points)
  seed_method <- coerce_args(seed_method, all_seed_methods)
  unassigned_method <- coerce_args(unassigned_method,
                                   c("ignore",
                                     "any_neighbor",
                                     "closest_assigned",
                                     "closest_seed",
                                     "estimated_radius_closest_seed"))
  radius <- coerce_radius(radius)
  if (is.null(primary_data_points)) {
    secondary_unassigned_method <- "ignore"
  } else {
    ensure_indicators(primary_data_points, num_data_points, TRUE)
  }
  secondary_unassigned_method <- coerce_args(secondary_unassigned_method,
                                             c("ignore",
                                               "closest_assigned",
                                               "closest_seed",
                                               "estimated_radius_closest_seed"))
  secondary_radius <- coerce_radius(secondary_radius)

  distances <- as.matrix(distances)
  nng <- get_type_nng(distances,
                      type_labels,
                      type_size_constraints,
                      total_size_constraint,
                      radius,
                      primary_data_points)
  seeds <- findseeds(nng, seed_method)
  cl_label <- assign_by_seed(seeds, nng)

  if (unassigned_method == "estimated_radius_closest_seed") {
    unassigned_method <- "closest_seed"
    radius <- est_average_seed_dist(distances, nng, seeds)
  }
  if (secondary_unassigned_method == "estimated_radius_closest_seed") {
    secondary_unassigned_method <- "closest_seed"
    secondary_radius <- est_average_seed_dist(distances, nng, seeds)
  }

  cl_label <- assign_unassigned(distances,
                                cl_label,
                                seeds,
                                nng,
                                unassigned_method,
                                radius,
                                primary_data_points,
                                secondary_unassigned_method,
                                secondary_radius)

  make_scclust(as.integer(cl_label),
               length(unique(cl_label[!is.na(cl_label)])),
               attr(distances, "ids", exact = TRUE))
}
fsavje/scclust-R documentation built on Jan. 5, 2024, 2:34 a.m.