R/initial_routes.R

Defines functions plot.rb_clustering rb_clustering fix_connectivity resolve_disputes route_clustering

Documented in fix_connectivity plot.rb_clustering rb_clustering resolve_disputes route_clustering

#' Perform hierarchical clustering based on initial routes
#'
#' @param p_inst
#' @param init_routes a list of initial routes obtained from the `initial_route2` function
#' @param k the number of agents
#' @param measure what dissimilarity measure to use, can be either "element-based" or "position-based". Defaults to "position-based".
#' @param weigthed
#'
#' @return a list containing the `hclust` object and a vector of the clusters
#' @export
#'
route_clustering <- function(p_inst, init_routes, k, measure = "position-based", weigthed = T, order_preserving = F) {
  # dissimilarity functions
  compute_dissimilarity <- function(i,j) {
      nodes_i <- unique(init_routes[[i]]$route)
      nodes_j <- unique(init_routes[[j]]$route)
      difference <- setdiff(nodes_i, nodes_j)
      return(length(difference))
    }
  position_dissimilarity <- function(ids) {
      # Determine the longest route
      routes <- list(init_routes[[ids[1]]]$route, init_routes[[ids[2]]]$route)
      longest_route <- do.call(c, lapply(routes, length)) |> which.max()

      if (!order_preserving) {
        mean_dist <- sapply(
          routes[[longest_route]],
          function(node_id) {
            p_inst$dst[node_id, unique(routes[-longest_route][[1]])] |> min()
          }
        ) |> mean()
      } else {
        stop("Order preserving not implemented")
      }

      # beta determines if the realized score of the two routes are close to each other
      beta <- 1/abs((init_routes[[ids[1]]]$realized_score - init_routes[[ids[2]]]$realized_score))

      return(c("mean_dist" = mean_dist, "beta" = beta))
    }

  if (measure == "element-based") {

    n <- length(init_routes)
    dissimilarity <- matrix(nrow = n, ncol = n)
    for (i in 1:n) {
      for (j in 1:n) {
        dissimilarity[i,j] <- compute_dissimilarity(i,j)
      }
    }
  } else if (measure == "position-based") {

    # find all combinations of routes and compute dissimilarity
    combinations <- utils::combn(1:length(init_routes), 2, simplify = F)
    dis <- pbapply::pblapply(combinations, position_dissimilarity)

    # create matrix to hold results
    dissimilarity <- matrix(data = 0, nrow = length(init_routes), ncol = length(init_routes))

    # post processing to weight dissimilarities

    # mu is the mean realized score for the initial routes
    mu <- mean(sapply(init_routes, function(x) x$realized_score))
    mu_pbd <- mean(sapply(dis, function(x) x["mean_dist"]))
    all_beta <- sapply(dis, function(x) x["beta"])
    max_beta <- max(all_beta[!is.na(all_beta) & is.finite(all_beta)])

    adjustment <- numeric(length = length(combinations))

    # take dissimilarity values from the list and insert into the matrix
    lapply(seq_along(combinations), function(i) {
      ids <- combinations[[i]]

      # alpha determines the ratio of ids[1]'s realized to the mean realized score
      alpha <- init_routes[[ids[1]]]$realized_score/mu

      if (!is.finite(all_beta[i])) beta <- max_beta else beta <- all_beta[i]

      gamma <- dis[[i]]["mean_dist"] - mu_pbd

      adjustment[i] <<- alpha*beta*gamma

      if (weigthed) {
        dissimilarity[ids[1], ids[2]] <<- dis[[i]]["mean_dist"] * alpha * beta * gamma
      } else {
        dissimilarity[ids[1], ids[2]] <<- dis[[i]]["mean_dist"]
      }
    })

    # mirror the upper part of the matrix into the lower part
    dissimilarity <- dissimilarity + t(dissimilarity)
  }

  hc <- stats::hclust(as.dist(dissimilarity))
  cluster <- cutree(hc, k)
  return(
    list("hclust" = hc, "cutree" = cluster)
  )
}

#' Resolve disputes so each node are only assigned one cluster
#'
#' @param init_routes a list of initial routes obtained from the `initial_route2` function
#' @param cluster a vector of clusters obtained from the `route_clustering` function
#' @param obj what objective to use when solving the disputes, currently there are "most_frequent" and "highest_score"
#'
#' @return list containing the zones as well as the zones of only undisputed nodes
#' @export
#'
resolve_disputes <- function(init_routes, cluster, obj = "most_frequent") {
  k <- length(unique(cluster))

  route_info <- tibble::tibble(
    id = lapply(init_routes, function(x) x$route),
    realized_score = do.call(c, lapply(init_routes, function(x) x$realized_score)),
    expected_score = do.call(c, lapply(init_routes, function(x) x$expected_score)),
    cluster,
    route_id = 1:length(init_routes)
  )

  route_count <- route_info |>
    dplyr::group_by(cluster) |>
    dplyr::summarise(n_route = dplyr::n_distinct(route_id))

  route_score <- route_info |>
    tidyr::unnest(cols = id) |>
    dplyr::distinct() |> # only one node id per route
    dplyr::group_by(id, cluster) |>
    dplyr::summarise(mean_expected_score = mean(expected_score),
                     mean_realized_score = mean(realized_score),
                     n_route_id = dplyr::n_distinct(route_id))

  node_usage <- route_info |>
    tidyr::unnest(cols = id) |>
    dplyr::group_by(id, cluster) |>
    dplyr::summarise(n = dplyr::n()) |> # number of times the node id is used in the cluster
    dplyr::left_join(route_count, by = c("cluster")) |>
    dplyr::mutate(n = n / n_route) |> # usage adjusted for number of routes
    dplyr::left_join(route_score, by = c("id", "cluster")) |>
    dplyr::summarise(num_cluster_use = dplyr::n_distinct(cluster),
                     most_frequent = dplyr::first(cluster, order_by = -n),
                     highest_mean_realized_score = dplyr::first(cluster, order_by = -mean_realized_score)) |>
    dplyr::mutate(disputed = ifelse(num_cluster_use > 1, 1, 0)) |>
    dplyr::ungroup()

  # Construct the zones based the node_usage tibble
  zones_undisputed <- list() # kept for illustrative purposes
  zones <- list()

  for (i in 1:k) {
    # add the most frequent undisputed points to each zone
    filtered_nodes <- node_usage |>
      purrr::when(obj == "most_frequent" ~ dplyr::filter(., most_frequent == i),
                  obj == "highest_score" ~ dplyr::filter(., highest_mean_realized_score == i),
                  ~ stop(paste0("obj: ", obj, " not recognized in resolve_disputes()")))

    ids_undisputed <- filtered_nodes |>
      dplyr::filter(disputed == 0) |>
      dplyr::pull(id)

    ids <- filtered_nodes |>
      dplyr::pull(id)

    zones_undisputed[[i]] <- unique(c(1, ids_undisputed))
    zones[[i]] <- unique(c(1, ids))
  }

  return(
    list("zones" = zones, "zones_undisputed" = zones_undisputed)
  )
}

#' Fix connectivity in zones
#'
#' @param inst
#' @param zones
#' @param available_nodes
#'
#' @return
#' @export
#'
fix_connectivity <- function(inst, zones, available_nodes = integer()) {
  k <- length(zones)

  # function to determine if a zone is connected
  connected <- function(zone) {
    # zone <- zones[[1]]
    sub_g <- igraph::induced_subgraph(inst$g, vids = zone)
    igraph::is_connected(sub_g, mode = "weak") # check for undirected path between pairs of vertices
  }

  # Check the connectedness of each zone
  are_connected <- do.call(c, lapply(zones, connected))

  if (!all(are_connected)) {
    # de-zone points that are not connected to the source through its own zone
    # available_nodes <- integer()
    if (length(available_nodes) > 0) stop("not all zones are connected but available nodes was provided. Please fix the connectivity issue before providing available nodes.")

    for (i in 1:k) {
      # Make the sub graph induced by each zone
      sub_g <- igraph::induced_subgraph(inst$g, vids = zones[[i]])
      if (!igraph::is_connected(sub_g, mode = "weak")) {
        # The problem is the nodes that are not connected to the source,
        # nodes that are not connected to the source will have distance == Inf
        temp_dst <- igraph::distances(sub_g, v = 1) # calculate distances
        unconnected <- igraph::V(sub_g)[temp_dst == Inf] |> # get the node id for nodes with distance == Inf
          names() |> as.integer()

        # Idea: discard the nodes from the zone and see if we can add them to another zone
        available_nodes <- append(available_nodes, unconnected)
        zones[[i]] <- zones[[i]][!zones[[i]] %in% unconnected]
      }
    }
  }

  # save number of nodes with connectivity issues
  num_issues <- length(available_nodes)

  # make function to evaluate each zone based on objective
  avg_dist <- function(zone) {
    # subset the distance matrix (of shortest paths) to only nodes in the zone
    dst_temp <- inst$dst[zone, zone]
    # return the mean distance between nodes in the zone
    mean(dst_temp[lower.tri(dst_temp, diag = F)])
  }
  # do.call(sum, lapply(zones, avg_dist))

  if (!all(are_connected) & (length(available_nodes) == 0)) {
    num_issues <- 0
    return(
      list("zones" = zones, "num_issues" = num_issues)
    )
  }

  while (length(available_nodes) > 0) {
    # Check the result of inserting an available node into each zone
    candidates <- list(); l = 0

    for (i in available_nodes) {
      for (j in 1:k) {
        # cat("i is:", i, "j is:", j, "\n")
        l = l + 1
        # add available node to the zone
        temp_zones <- zones
        temp_zones[[j]] <- append(temp_zones[[j]], i)

        # see if the zone is connected
        sub_g <- igraph::induced_subgraph(inst$g, vids = temp_zones[[j]])
        connected <- igraph::is_connected(sub_g, mode = c("weak"))

        # calculate the average distance
        if (connected) {
          zone_avg_dist <- avg_dist(temp_zones[[j]])
        } else {
          zone_avg_dist <- NA
        }

        candidates[[l]] <- tibble::tibble(
          available_node = i, zone = j, is_connected = connected, avg_dist = zone_avg_dist
        )
      }
    }

    best_candidate <- do.call(dplyr::bind_rows, candidates) |>
      dplyr::filter(is_connected == T) |>
      dplyr::filter(avg_dist == min(avg_dist))

    if (nrow(best_candidate) < 1) {
      warning("there are no best candidate")
      # TODO: handle no candidate is_connected
    }

    zones[[best_candidate$zone]] <- append(
      zones[[best_candidate$zone]], best_candidate$available_node
    )
    available_nodes <- available_nodes[available_nodes != best_candidate$available_node]
  }

  return(
    list("zones" = zones, "num_issues" = num_issues)
  )
}

#' Routing based clustering
#'
#' @param p_inst
#' @param L
#' @param k
#' @param info
#' @param dispute_obj
#' @param measure
#' @param weighted
#' @param shiny
#'
#' @return
#' @export
#'
rb_clustering <- function(p_inst, L, k, num_routes, info, top_percentile = .15,  dispute_obj = "most_frequent", measure = "position-based", weigthed = T,  shiny = F) {
  # inst = test_instances$p7_chao; L = 100; k = 3; variances = generate_variances(inst = inst); info = generate_information(inst, r = 20); p_inst <- prepare_instance(inst, variances, info); dispute_obj = "most_frequent"; shiny = F; num_routes = 100; measure = "position-based"; weigthed = T; set.seed(3)

  # First we generate the initial routes
  message("Construct the initial routes")
  suppressMessages(
    init_routes <- 1:num_routes |> as.list() |> pbapply::pblapply(function(x) {if (shiny) shiny::incProgress(amount = 1/num_routes); initial_route2(p_inst, L, info, top_percentile = top_percentile)})
  )

  # Then we perform the route clustering
  message("Performing the clustering")
  rc <- route_clustering(p_inst, init_routes, k, measure = measure, weigthed = weigthed); cluster <- rc$cutree

  # Next is assignment of the disputed points
  suppressMessages(
    {rd <- resolve_disputes(init_routes, cluster, obj = dispute_obj); zones <- rd$zones}
  )

  # Lastly we fix any issues with connectivity
  message("Fixing any connectivity issues")
  fc <- fix_connectivity(inst, zones); zones <- fc$zones; num_issues <- fc$num_issues
  message(paste0("number of issues fixed: ", num_issues))

  # Check if zones are connected through the source
  spans_source <- function(zone) {
    sub_g <- igraph::induced_subgraph(inst$g, vids = zone[zone != 1])
    igraph::is_connected(sub_g, mode = "weak")
  }
  pass <- do.call(c, lapply(zones, spans_source))

  if (any(!pass)) {# handle the zone(s) that span the source
    warning("A zone spans across the source, trying to re-zone one half of the spanning zone")
    problems <- which(!pass)

    if (length(problems) > 1) stop("More than one zone spans the source")

    zone <- zones[[problems[1]]]

    # Find out how many subclusters there are
    sub_g <- igraph::induced_subgraph(inst$g, vids = zone[zone != 1])
    decomp <- igraph::clusters(sub_g)

    if (decomp$no != 2) stop("There are not two sub clusters in the zone that spans the source")

    # dezone the nodes from the smallest sub cluster
    available_nodes <- decomp$membership[
      decomp$membership == which.min(decomp$csize)
    ] |> names() |> as.integer()
    zones[[problems[1]]] <- zones[[problems[1]]][!zones[[problems[1]]] %in% available_nodes]

    message(paste0("number of nodes rezoned: ", min(decomp$csize)))

    fc <- fix_connectivity(inst, zones, available_nodes); zones <- fc$zones
  }

  if (any(sapply(zones, function(x) length(x) < 5))) stop("Insufficient number of zones generated or a zone have insufficient number of nodes")

  # if (any(sapply(zones, function(x) identical(x, c(1))))) stop("Insufficient number of zones generated")

  structure(
    list(
      "zones" = zones,
      "num_issues" = num_issues,
      "p_inst" = p_inst,
      "info" = info,
      "spanning_source" = any(!pass)
    ),
    class = "rb_clustering"
  )
}

#' Plot the result from routing-based clustering
#'
#' @param rb_clust
#'
#' @return
#' @export
#'
plot.rb_clustering <- function(rb_clust) {
  # inst = test_instances$p7_chao; L = 100; k = 3; variances = generate_variances(inst = inst); info = generate_information(inst, r = 20); rb_clust <- rb_clustering(inst, L, k, num_routes = 100, info)
  p_inst <- rb_clust$p_inst

  temp <- tibble::tibble(id = rb_clust$zones, agent_id = 1:length(rb_clust$zones)) |>
    tidyr::unnest(cols = id)

  same_zone_edges <- p_inst$edges |>
    dplyr::left_join(
      temp |> dplyr::rename(zone = agent_id),
      by = c("ind1" = "id")
    ) |>
    dplyr::left_join(
      temp |> dplyr::rename(zone = agent_id),
      by = c("ind2" = "id")
    ) |>
    dplyr::filter((zone.x == zone.y) | (zone.x == 0) | (zone.y == 0)) |>
    dplyr::mutate(zone = ifelse(zone.x == 0, zone.y, zone.x)) |>
    dplyr::select(-c(zone.x,zone.y))

  ggplot2::ggplot() +
    ggplot2::geom_segment(
      data = same_zone_edges,
      ggplot2::aes(x = x1, y = y1, xend = x2, yend = y2),
      color = ggplot2::alpha("black", 0.3), linetype = "dashed"
    ) +
    ggplot2::geom_point(
      data = p_inst$points |> dplyr::filter(point_type == "terminal"),
      ggplot2::aes(x, y), color = "red", shape = 17
    ) +
    ggplot2::geom_point(
      data = p_inst$points |>
        dplyr::filter(point_type == "intermediate") |>
        dplyr::left_join(temp, by = c("id")),
      ggplot2::aes(x, y, color = as.character(agent_id), size = score, alpha = score_variance)
    ) +
    # ggplot2::scale_color_manual(values = c("black", scales::hue_pal()(k))) +
    ggplot2::ggtitle(paste0("Instance: ", p_inst$name)) +
    ggplot2::theme_bw() +
    ggplot2::guides(
      shape = "none",
      fill = "none",
      color = "none",
      alpha = "none",
      size = "none"
    )
}
Rosenkrands/dz documentation built on June 26, 2022, 10:43 p.m.