R/find_k.R

Defines functions find_k

Documented in find_k

#' Find the optimal number of synteny blocks (i.e. k)
#'
#' @param clust
#' @param mark_df data frame, a dataframe of markers, e.g. the first element of a map_list object
#' @param max_cluster_range numeric of length 1, the maximum genetic distance a single cluster of markers can span
#'
#' @return a numeri
#' @export
#'
#' @examples
find_k <- function(clust, mark_df, max_cluster_range) {

  # define a function that outputs the maximum range (in x or y) for the clusters defined by any k

  test_k <- function(k){

    mark_df$cluster <- cutree(clust, k)

    cent_df <- mark_df %>%
      group_by(cluster) %>%
      summarise(centroid_x_max = max(map2_posfull), centroid_x_min = min(map2_posfull), centroid_y_max = max(map1_posfull), centroid_y_min = min(map1_posfull))

    return(max(c((cent_df$centroid_x_max - cent_df$centroid_x_min), (cent_df$centroid_y_max - cent_df$centroid_y_min))))

  }

  # find the maximum cluster range for k values that span the number of markers in the maps and
  # identify the smallest k value that yields a cluster range that is below the maximum cut-off set by max_cluster_range
  # (small k values means more markers in each cluster)
  # this is done iteratively, starting with k values that are far apart
  # if there are many markers (>1000), the initially tested k values start even farther apart

  if (nrow(mark_df) > 1000){

    ks_to_test <- seq(500, nrow(mark_df), by = 500)
    approx_k <- ks_to_test[which.min(abs(unlist(lapply(ks_to_test, test_k)) - max_cluster_range))]

    ks_to_test <- pmin(pmax(seq(approx_k - 500, approx_k + 500, by = 50), 1), nrow(mark_df))
    approx_k <- ks_to_test[which.min(abs(unlist(lapply(ks_to_test, test_k)) - max_cluster_range))]

    ks_to_test <- pmin(pmax(seq(approx_k - 50, approx_k + 50, by = 5), 1), nrow(mark_df))
    approx_k <- ks_to_test[which.min(abs(unlist(lapply(ks_to_test, test_k)) - max_cluster_range))]

    ks_to_test <- pmin(pmax(seq(approx_k - 5, approx_k + 5, by = 1), 1), nrow(mark_df))
    k <- ks_to_test[which.min(abs(unlist(lapply(ks_to_test, test_k)) - max_cluster_range))]

  } else {

    ks_to_test <- seq(50, nrow(mark_df), by = 50)
    approx_k <- ks_to_test[which.min(abs(unlist(lapply(ks_to_test, test_k)) - max_cluster_range))]

    ks_to_test <- pmin(pmax(seq(approx_k - 50, approx_k + 50, by = 5), 1), nrow(mark_df))
    approx_k <- ks_to_test[which.min(abs(unlist(lapply(ks_to_test, test_k)) - max_cluster_range))]

    ks_to_test <- pmin(pmax(seq(approx_k - 5, approx_k + 5, by = 1), 1), nrow(mark_df))
    k <- ks_to_test[which.min(abs(unlist(lapply(ks_to_test, test_k)) - max_cluster_range))]

  }

  return(k)

}
ksamuk/syntR documentation built on Feb. 11, 2021, 4:46 a.m.