make_knn_clust: Make clusters using mstknnclust::mst.knn

View source: R/make_knn_clust.R

make_knn_clustR Documentation

Make clusters using mstknnclust::mst.knn

Description

Make clusters using mstknnclust::mst.knn

Usage

make_knn_clust(
  dist_mat_bio,
  clust_col = "clust",
  clust_col_out = "cluster",
  sites = NULL,
  ...
)

Arguments

dist_mat_bio

matrix. usually as.matric(distance object)

clust_col

Character. Name to give column representing each of groups

clust_col_out

Character. Name to give column representing groups in words.

sites

dataframe of sites represented by the distance object

...

passed to mstknnclust::mst.knn() (thus, really just suggested_k).

Value

List retruned by mstknnclust::mst.knn() with an additional clusters object as sites augmented with their associated cluster.

Examples


# data from: https://www.davidzeleny.net/anadat-r/doku.php/en:data:dune
bio_wide <- read.delim('https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/dune2.spe.txt', row.names = 1) |>
  tibble::as_tibble() |>
  dplyr::mutate(site_id = dplyr::row_number())

bio_long <- bio_wide |>
  tidyr::pivot_longer(cols = - site_id)

sites <- read.delim('https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/dune2.env.txt', row.names = 1) |>
  tibble::as_tibble() |>
  dplyr::mutate(site_id = dplyr::row_number()
                , Moisture = forcats::fct_relevel(as.character(Moisture), "1", "2", "4", "5")
                )

dist_bio <- vegan::vegdist(bio_wide)
dist_bio_mat <- as.matrix(dist_bio)

dist_env <- dist(sites |>
                   dplyr::select(where(is.numeric))
                 )

possible_groups <- 2:(nrow(bio_wide) * 3 / 4)

use_methods <- tibble::tibble(method = c("mcquitty", "average", "geo", "geo")
                              , alpha = c(NA, NA,  0.1, 0.3)
                              )

# make dendograms ------
dends <- use_methods |>
  dplyr::mutate(dend = purrr::map2(method
                                   , alpha
                                   , \(x, y) make_dend(method = x
                                                       , dist_bio = dist_bio
                                                       , dist_env = dist_env
                                                       , geo_alpha = y
                                                       )
                                   )
                )

# knn clusters ---------
knn_clusters <- tibble::tibble(method = "knn"
                                 , groups = possible_groups
                                 ) |>
  dplyr::mutate(knn = purrr::map(groups
                                 , \(x) make_knn_clust(dist_bio_mat
                                                       , sites
                                                       , suggested_k = x
                                                       )
                                 )
                , clusters = purrr::map(knn, "clusters")
                , groups = purrr::map_dbl(knn, "cnumber")
                ) |>
  dplyr::group_by(groups) |>
  dplyr::slice(1) |>
  dplyr::ungroup()

# make clusters -----------
clusters <- dends |>
  dplyr::mutate(clusters = purrr::map(dend
                                      , \(x) make_clusters(x
                                                           , group_range = 2:(nrow(bio_wide) / 2)
                                                           , sites = sites
                                                           )
                                      )
                ) |>
  tidyr::unnest(cols = c(clusters)) |>
  dplyr::bind_rows(knn_clusters)

# summarise clusters --------
clusters_summarise <- clusters |>
  dplyr::mutate(summary = purrr::map(clusters, \(x) make_summary_df(x))) |>
  tidyr::unnest(cols = c(summary))

# frequency of taxa --------
clusters_freq <- clusters |>
  dplyr::mutate(freq = purrr::map(clusters, \(x) make_freq_df(clust_df = x
                                                              , bio_df = bio_long
                                                              , context = "site_id"
                                                              )
                                  )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(freq))

# indicator value ---------
clusters_ind_val <- clusters |>
  dplyr::mutate(ind_val = purrr::map(clusters
                                     , \(x) make_ind_val_df(x
                                                            , bio_wide = bio_wide
                                                            , context = "site_id"
                                                            )
                                     )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(ind_val))

# silhouette width ---------
clusters_sil <- clusters |>
  dplyr::mutate(sil = purrr::map(clusters
                                 , \(x) make_sil_df(x
                                                    , dist_obj = dist_bio
                                                    )
                                 )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(sil))


# gap statistic --------
clusters_gap <- clusters |>
  dplyr::mutate(gap = purrr::map(clusters
                                 , \(x) make_gap_df(x
                                                    , dist_mat = dist_bio_mat
                                                    , n_sample = 30
                                                    )
                                 )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(gap))

# explore -------

clusters_explore <- clusters_summarise |>
  dplyr::left_join(clusters_freq) |>
  dplyr::left_join(clusters_ind_val) |>
  dplyr::left_join(clusters_sil) |>
  dplyr::left_join(clusters_gap)

# find a good clustering ---------
best <- clusters_explore |>
  ## min_clust_size is larger than x sites
  dplyr::filter(min_clust_size > 1) |>
  ## large av clust size
  dplyr::filter(av_clust_size > quantile(av_clust_size, probs = 0.5)) |>
  ## high macro_sil
  dplyr::filter(macro_sil > quantile(macro_sil, probs = 0.5)) |>
  ## high macro_gap
  dplyr::filter(macro_gap > quantile(macro_gap, probs = 0.5))

## dendogram --------
dend <- decorate_dend(clust_df = best$clusters[[1]]
                      , dend = best$dend[[1]]
                      , second_group_col = "Moisture"
                      , label_col = "Moisture"
                      )

plot(dend$dend)

## silhouette plot ---------

make_sil_plot(best$sil[[1]])

Acanthiza/envCluster documentation built on July 16, 2025, 10:17 p.m.