make_summary_df: Simple summary of a clustering.

View source: R/make_summary_df.R

make_summary_dfR Documentation

Simple summary of a clustering.

Description

Simple summary of a clustering.

Usage

make_summary_df(clust_df, clust_col = "cluster", min_sites = 10)

Arguments

clust_df

Dataframe with context columns and a column with cluster membership for that context.

clust_col

Name of column in clust_df with class membership.

min_sites

Desired minimum absolute number of sites in a class.

Value

single row tibble with summary information

Examples


# data from: https://www.davidzeleny.net/anadat-r/doku.php/en:data:dune
bio_wide <- read.delim('https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/dune2.spe.txt', row.names = 1) |>
  tibble::as_tibble() |>
  dplyr::mutate(site_id = dplyr::row_number())

bio_long <- bio_wide |>
  tidyr::pivot_longer(cols = - site_id)

sites <- read.delim('https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/dune2.env.txt', row.names = 1) |>
  tibble::as_tibble() |>
  dplyr::mutate(site_id = dplyr::row_number()
                , Moisture = forcats::fct_relevel(as.character(Moisture), "1", "2", "4", "5")
                )

dist_bio <- vegan::vegdist(bio_wide)
dist_bio_mat <- as.matrix(dist_bio)

dist_env <- dist(sites |>
                   dplyr::select(where(is.numeric))
                 )

possible_groups <- 2:(nrow(bio_wide) * 3 / 4)

use_methods <- tibble::tibble(method = c("mcquitty", "average", "geo", "geo")
                              , alpha = c(NA, NA,  0.1, 0.3)
                              )

# make dendograms ------
dends <- use_methods |>
  dplyr::mutate(dend = purrr::map2(method
                                   , alpha
                                   , \(x, y) make_dend(method = x
                                                       , dist_bio = dist_bio
                                                       , dist_env = dist_env
                                                       , geo_alpha = y
                                                       )
                                   )
                )

# knn clusters ---------
knn_clusters <- tibble::tibble(method = "knn"
                                 , groups = possible_groups
                                 ) |>
  dplyr::mutate(knn = purrr::map(groups
                                 , \(x) make_knn_clust(dist_bio_mat
                                                       , sites
                                                       , suggested_k = x
                                                       )
                                 )
                , clusters = purrr::map(knn, "clusters")
                , groups = purrr::map_dbl(knn, "cnumber")
                ) |>
  dplyr::group_by(groups) |>
  dplyr::slice(1) |>
  dplyr::ungroup()

# make clusters -----------
clusters <- dends |>
  dplyr::mutate(clusters = purrr::map(dend
                                      , \(x) make_clusters(x
                                                           , group_range = 2:(nrow(bio_wide) / 2)
                                                           , sites = sites
                                                           )
                                      )
                ) |>
  tidyr::unnest(cols = c(clusters)) |>
  dplyr::bind_rows(knn_clusters)

# summarise clusters --------
clusters_summarise <- clusters |>
  dplyr::mutate(summary = purrr::map(clusters, \(x) make_summary_df(x))) |>
  tidyr::unnest(cols = c(summary))

# frequency of taxa --------
clusters_freq <- clusters |>
  dplyr::mutate(freq = purrr::map(clusters, \(x) make_freq_df(clust_df = x
                                                              , bio_df = bio_long
                                                              , context = "site_id"
                                                              )
                                  )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(freq))

# indicator value ---------
clusters_ind_val <- clusters |>
  dplyr::mutate(ind_val = purrr::map(clusters
                                     , \(x) make_ind_val_df(x
                                                            , bio_wide = bio_wide
                                                            , context = "site_id"
                                                            )
                                     )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(ind_val))

# silhouette width ---------
clusters_sil <- clusters |>
  dplyr::mutate(sil = purrr::map(clusters
                                 , \(x) make_sil_df(x
                                                    , dist_obj = dist_bio
                                                    )
                                 )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(sil))


# gap statistic --------
clusters_gap <- clusters |>
  dplyr::mutate(gap = purrr::map(clusters
                                 , \(x) make_gap_df(x
                                                    , dist_mat = dist_bio_mat
                                                    , n_sample = 30
                                                    )
                                 )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(gap))

# explore -------

clusters_explore <- clusters_summarise |>
  dplyr::left_join(clusters_freq) |>
  dplyr::left_join(clusters_ind_val) |>
  dplyr::left_join(clusters_sil) |>
  dplyr::left_join(clusters_gap)

# find a good clustering ---------
best <- clusters_explore |>
  ## min_clust_size is larger than x sites
  dplyr::filter(min_clust_size > 1) |>
  ## large av clust size
  dplyr::filter(av_clust_size > quantile(av_clust_size, probs = 0.5)) |>
  ## high macro_sil
  dplyr::filter(macro_sil > quantile(macro_sil, probs = 0.5)) |>
  ## high macro_gap
  dplyr::filter(macro_gap > quantile(macro_gap, probs = 0.5))

## dendogram --------
dend <- decorate_dend(clust_df = best$clusters[[1]]
                      , dend = best$dend[[1]]
                      , second_group_col = "Moisture"
                      , label_col = "Moisture"
                      )

plot(dend$dend)

## silhouette plot ---------

make_sil_plot(best$sil[[1]])

Acanthiza/envCluster documentation built on July 16, 2025, 10:17 p.m.