make_ind_val_df: Create a dataframe of indicator values for a clustering
In Acanthiza/envCluster: Create Clusters From Records of Taxa

make_ind_val_df

R Documentation

Create a dataframe of indicator values for a clustering

Description

Create a dataframe of indicator values for a clustering

Usage

make_ind_val_df(
  clust_df = NULL,
  bio_wide,
  context,
  taxas = NULL,
  clust_col = "cluster",
  p_val_thresh = 0.05,
  ...
)

Arguments

`clust_df`	Dataframe with context columns and a column with cluster membership for that context. Optional if `clust_col` appears in bio_wide.
`bio_wide`	Dataframe containing the site and taxa data in wide format.
`context`	Character. Name(s) of column(s) that define the context.
`taxas`	Character. Vector of taxa names (column names in `bio_wide`). Optional if `bio_wide` contains only taxa names and `context` columns.
`clust_col`	Character. Name of column containing cluster membership.
`p_val_thresh`	Numeric. Threshold p value at which to accept a taxa as an indicator.
`...`	Passed to `labdsv::indval()`

Value

Dataframe of each taxa and the cluster (clust as numeric, cluster as character) class to which it is most likely an indicator, plus the following values from labdsv::indval() output: ind_val, p_val, abu and frq.

Examples


# data from: https://www.davidzeleny.net/anadat-r/doku.php/en:data:dune
bio_wide <- read.delim('https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/dune2.spe.txt', row.names = 1) |>
  tibble::as_tibble() |>
  dplyr::mutate(site_id = dplyr::row_number())

bio_long <- bio_wide |>
  tidyr::pivot_longer(cols = - site_id)

sites <- read.delim('https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/dune2.env.txt', row.names = 1) |>
  tibble::as_tibble() |>
  dplyr::mutate(site_id = dplyr::row_number()
                , Moisture = forcats::fct_relevel(as.character(Moisture), "1", "2", "4", "5")
                )

dist_bio <- vegan::vegdist(bio_wide)
dist_bio_mat <- as.matrix(dist_bio)

dist_env <- dist(sites |>
                   dplyr::select(where(is.numeric))
                 )

possible_groups <- 2:(nrow(bio_wide) * 3 / 4)

use_methods <- tibble::tibble(method = c("mcquitty", "average", "geo", "geo")
                              , alpha = c(NA, NA,  0.1, 0.3)
                              )

# make dendograms ------
dends <- use_methods |>
  dplyr::mutate(dend = purrr::map2(method
                                   , alpha
                                   , \(x, y) make_dend(method = x
                                                       , dist_bio = dist_bio
                                                       , dist_env = dist_env
                                                       , geo_alpha = y
                                                       )
                                   )
                )

# knn clusters ---------
knn_clusters <- tibble::tibble(method = "knn"
                                 , groups = possible_groups
                                 ) |>
  dplyr::mutate(knn = purrr::map(groups
                                 , \(x) make_knn_clust(dist_bio_mat
                                                       , sites
                                                       , suggested_k = x
                                                       )
                                 )
                , clusters = purrr::map(knn, "clusters")
                , groups = purrr::map_dbl(knn, "cnumber")
                ) |>
  dplyr::group_by(groups) |>
  dplyr::slice(1) |>
  dplyr::ungroup()

# make clusters -----------
clusters <- dends |>
  dplyr::mutate(clusters = purrr::map(dend
                                      , \(x) make_clusters(x
                                                           , group_range = 2:(nrow(bio_wide) / 2)
                                                           , sites = sites
                                                           )
                                      )
                ) |>
  tidyr::unnest(cols = c(clusters)) |>
  dplyr::bind_rows(knn_clusters)

# summarise clusters --------
clusters_summarise <- clusters |>
  dplyr::mutate(summary = purrr::map(clusters, \(x) make_summary_df(x))) |>
  tidyr::unnest(cols = c(summary))

# frequency of taxa --------
clusters_freq <- clusters |>
  dplyr::mutate(freq = purrr::map(clusters, \(x) make_freq_df(clust_df = x
                                                              , bio_df = bio_long
                                                              , context = "site_id"
                                                              )
                                  )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(freq))

# indicator value ---------
clusters_ind_val <- clusters |>
  dplyr::mutate(ind_val = purrr::map(clusters
                                     , \(x) make_ind_val_df(x
                                                            , bio_wide = bio_wide
                                                            , context = "site_id"
                                                            )
                                     )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(ind_val))

# silhouette width ---------
clusters_sil <- clusters |>
  dplyr::mutate(sil = purrr::map(clusters
                                 , \(x) make_sil_df(x
                                                    , dist_obj = dist_bio
                                                    )
                                 )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(sil))


# gap statistic --------
clusters_gap <- clusters |>
  dplyr::mutate(gap = purrr::map(clusters
                                 , \(x) make_gap_df(x
                                                    , dist_mat = dist_bio_mat
                                                    , n_sample = 30
                                                    )
                                 )
                ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(gap))

# explore -------

clusters_explore <- clusters_summarise |>
  dplyr::left_join(clusters_freq) |>
  dplyr::left_join(clusters_ind_val) |>
  dplyr::left_join(clusters_sil) |>
  dplyr::left_join(clusters_gap)

# find a good clustering ---------
best <- clusters_explore |>
  ## min_clust_size is larger than x sites
  dplyr::filter(min_clust_size > 1) |>
  ## large av clust size
  dplyr::filter(av_clust_size > quantile(av_clust_size, probs = 0.5)) |>
  ## high macro_sil
  dplyr::filter(macro_sil > quantile(macro_sil, probs = 0.5)) |>
  ## high macro_gap
  dplyr::filter(macro_gap > quantile(macro_gap, probs = 0.5))

## dendogram --------
dend <- decorate_dend(clust_df = best$clusters[[1]]
                      , dend = best$dend[[1]]
                      , second_group_col = "Moisture"
                      , label_col = "Moisture"
                      )

plot(dend$dend)

## silhouette plot ---------

make_sil_plot(best$sil[[1]])

Acanthiza/envCluster documentation built on July 16, 2025, 10:17 p.m.