View source: R/make_ind_val_df.R
make_ind_val_df | R Documentation |
Create a dataframe of indicator values for a clustering
make_ind_val_df(
clust_df = NULL,
bio_wide,
context,
taxas = NULL,
clust_col = "cluster",
p_val_thresh = 0.05,
...
)
clust_df |
Dataframe with context columns and a column with cluster
membership for that context. Optional if `clust_col` is already a column of `bio_wide`. |
bio_wide |
Dataframe containing the site and taxa data in wide format. |
context |
Character. Name(s) of column(s) that define the context. |
taxas |
Character. Vector of taxa names (column names in `bio_wide`). |
clust_col |
Character. Name of column containing cluster membership. |
p_val_thresh |
Numeric. Threshold p value at which to accept a taxon as an indicator. |
... |
Passed to `labdsv::indval()`. |
Dataframe with one row per taxon, giving the cluster (`clust` as numeric,
`cluster` as character) for which that taxon is most likely an indicator,
plus the following values from the `labdsv::indval()` output:
`ind_val`, `p_val`, `abu` and `frq`.
# data from: https://www.davidzeleny.net/anadat-r/doku.php/en:data:dune

# Taxa data in wide format: one row per site, one column per taxon.
# `site_id` (the row number) is added as the context column identifying a site.
bio_wide <- read.delim(
  'https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/dune2.spe.txt',
  row.names = 1
) |>
  tibble::as_tibble() |>
  dplyr::mutate(site_id = dplyr::row_number())

# The same data in long format: one row per site x taxon.
bio_long <- bio_wide |>
  tidyr::pivot_longer(cols = -site_id)

# Site (environmental) data, keyed by the same `site_id`. Moisture is turned
# into a factor with an explicit level order.
sites <- read.delim(
  'https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/dune2.env.txt',
  row.names = 1
) |>
  tibble::as_tibble() |>
  dplyr::mutate(
    site_id = dplyr::row_number(),
    Moisture = forcats::fct_relevel(as.character(Moisture), "1", "2", "4", "5")
  )

# Dissimilarity between sites based on the taxa data. `site_id` is an
# identifier, not an abundance, so it is dropped before computing the
# distance; otherwise the row number would contribute to the dissimilarity.
dist_bio <- bio_wide |>
  dplyr::select(-site_id) |>
  vegan::vegdist()

dist_bio_mat <- as.matrix(dist_bio)

# Distance between sites based on the numeric site variables. `site_id` is
# numeric too, so it has to be excluded explicitly for the same reason.
dist_env <- sites |>
  dplyr::select(where(is.numeric), -site_id) |>
  dist()

# Candidate numbers of groups: from 2 up to three quarters of the site count.
possible_groups <- 2:(nrow(bio_wide) * 3 / 4)

# Clustering methods to compare; `alpha` is only supplied for the "geo" rows.
use_methods <- tibble::tibble(
  method = c("mcquitty", "average", "geo", "geo"),
  alpha = c(NA, NA, 0.1, 0.3)
)
# make dendrograms ------
# One dendrogram per row of `use_methods`: each method/alpha pair is passed to
# make_dend() together with the two distance objects built above.
dends <- use_methods |>
  dplyr::mutate(
    dend = purrr::map2(
      method,
      alpha,
      function(this_method, this_alpha) {
        make_dend(
          method = this_method,
          dist_bio = dist_bio,
          dist_env = dist_env,
          geo_alpha = this_alpha
        )
      }
    )
  )
# knn clusters ---------
# Run make_knn_clust() once per candidate number of groups, then pull the
# "clusters" and "cnumber" elements out of each result. `groups` is overwritten
# with the extracted "cnumber", and only the first row per distinct value is
# kept.
knn_clusters <- tibble::tibble(
  method = "knn",
  groups = possible_groups
) |>
  dplyr::mutate(
    knn = purrr::map(
      groups,
      function(k) make_knn_clust(dist_bio_mat, sites, suggested_k = k)
    )
  ) |>
  dplyr::mutate(
    clusters = purrr::map(knn, "clusters"),
    groups = purrr::map_dbl(knn, "cnumber")
  ) |>
  dplyr::group_by(groups) |>
  dplyr::slice_head(n = 1) |>
  dplyr::ungroup()
# make clusters -----------
# Build clusterings from every dendrogram (for 2 up to half the number of
# sites), unnest them to one row per clustering, and append the knn results.
clusters <- dends |>
  dplyr::mutate(
    clusters = purrr::map(
      dend,
      function(this_dend) {
        make_clusters(
          this_dend,
          group_range = 2:(nrow(bio_wide) / 2),
          sites = sites
        )
      }
    )
  ) |>
  tidyr::unnest(cols = c(clusters)) |>
  dplyr::bind_rows(knn_clusters)
# summarise clusters --------
# make_summary_df() is applied to each clustering; its output columns are
# unnested alongside the existing ones.
clusters_summarise <- clusters |>
  dplyr::mutate(summary = purrr::map(clusters, make_summary_df)) |>
  tidyr::unnest(cols = c(summary))
# frequency of taxa --------
# make_freq_df() is applied to each clustering using the long-format taxa data;
# the list column of clusterings is dropped before unnesting.
clusters_freq <- clusters |>
  dplyr::mutate(
    freq = purrr::map(
      clusters,
      function(this_clust) {
        make_freq_df(
          clust_df = this_clust,
          bio_df = bio_long,
          context = "site_id"
        )
      }
    )
  ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(freq))
# indicator value ---------
# make_ind_val_df() is applied to each clustering using the wide-format taxa
# data; the list column of clusterings is dropped before unnesting.
clusters_ind_val <- clusters |>
  dplyr::mutate(
    ind_val = purrr::map(
      clusters,
      function(this_clust) {
        make_ind_val_df(
          this_clust,
          bio_wide = bio_wide,
          context = "site_id"
        )
      }
    )
  ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(ind_val))
# silhouette width ---------
# make_sil_df() is applied to each clustering with the taxa distance object;
# the list column of clusterings is dropped before unnesting.
clusters_sil <- clusters |>
  dplyr::mutate(
    sil = purrr::map(
      clusters,
      function(this_clust) make_sil_df(this_clust, dist_obj = dist_bio)
    )
  ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(sil))
# gap statistic --------
# make_gap_df() is applied to each clustering with the taxa distance matrix
# and n_sample = 30; the list column of clusterings is dropped before
# unnesting.
clusters_gap <- clusters |>
  dplyr::mutate(
    gap = purrr::map(
      clusters,
      function(this_clust) {
        make_gap_df(
          this_clust,
          dist_mat = dist_bio_mat,
          n_sample = 30
        )
      }
    )
  ) |>
  dplyr::select(-clusters) |>
  tidyr::unnest(cols = c(gap))
# explore -------
# Left-join every per-clustering table onto the summary table, in order.
# No `by` is given, so each join uses the columns the two tables share.
clusters_explore <- list(
  clusters_summarise,
  clusters_freq,
  clusters_ind_val,
  clusters_sil,
  clusters_gap
) |>
  purrr::reduce(dplyr::left_join)
# find a good clustering ---------
# The filters are applied one after another, so each quantile threshold is
# computed only on the rows that survived the previous filter.
best <- clusters_explore |>
  ## keep rows where min_clust_size is greater than 1
  dplyr::filter(min_clust_size > 1) |>
  ## keep rows with av_clust_size above the 0.5 quantile
  dplyr::filter(
    av_clust_size > stats::quantile(av_clust_size, probs = 0.5)
  ) |>
  ## keep rows with macro_sil above the 0.5 quantile
  dplyr::filter(
    macro_sil > stats::quantile(macro_sil, probs = 0.5)
  ) |>
  ## keep rows with macro_gap above the 0.5 quantile
  dplyr::filter(
    macro_gap > stats::quantile(macro_gap, probs = 0.5)
  )
## dendrogram --------
# Decorate the dendrogram of the first row of `best`, using "Moisture" for
# both the second grouping and the labels, then plot it.
dend <- decorate_dend(
  clust_df = best$clusters[[1]],
  dend = best$dend[[1]],
  second_group_col = "Moisture",
  label_col = "Moisture"
)

plot(dend$dend)

## silhouette plot ---------
# Silhouette plot for the first row of `best`.
make_sil_plot(best$sil[[1]])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.