inst/doc/a03_example_clustering_and_dapc.R

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
set.seed(123)

## -----------------------------------------------------------------------------
library(tidypopgen)
vcf_path <-
  system.file("/extdata/anolis/punctatus_t70_s10_n46_filtered.recode.vcf.gz",
    package = "tidypopgen"
  )
anole_gt <-
  gen_tibble(vcf_path, quiet = TRUE, backingfile = tempfile("anolis_"))

## -----------------------------------------------------------------------------
anole_gt

## -----------------------------------------------------------------------------
pops_path <- system.file("/extdata/anolis/punctatus_n46_meta.csv",
  package = "tidypopgen"
)
pops <- read.csv(pops_path)
pops

## ----error=TRUE---------------------------------------------------------------
try({
anole_gt <- anole_gt %>% left_join(pops, by = "id")
})

## -----------------------------------------------------------------------------
anole_gt %>% glimpse()

## -----------------------------------------------------------------------------
anole_gt <- gt_add_sf(anole_gt, c("longitude", "latitude"))
anole_gt

## -----------------------------------------------------------------------------
library(rnaturalearth)
library(ggplot2)

map <- ne_countries(
  continent = "South America",
  type = "map_units", scale = "medium"
)

ggplot() +
  geom_sf(data = map) +
  geom_sf(data = anole_gt$geometry) +
  coord_sf(
    xlim = c(-85, -30),
    ylim = c(-30, 15)
  ) +
  theme_minimal()

## ----error=TRUE---------------------------------------------------------------
try({
anole_pca <- anole_gt %>% gt_pca_partialSVD(k = 30)
})

## -----------------------------------------------------------------------------
anole_gt <- gt_impute_simple(anole_gt, method = "mode")

## -----------------------------------------------------------------------------
anole_pca <- anole_gt %>% gt_pca_partialSVD(k = 30)

## -----------------------------------------------------------------------------
anole_pca

## -----------------------------------------------------------------------------
tidy(anole_pca, matrix = "eigenvalues")

## ----fig.alt = "Scree Plot of eigenvalues for each Principal Component"-------
autoplot(anole_pca, type = "screeplot")

## ----fig.alt = "Score plot of individuals across the first and second Principal Components"----
autoplot(anole_pca, type = "scores")

## ----fig.alt = "Score plot of individuals across the first and second Principal Components, with individual samples coloured by population"----
library(ggplot2)
autoplot(anole_pca, type = "scores") +
  aes(color = anole_gt$population) +
  labs(color = "population")

## -----------------------------------------------------------------------------
anole_gt <- augment(anole_pca, data = anole_gt)

## ----fig.alt = "Another score plot of individuals across the first and second Principal Components, with individual samples coloured by population, using ggplot2"----
anole_gt %>% ggplot(aes(.fittedPC1, .fittedPC2, color = population)) +
  geom_point() +
  labs(x = "PC1", y = "PC2", color = "Population")

## ----fig.alt = "Plot of the loadings of Principal Component 1 for each loci"----
autoplot(anole_pca, type = "loadings")

## -----------------------------------------------------------------------------
anole_gt_load <- augment_loci(anole_pca, data = anole_gt)

## ----fig.alt = "Plot of the cumulative loadings for each Principal Component"----
library(ggplot2)
tidy(anole_pca, matrix = "eigenvalues") %>%
  ggplot(mapping = aes(x = PC, y = cumulative)) +
  geom_point()

## -----------------------------------------------------------------------------
anole_clusters <- gt_cluster_pca(anole_pca, n_pca = 10)

## ----fig.alt = "Plot of BIC (Bayesian Information Criteria) for each value of k"----
autoplot(anole_clusters)

## -----------------------------------------------------------------------------
anole_clusters <- gt_cluster_pca_best_k(anole_clusters)

## -----------------------------------------------------------------------------
anole_clusters$best_k

## -----------------------------------------------------------------------------
anole_dapc <- gt_dapc(anole_clusters)

## -----------------------------------------------------------------------------
anole_dapc

## -----------------------------------------------------------------------------
tidy(anole_dapc, matrix = "eigenvalues")

## ----fig.alt = "Scree plot of the eigenvalues on the two discriminant axes (defined by `ld`)"----
autoplot(anole_dapc, type = "screeplot")

## ----fig.alt = "Bar plot of eigenvalues against the two discriminant axes"----
tidy(anole_dapc, matrix = "eigenvalues") %>%
  ggplot(aes(x = LD, y = eigenvalue)) +
  geom_col()

## ----fig.alt = "Scatterplot of the scores of each individual on the two discriminant axes (defined by `ld`)"----
autoplot(anole_dapc, type = "scores")

## ----fig.alt = "Bar plot showing the probability of assignment to each cluster"----
autoplot(anole_dapc, type = "components", group = anole_gt$population)

## ----fig.alt = "Plot of the loadings of Principal Component 1 for each loci"----
autoplot(anole_dapc, "loadings")

## ----fig.alt = "Scatterplot of individuals with ellipses surrounding each cluster of individuals identified by the DAPC analysis. Eigenvalues are displayed in an inset in the bottom right corner of the plot."----
library(adegenet)
scatter(anole_dapc, posi.da = "bottomright")

## -----------------------------------------------------------------------------
anole_gt <- anole_gt %>% mutate(dapc = anole_dapc$grp)

ggplot() +
  geom_sf(data = map) +
  geom_sf(data = anole_gt$geometry, aes(color = anole_gt$dapc)) +
  coord_sf(
    xlim = c(-85, -30),
    ylim = c(-30, 15)
  ) +
  labs(color = "DAPC cluster") +
  theme_minimal()

## ----results='hide', echo=TRUE, eval=FALSE------------------------------------
# anole_gt <- anole_gt %>% group_by(population)
# 
# anole_adm_original <- gt_admixture(
#   x = anole_gt,
#   k = 2:6,
#   n_runs = 1,
#   crossval = TRUE,
#   seed = 1
# )

## ----results="hide", echo=FALSE, eval=TRUE, warning=FALSE---------------------
anole_gt <- anole_gt %>% group_by(population)

anole_adm <- readRDS("a03_anole_adm.rds")

## ----fig.alt = "Plot of cross-entropy for each value of K, showing K = 3 has the lowest cross-entropy"----
autoplot(anole_adm, type = "cv")

## ----fig.alt = "Barplot of individuals coloured by predicted ancestry proportion (Q) from each of K ancestral sources"----
autoplot(anole_adm,
  k = 3, run = 1, data = anole_gt, annotate_group = FALSE,
  type = "barplot"
)

## ----fig.alt = "Barplot of individuals coloured by predicted ancestry proportion (Q) from each of K ancestral sources"----
autoplot(anole_adm,
  type = "barplot", k = 3, run = 1, data = anole_gt,
  annotate_group = TRUE, arrange_by_group = TRUE
)

## ----fig.alt = "Barplot of individuals coloured by predicted ancestry proportion (Q) from each of K ancestral sources, with individuals arranged according to their dominant ancestry propotion"----
autoplot(anole_adm,
  type = "barplot", k = 3, run = 1,
  data = anole_gt, annotate_group = TRUE, arrange_by_group = TRUE,
  arrange_by_indiv = TRUE, reorder_within_groups = TRUE
)

## -----------------------------------------------------------------------------
q_mat <- get_q_matrix(anole_adm, k = 3, run = 1)

## -----------------------------------------------------------------------------
anole_gt_adm <- augment(q_mat, data = anole_gt)
head(anole_gt_adm)

## -----------------------------------------------------------------------------
tidy_q <- tidy(q_mat, anole_gt)
head(tidy_q)

## ----fig.alt = "Barplot of individuals coloured by predicted ancestry proportion (Q) from each of K ancestral sources"----
tidy_q <- tidy_q %>%
  dplyr::group_by(id) %>%
  dplyr::mutate(dominant_q = max(percentage)) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(group, dplyr::desc(dominant_q)) %>%
  dplyr::mutate(
    plot_order = dplyr::row_number(),
    id = factor(id, levels = unique(id))
  )

plt <- ggplot2::ggplot(tidy_q, ggplot2::aes(x = id, y = percentage, fill = q)) +
  ggplot2::geom_col(
    width = 1,
    position = ggplot2::position_stack(reverse = TRUE)
  ) +
  ggplot2::labs(
    y = "Population Structure for K = 3",
    title = "ADMIXTURE algorithm on A. punctatus"
  ) +
  theme_distruct() +
  scale_fill_distruct()

plt

## -----------------------------------------------------------------------------
anole_adm <- gt_admix_reorder_q(anole_adm, group = anole_gt$pop)

## ----fig.alt = "Barplot of individuals coloured by predicted ancestry proportion (Q) from each of K ancestral sources, labelled Atlantic forest or Amazonian forest"----
autoplot(anole_adm,
  type = "barplot", k = 3, run = 1,
  annotate_group = TRUE, arrange_by_group = TRUE,
  arrange_by_indiv = TRUE, reorder_within_groups = TRUE
)

## ----echo=FALSE, results='hide'-----------------------------------------------
# reload the admxiture results (they were reordered above)
anole_adm <- readRDS("a03_anole_adm.rds")
runs <- c(1)
k_values <- c(2:6)
q_matrices <- list()
adm_dir <- file.path(tempdir(), "anolis_adm")
if (!dir.exists(adm_dir)) {
  dir.create(adm_dir)
}

for (x in k_values) {
  q_matrices[[x]] <- list()

  for (i in runs) {
    q_matrices[[x]][[i]] <- get_q_matrix(anole_adm, k = x, run = i)

    write.table(q_matrices[[x]][[i]],
      file = paste0(adm_dir, "/K", x, "run", i, ".Q"),
      col.names = FALSE, row.names = FALSE, quote = FALSE
    )
  }
}

## -----------------------------------------------------------------------------
adm_dir <- file.path(tempdir(), "anolis_adm")
list.files(adm_dir)

## -----------------------------------------------------------------------------
q_list <- read_q_files(adm_dir)
summary(q_list)

## -----------------------------------------------------------------------------
head(get_q_matrix(q_list, k = 3, run = 1))

## ----fig.alt = "Barplot of individuals coloured by predicted ancestry proportion (Q) from each of K ancestral sources"----
autoplot(get_q_matrix(q_list, k = 3, run = 1),
  data = anole_gt,
  annotate_group = TRUE, arrange_by_group = TRUE,
  arrange_by_indiv = TRUE, reorder_within_groups = TRUE
)

Try the tidypopgen package in your browser

Any scripts or data that you put into this service are public.

tidypopgen documentation built on Aug. 28, 2025, 1:08 a.m.