Nothing
#' Read and structure .Q files or existing matrices as `q_matrix` or `gt_admix`
#' objects.
#'
#' This function reads .Q matrix files generated by external clustering
#' algorithms (such as ADMIXTURE) and transforms them into `gt_admix` objects.
#'
#' @param x can be:
#' - a path to a directory containing .Q files
#' @return
#' - a `gt_admix` object containing a list of Q matrices and a list of
#' indices for each Q matrix separated by K
#' @export
#' @examples
#' q_files_path <- system.file("extdata", "anolis", package = "tidypopgen")
#'
#' admix_obj <- read_q_files(q_files_path)
#' summary(admix_obj)
read_q_files <- function(x) {
if (!dir.exists(x)) {
stop("Input is not a valid directory")
} else if (dir.exists(x)) {
# List all .Q files in the directory
files <- list.files(x, pattern = "\\.Q(?:\\.gz)?$", full.names = TRUE)
# Check if the directory contains at least one .Q file
if (length(files) == 0) {
stop("No .Q files found in the directory")
}
# Read all .Q files into a list and convert each to a q-matrix
matrix_list <- lapply(files, function(file) {
con <- if (grepl("\\.gz$", file)) gzfile(file, "rt") else file
data <- utils::read.table(con, header = FALSE)
if (grepl("\\.gz$", file)) {
close(con)
}
q_matrix(data)
})
# Get the number of columns for each Q matrix (which corresponds to K)
k_values <- sapply(matrix_list, ncol)
# Create main list containing k_list and matrix list
gt_admix <- list(k = k_values, Q = matrix_list)
# Set class and return the final list
class(gt_admix) <- c("gt_admix", class(gt_admix))
return(gt_admix) # nolint
}
}
#' Convert a standard matrix to a `q_matrix` object
#'
#' Takes a single Q matrix that exists as either a matrix or a data frame and
#' returns a `q_matrix` object.
#'
#' @param x A matrix or a data frame
#' @return A `q_matrix` object
#' @export
#' @examples
#' # Read in a single .Q file
#' q_mat <- read.table(system.file("extdata", "anolis", "anolis_ld_run1.3.Q",
#' package = "tidypopgen"
#' ))
#' class(q_mat)
#'
#' # Convert to a Q matrix object
#' q_mat <- q_matrix(q_mat)
#' class(q_mat)
#'
q_matrix <- function(x) {
if (inherits(x, "data.frame")) {
x <- as.matrix(x)
}
colnames(x) <- paste0(".Q", seq_len(ncol(x)))
class(x) <- c("q_matrix", class(x))
return(x) # nolint
}
#' Return a single Q matrix from a `gt_admix` object
#'
#' This function retrieves a single Q matrix from a `gt_admix` object
#' based on the specified k value and run number.
#'
#' @param x A `gt_admix` object containing multiple Q matrices
#' @param ... Not used
#' @param k The k value of the desired Q matrix
#' @param run The run number of the desired Q matrix
#' @return A single Q matrix from the `gt_admix` object
#' @export
#' @examples
#' # Read example gt_admix obejct
#' admix_obj <-
#' readRDS(system.file("extdata", "anolis", "anole_adm_k3.rds",
#' package = "tidypopgen"
#' ))
#'
#' # Extract a Q matrix
#' get_q_matrix(admix_obj, k = 3, run = 1)
get_q_matrix <- function(x, ..., k, run) {
# Check if 'x' is a valid gt_admix object
if (!inherits(x, "gt_admix")) {
stop("Input must be a 'gt_admix' object")
}
# Check if the requested k exists in k_indices
if (!k %in% x$k) {
stop("Specified k value not found in gt_admix")
}
k_values <- split(seq_along(x$k), x$k)
k <- which(names(k_values) == k)
# Get the indices for the specified k
k_indices <- k_values[[k]]
# Check if the specified run is valid within the chosen k
if (run < 1 || run > length(k_indices)) {
stop("Specified run is out of range for the given k value")
}
# Get the index of the desired q-matrix in q_matrices
matrix_index <- k_indices[run]
# Retrieve and return the q-matrix
q_matrix <- x$Q[[matrix_index]]
# if the id variable exists, add it as an attribute
if ("id" %in% names(x)) {
attr(q_matrix, "id") <- x$id
}
# if a grouping varible exist, add it as an attribute
if ("group" %in% names(x)) {
attr(q_matrix, "group") <- x$group
}
class(q_matrix) <- c("q_matrix", class(q_matrix))
return(q_matrix) # nolint
}
#' Return a single P matrix from a `gt_admix` object
#'
#' This function retrieves a single P matrix from a `gt_admix` object
#' based on the specified k value and run number.
#'
#' @param x A `gt_admix` object containing P matrices
#' @param ... Not used
#' @param k The k value of the desired P matrix
#' @param run The run number of the desired P matrix
#' @return A single P matrix from the `gt_admix` object
#' @export
#' @examples
#' # Read example gt_admix object
#' admix_obj <-
#' readRDS(system.file("extdata", "anolis", "anole_adm_k3.rds",
#' package = "tidypopgen"
#' ))
#'
#' # Extract a P matrix
#' get_p_matrix(admix_obj, k = 3, run = 1)
get_p_matrix <- function(x, ..., k, run) {
# Check if 'x' is a valid gt_admix object
if (!inherits(x, "gt_admix")) {
stop("Input must be a 'gt_admix' object")
}
# Check if p_matrices exists in x
if (!"P" %in% names(x)) {
stop("Input object does not contain any P matrices")
}
# Check if the requested k exists in k_indices
if (!k %in% x$k) {
stop("Specified k value not found in gt_admix")
}
k_values <- split(seq_along(x$k), x$k)
k <- which(names(k_values) == k)
# Get the indices for the specified k
k_indices <- k_values[[k]]
# Check if the specified run is valid within the chosen k
if (run < 1 || run > length(k_indices)) {
stop("Specified run is out of range for the given k value")
}
# Get the index of the desired p-matrix in p_matrices
matrix_index <- k_indices[run]
# Retrieve and return the p-matrix
p_matrix <- x$P[[matrix_index]]
return(p_matrix) # nolint
}
#' Tidy a Q matrix
#'
#' Takes a `q_matrix` object, which is a matrix, and returns a tidied tibble.
#'
#' @param x A Q matrix object (as returned by [`q_matrix`]).
#' @param data An associated tibble (e.g. a [`gen_tibble`]), with the
#' individuals in the same order as the data used to generate the Q matrix
#' @param ... not currently used
#' @return A tidied tibble containing columns:
#' \item{`row`}{ID of the original observation (i.e. rowname from original
#' data).}
#' \item{`Q`}{Integer indicating a Q component.} \item{`value`}{The proportion
#' for that particular Q value.}
#'
#' @export
#' @examples
#' # run the example only if we have the package installed
#' if (requireNamespace("LEA", quietly = TRUE)) {
#' example_gt <- load_example_gt("gen_tbl")
#'
#' # Create a gt_admix object
#' admix_obj <- example_gt %>% gt_snmf(k = 1:3, project = "force")
#'
#' # Extract a Q matrix
#' q_mat_k3 <- get_q_matrix(admix_obj, k = 3, run = 1)
#'
#' tidy(q_mat_k3, data = example_gt)
#' }
tidy.q_matrix <- function(x, data, ...) {
rlang::check_dots_empty()
# convert to tibble
q_tbl <- x %>%
tibble::as_tibble()
# use attributes if supplied
if (!is.null(attr(x, "id")) && !is.null(attr(x, "group"))) {
q_tbl$id <- attr(x, "id")
q_tbl$group <- attr(x, "group")
for (i in seq_len(ncol(q_tbl))) {
attr(q_tbl[[i]], "id") <- NULL
attr(q_tbl[[i]], "group") <- NULL
attr(q_tbl[[i]], "class") <- NULL
}
# otherwise use supplied gen_tibble
} else if (
is.null(attr(x, "id")) &&
is.null(attr(x, "group")) &&
inherits(data, "grouped_df")
) {
# nolint
groupdf <- data %>% dplyr::group_data()
colu <- seq(1, nrow(data))
for (i in seq_len(nrow(groupdf))) {
group_rows <- groupdf$.rows[[i]]
group_name <- groupdf[[1]][i]
colu[group_rows] <- as.character(group_name)
}
q_tbl <- q_tbl %>%
dplyr::mutate(
id = data$id,
group = colu
)
} else {
q_tbl <- q_tbl %>%
dplyr::mutate(id = data$id)
}
# pivot the resulting data
q_tbl <- q_tbl %>%
tidyr::pivot_longer(
cols = dplyr::starts_with(".Q"),
names_to = "q",
values_to = "percentage"
) %>%
dplyr::mutate(percentage = as.numeric(.data$percentage))
q_tbl
}
#' Augment data with information from a q_matrix object
#'
#' Augment for `q_matrix` accepts a model object and a dataset and adds Q values
#' to each observation in the dataset. Q values are stored in separate columns,
#' which is given name with the pattern ".Q1",".Q2", etc. For consistency with
#' [broom::augment.prcomp], a column ".rownames" is also returned; it is a copy
#' of 'id', but it ensures that any scripts written for data augmented with
#' [broom::augment.prcomp] will work out of the box (this is especially helpful
#' when adapting plotting scripts).
#' @param x A `q_matrix` object
#' @param data the `gen_tibble` used to run the clustering algorithm
#' @param ... Not used. Needed to match generic signature only.
#' @return A [gen_tibble] containing the original data along with additional
#' columns containing each observation's Q values.
#' @export
#' @name augment_q_matrix
#' @examples
#' # run the example only if we have the package installed
#' if (requireNamespace("LEA", quietly = TRUE)) {
#' example_gt <- load_example_gt("gen_tbl")
#'
#' # Create a gt_admix object
#' admix_obj <- example_gt %>% gt_snmf(k = 1:3, project = "force")
#'
#' # Extract a Q matrix
#' q_mat_k3 <- get_q_matrix(admix_obj, k = 3, run = 1)
#'
#' # Augment the gen_tibble with Q values
#' augment(q_mat_k3, data = example_gt)
#' }
augment.q_matrix <- function(x, data = NULL, ...) {
if (inherits(data, "grouped_df")) {
if (!".rownames" %in% names(data)) {
group_vars <- group_vars(data)
data <- data %>% dplyr::ungroup()
data <- data %>%
dplyr::mutate(.rownames = row_number())
data <- data %>% dplyr::group_by(across(all_of(group_vars)))
}
} else {
if (!".rownames" %in% names(data)) {
data <- data %>%
dplyr::mutate(.rownames = data$id)
}
}
q_tbl <- x %>%
tibble::as_tibble()
if (!is.null(attr(x, "id")) && !is.null(attr(x, "group"))) {
q_tbl$id <- attr(x, "id")
q_tbl$group <- attr(x, "group")
for (i in seq_len(ncol(q_tbl))) {
attr(q_tbl[[i]], "id") <- NULL
attr(q_tbl[[i]], "group") <- NULL
attr(q_tbl[[i]], "class") <- NULL
}
} else if (
is.null(attr(x, "id")) &&
is.null(attr(x, "group")) &&
inherits(data, "grouped_df")
) {
# nolint
groupdf <- data %>% dplyr::group_data()
colu <- seq(1, nrow(data))
for (i in seq_len(nrow(groupdf))) {
group_rows <- groupdf$.rows[[i]]
group_name <- groupdf[[1]][i]
colu[group_rows] <- as.character(group_name)
}
q_tbl <- q_tbl %>%
dplyr::mutate(
id = data$id,
group = colu
)
} else {
q_tbl <- q_tbl %>%
dplyr::mutate(id = data$id)
}
q_tbl
# and finally left join
data <- dplyr::left_join(data, q_tbl, by = "id")
if (inherits(data, "grouped_df")) {
data <- data %>% dplyr::group_by(across(group_vars(data)))
class(data) <- c("gen_tbl", class(data))
}
data
}
#' Autoplots for `q_matrix` objects
#'
#' This autoplot will automatically rearrange individuals according to their id
#' and any grouping variables if an associated 'data' gen_tibble is provided. To
#' avoid any automatic re-sorting of individuals, set `arrange_by_group` and
#' `arrange_by_indiv` to FALSE.
#'
#' @param object A Q matrix object (as returned by [q_matrix()]).
#' @param data An associated tibble (e.g. a [`gen_tibble`]), with the
#' individuals in the same order as the data used to generate the Q matrix
#' @param annotate_group Boolean determining whether to annotate the plot with
#' the group information
#' @param arrange_by_group Boolean determining whether to arrange the
#' individuals by group. If the grouping variable in the `gen_tibble` or the
#' metadata of the `gt_admixt` object is a factor, the data will be ordered by
#' the levels of the factor; else it will be ordered alphabetically.
#' @param arrange_by_indiv Boolean determining whether to arrange the
#' individuals by their individual id (if arrange_by_group is TRUE, they will
#' be arranged by group first and then by individual id, i.e. within each
#' group). If `id` in the `get_tibble` or the metadata of the `gt_admix`
#' object is a factor, it will be ordered by the levels of the factor; else it
#' will be ordered alphabetically.
#' @param reorder_within_groups Boolean determining whether to reorder the
#' individuals within each group based on their ancestry proportion (note that
#' this is not advised if you are making multiple plots, as you would get a
#' different order for each plot!). If TRUE, `annotate_group` must also be
#' TRUE.
#' @param ... not currently used.
#' @returns a barplot of individuals, coloured by ancestry proportion
#' @name autoplot_q_matrix
#' @export
#' @examples
#' # Read example gt_admix obejct
#' admix_obj <-
#' readRDS(system.file("extdata", "anolis", "anole_adm_k3.rds",
#' package = "tidypopgen"
#' ))
#'
#' # Extract a Q matrix
#' q_mat_k3 <- get_q_matrix(admix_obj, k = 3, run = 1)
#'
#' # Basic autoplot
#' autoplot(q_mat_k3, annotate_group = FALSE, arrange_by_group = FALSE)
#'
#' # To arrange individuals by group and by Q proportion
#' autoplot(q_mat_k3,
#' annotate_group = TRUE, arrange_by_group = TRUE,
#' arrange_by_indiv = TRUE, reorder_within_groups = TRUE
#' )
#'
autoplot.q_matrix <- function(
object,
data = NULL,
annotate_group = TRUE,
arrange_by_group = TRUE,
arrange_by_indiv = TRUE,
reorder_within_groups = FALSE,
...) {
rlang::check_dots_empty()
# test that if reorder_within_groups is TRUE, annotate_group is also TRUE
if (reorder_within_groups && !annotate_group) {
stop("If reorder_within_groups is TRUE, annotate_group should also be TRUE")
}
K <- ncol(object) # nolint
# create dataset if we don't have a gen_tibble
if (is.null(data)) {
q_tbl <- as.data.frame(object)
# if the q_matrix has an id attribute, add it to the data
if ("id" %in% names(attributes(object))) {
q_tbl$id <- attr(object, "id")
} else {
q_tbl$id <- seq_len(nrow(q_tbl))
}
# if the q_matrix has a group attribute, add it to the data
if ("group" %in% names(attributes(object))) {
q_tbl$group <-
rep(
attr(object, "group"),
each = nrow(q_tbl) / length(attr(object, "group"))
)
}
} else {
# if we have the info from the gen_tibble
# if there is geometry, drop it
data <- sf::st_drop_geometry(data)
q_tbl <- augment(object, data)
}
# if we have a grouping variable and we plan to use it, then reorder by it
q_tbl$id <- seq_len(nrow(q_tbl))
if (("group" %in% names(q_tbl)) && annotate_group) {
if (arrange_by_group && arrange_by_indiv) {
q_tbl <- q_tbl %>%
dplyr::arrange(.data$group, .data$id)
} else if (arrange_by_group) {
q_tbl <- q_tbl %>%
dplyr::arrange(.data$group)
} else if (arrange_by_indiv) {
q_tbl <- q_tbl %>%
dplyr::arrange(.data$group)
}
}
# now reset the id to the new order
q_tbl$id <- seq_len(nrow(q_tbl))
q_tbl <- q_tbl %>%
tidyr::pivot_longer(
cols = dplyr::starts_with(".Q"),
names_to = "q",
values_to = "percentage"
) %>%
dplyr::mutate(percentage = as.numeric(.data$percentage))
# if we reorder within group
if (("group" %in% names(q_tbl)) && reorder_within_groups) {
q_tbl <- q_tbl %>%
dplyr::group_by(.data$group, .data$id) %>%
# only sort by group, so that we don't reorder individuals within groups
dplyr::arrange(.data$group) %>%
dplyr::mutate(
q = factor(
.data$q,
levels = .data$q[order(.data$percentage, decreasing = FALSE)]
)
) # nolint
dominant_q <- q_tbl %>%
dplyr::group_by(.data$id) %>%
dplyr::summarize(dominant_q = max(.data$percentage), .groups = "drop")
q_tbl <- q_tbl %>%
dplyr::left_join(dominant_q, by = "id")
q_tbl <- q_tbl %>%
dplyr::group_by(.data$group) %>%
dplyr::arrange(desc(.data$dominant_q), .by_group = TRUE)
}
levels_q <- unique(q_tbl$id)
q_tbl <- q_tbl %>%
dplyr::mutate(id = factor(.data$id, levels = levels_q))
plt <- ggplot2::ggplot(
q_tbl,
ggplot2::aes(
x = .data$id,
y = .data$percentage,
fill = .data$q
)
) +
ggplot2::geom_col(
width = 1,
position = "stack"
) +
ggplot2::labs(y = paste("K = ", K)) +
theme_distruct() +
scale_fill_distruct()
if (annotate_group) {
if (!"group" %in% names(q_tbl)) {
warning(paste(
"no annotation possible if 'gen_tbl' is NULL and",
"q_matrix does not contain group information"
))
} else {
plt <- plt + annotate_group_info(q_tbl, plt) # nolint
}
}
plt
}
# thin vertical lines show when plot is saved as .pdf and opened with certain
# viewers, this is a product of the specific viewer (seen on unix and mac),
# knitting to html instead fixes, or choosing a different output (not pdf)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.