R/q_matrix.R

Defines functions autoplot.q_matrix augment.q_matrix tidy.q_matrix get_p_matrix get_q_matrix q_matrix read_q_files

Documented in augment.q_matrix autoplot.q_matrix get_p_matrix get_q_matrix q_matrix read_q_files tidy.q_matrix

#' Read and structure .Q files or existing matrices as `q_matrix` or `gt_admix`
#' objects.
#'
#' This function reads .Q matrix files generated by external clustering
#' algorithms (such as ADMIXTURE) and transforms them into `gt_admix` objects.
#'
#' @param x can be:
#' - a path to a directory containing .Q files
#' @return
#' - a `gt_admix` object containing a list of Q matrices and a list of
#' indices for each Q matrix separated by K
#' @export
#' @examples
#' q_files_path <- system.file("extdata", "anolis", package = "tidypopgen")
#'
#' admix_obj <- read_q_files(q_files_path)
#' summary(admix_obj)
read_q_files <- function(x) {
  if (!dir.exists(x)) {
    stop("Input is not a valid directory")
  } else if (dir.exists(x)) {
    # List all .Q files in the directory
    files <- list.files(x, pattern = "\\.Q(?:\\.gz)?$", full.names = TRUE)

    # Check if the directory contains at least one .Q file
    if (length(files) == 0) {
      stop("No .Q files found in the directory")
    }

    # Read all .Q files into a list and convert each to a q-matrix
    matrix_list <- lapply(files, function(file) {
      con <- if (grepl("\\.gz$", file)) gzfile(file, "rt") else file
      data <- utils::read.table(con, header = FALSE)
      if (grepl("\\.gz$", file)) {
        close(con)
      }
      q_matrix(data)
    })

    # Get the number of columns for each Q matrix (which corresponds to K)
    k_values <- sapply(matrix_list, ncol)
    # Create main list containing k_list and matrix list
    gt_admix <- list(k = k_values, Q = matrix_list)
    # Set class and return the final list
    class(gt_admix) <- c("gt_admix", class(gt_admix))
    return(gt_admix) # nolint
  }
}

#' Convert a standard matrix to a `q_matrix` object
#'
#' Takes a single Q matrix that exists as either a matrix or a data frame and
#' returns a `q_matrix` object.
#'
#' @param x A matrix or a data frame
#' @return A `q_matrix` object
#' @export
#' @examples
#' # Read in a single .Q file
#' q_mat <- read.table(system.file("extdata", "anolis", "anolis_ld_run1.3.Q",
#'   package = "tidypopgen"
#' ))
#' class(q_mat)
#'
#' # Convert to a Q matrix object
#' q_mat <- q_matrix(q_mat)
#' class(q_mat)
#'
q_matrix <- function(x) {
  if (inherits(x, "data.frame")) {
    x <- as.matrix(x)
  }
  colnames(x) <- paste0(".Q", seq_len(ncol(x)))
  class(x) <- c("q_matrix", class(x))
  return(x) # nolint
}


#' Return a single Q matrix from a `gt_admix` object
#'
#' This function retrieves a single Q matrix from a `gt_admix` object
#' based on the specified k value and run number.
#'
#' @param x A `gt_admix` object containing multiple Q matrices
#' @param ... Not used
#' @param k The k value of the desired Q matrix
#' @param run The run number of the desired Q matrix
#' @return A single Q matrix from the `gt_admix` object
#' @export
#' @examples
#' # Read example gt_admix obejct
#' admix_obj <-
#'   readRDS(system.file("extdata", "anolis", "anole_adm_k3.rds",
#'     package = "tidypopgen"
#'   ))
#'
#' # Extract a Q matrix
#' get_q_matrix(admix_obj, k = 3, run = 1)
get_q_matrix <- function(x, ..., k, run) {
  # Check if 'x' is a valid gt_admix object
  if (!inherits(x, "gt_admix")) {
    stop("Input must be a 'gt_admix' object")
  }

  # Check if the requested k exists in k_indices
  if (!k %in% x$k) {
    stop("Specified k value not found in gt_admix")
  }

  k_values <- split(seq_along(x$k), x$k)

  k <- which(names(k_values) == k)

  # Get the indices for the specified k
  k_indices <- k_values[[k]]

  # Check if the specified run is valid within the chosen k
  if (run < 1 || run > length(k_indices)) {
    stop("Specified run is out of range for the given k value")
  }

  # Get the index of the desired q-matrix in q_matrices
  matrix_index <- k_indices[run]

  # Retrieve and return the q-matrix
  q_matrix <- x$Q[[matrix_index]]
  # if the id variable exists, add it as an attribute
  if ("id" %in% names(x)) {
    attr(q_matrix, "id") <- x$id
  }
  # if a grouping varible exist, add it as an attribute
  if ("group" %in% names(x)) {
    attr(q_matrix, "group") <- x$group
  }
  class(q_matrix) <- c("q_matrix", class(q_matrix))
  return(q_matrix) # nolint
}

#' Return a single P matrix from a `gt_admix` object
#'
#' This function retrieves a single P matrix from a `gt_admix` object
#' based on the specified k value and run number.
#'
#' @param x A `gt_admix` object containing P matrices
#' @param ... Not used
#' @param k The k value of the desired P matrix
#' @param run The run number of the desired P matrix
#' @return A single P matrix from the `gt_admix` object
#' @export
#' @examples
#' # Read example gt_admix object
#' admix_obj <-
#'   readRDS(system.file("extdata", "anolis", "anole_adm_k3.rds",
#'     package = "tidypopgen"
#'   ))
#'
#' # Extract a P matrix
#' get_p_matrix(admix_obj, k = 3, run = 1)
get_p_matrix <- function(x, ..., k, run) {
  # Check if 'x' is a valid gt_admix object
  if (!inherits(x, "gt_admix")) {
    stop("Input must be a 'gt_admix' object")
  }

  # Check if p_matrices exists in x
  if (!"P" %in% names(x)) {
    stop("Input object does not contain any P matrices")
  }

  # Check if the requested k exists in k_indices
  if (!k %in% x$k) {
    stop("Specified k value not found in gt_admix")
  }

  k_values <- split(seq_along(x$k), x$k)

  k <- which(names(k_values) == k)

  # Get the indices for the specified k
  k_indices <- k_values[[k]]

  # Check if the specified run is valid within the chosen k
  if (run < 1 || run > length(k_indices)) {
    stop("Specified run is out of range for the given k value")
  }

  # Get the index of the desired p-matrix in p_matrices
  matrix_index <- k_indices[run]

  # Retrieve and return the p-matrix
  p_matrix <- x$P[[matrix_index]]
  return(p_matrix) # nolint
}


#' Tidy a Q matrix
#'
#' Takes a `q_matrix` object, which is a matrix, and returns a tidied tibble.
#'
#' @param x A Q matrix object (as returned by [`q_matrix`]).
#' @param data An associated tibble (e.g. a [`gen_tibble`]), with the
#'   individuals in the same order as the data used to generate the Q matrix
#' @param ... not currently used
#' @return A tidied tibble containing columns:
#'   \item{`row`}{ID of the original observation (i.e. rowname from original
#'     data).}
#'   \item{`Q`}{Integer indicating a Q component.} \item{`value`}{The proportion
#'   for that particular Q value.}
#'
#' @export
#' @examples
#' # run the example only if we have the package installed
#' if (requireNamespace("LEA", quietly = TRUE)) {
#'   example_gt <- load_example_gt("gen_tbl")
#'
#'   # Create a gt_admix object
#'   admix_obj <- example_gt %>% gt_snmf(k = 1:3, project = "force")
#'
#'   # Extract a Q matrix
#'   q_mat_k3 <- get_q_matrix(admix_obj, k = 3, run = 1)
#'
#'   tidy(q_mat_k3, data = example_gt)
#' }
tidy.q_matrix <- function(x, data, ...) {
  rlang::check_dots_empty()

  # convert to tibble
  q_tbl <- x %>%
    tibble::as_tibble()

  # use attributes if supplied
  if (!is.null(attr(x, "id")) && !is.null(attr(x, "group"))) {
    q_tbl$id <- attr(x, "id")
    q_tbl$group <- attr(x, "group")
    for (i in seq_len(ncol(q_tbl))) {
      attr(q_tbl[[i]], "id") <- NULL
      attr(q_tbl[[i]], "group") <- NULL
      attr(q_tbl[[i]], "class") <- NULL
    }

    # otherwise use supplied gen_tibble
  } else if (
    is.null(attr(x, "id")) &&
      is.null(attr(x, "group")) &&
      inherits(data, "grouped_df")
  ) {
    # nolint
    groupdf <- data %>% dplyr::group_data()
    colu <- seq(1, nrow(data))
    for (i in seq_len(nrow(groupdf))) {
      group_rows <- groupdf$.rows[[i]]
      group_name <- groupdf[[1]][i]
      colu[group_rows] <- as.character(group_name)
    }
    q_tbl <- q_tbl %>%
      dplyr::mutate(
        id = data$id,
        group = colu
      )
  } else {
    q_tbl <- q_tbl %>%
      dplyr::mutate(id = data$id)
  }

  # pivot the resulting data
  q_tbl <- q_tbl %>%
    tidyr::pivot_longer(
      cols = dplyr::starts_with(".Q"),
      names_to = "q",
      values_to = "percentage"
    ) %>%
    dplyr::mutate(percentage = as.numeric(.data$percentage))

  q_tbl
}

#' Augment data with information from a q_matrix object
#'
#' Augment for `q_matrix` accepts a model object and a dataset and adds Q values
#' to each observation in the dataset. Q values  are stored in separate columns,
#' which is given name with the pattern ".Q1",".Q2", etc. For consistency with
#' [broom::augment.prcomp], a column ".rownames" is also returned; it is a copy
#' of 'id', but it ensures that any scripts written for data augmented with
#' [broom::augment.prcomp] will work out of the box (this is especially helpful
#' when adapting plotting scripts).
#' @param x  A `q_matrix` object
#' @param data the `gen_tibble` used to run the clustering algorithm
#' @param ... Not used. Needed to match generic signature only.
#' @return A  [gen_tibble] containing the original data along with additional
#'   columns containing each observation's Q values.
#' @export
#' @name augment_q_matrix
#' @examples
#' # run the example only if we have the package installed
#' if (requireNamespace("LEA", quietly = TRUE)) {
#'   example_gt <- load_example_gt("gen_tbl")
#'
#'   # Create a gt_admix object
#'   admix_obj <- example_gt %>% gt_snmf(k = 1:3, project = "force")
#'
#'   # Extract a Q matrix
#'   q_mat_k3 <- get_q_matrix(admix_obj, k = 3, run = 1)
#'
#'   # Augment the gen_tibble with Q values
#'   augment(q_mat_k3, data = example_gt)
#' }
augment.q_matrix <- function(x, data = NULL, ...) {
  if (inherits(data, "grouped_df")) {
    if (!".rownames" %in% names(data)) {
      group_vars <- group_vars(data)
      data <- data %>% dplyr::ungroup()
      data <- data %>%
        dplyr::mutate(.rownames = row_number())
      data <- data %>% dplyr::group_by(across(all_of(group_vars)))
    }
  } else {
    if (!".rownames" %in% names(data)) {
      data <- data %>%
        dplyr::mutate(.rownames = data$id)
    }
  }

  q_tbl <- x %>%
    tibble::as_tibble()

  if (!is.null(attr(x, "id")) && !is.null(attr(x, "group"))) {
    q_tbl$id <- attr(x, "id")
    q_tbl$group <- attr(x, "group")

    for (i in seq_len(ncol(q_tbl))) {
      attr(q_tbl[[i]], "id") <- NULL
      attr(q_tbl[[i]], "group") <- NULL
      attr(q_tbl[[i]], "class") <- NULL
    }
  } else if (
    is.null(attr(x, "id")) &&
      is.null(attr(x, "group")) &&
      inherits(data, "grouped_df")
  ) {
    # nolint
    groupdf <- data %>% dplyr::group_data()
    colu <- seq(1, nrow(data))

    for (i in seq_len(nrow(groupdf))) {
      group_rows <- groupdf$.rows[[i]]
      group_name <- groupdf[[1]][i]
      colu[group_rows] <- as.character(group_name)
    }

    q_tbl <- q_tbl %>%
      dplyr::mutate(
        id = data$id,
        group = colu
      )
  } else {
    q_tbl <- q_tbl %>%
      dplyr::mutate(id = data$id)
  }
  q_tbl

  # and finally left join
  data <- dplyr::left_join(data, q_tbl, by = "id")

  if (inherits(data, "grouped_df")) {
    data <- data %>% dplyr::group_by(across(group_vars(data)))
    class(data) <- c("gen_tbl", class(data))
  }
  data
}


#' Autoplots for `q_matrix` objects
#'
#' This autoplot will automatically rearrange individuals according to their id
#' and any grouping variables if an associated 'data' gen_tibble is provided. To
#' avoid any automatic re-sorting of individuals, set `arrange_by_group` and
#' `arrange_by_indiv` to FALSE.
#'
#' @param object A Q matrix object (as returned by [q_matrix()]).
#' @param data An associated tibble (e.g. a [`gen_tibble`]), with the
#'   individuals in the same order as the data used to generate the Q matrix
#' @param annotate_group Boolean determining whether to annotate the plot with
#'   the group information
#' @param arrange_by_group Boolean determining whether to arrange the
#'   individuals by group. If the grouping variable in the `gen_tibble` or the
#'   metadata of the `gt_admixt` object is a factor, the data will be ordered by
#'   the levels of the factor; else it will be ordered alphabetically.
#' @param arrange_by_indiv Boolean determining whether to arrange the
#'   individuals by their individual id (if arrange_by_group is TRUE, they will
#'   be arranged by group first and then by individual id, i.e. within each
#'   group). If `id` in the `get_tibble` or the metadata of the `gt_admix`
#'   object is a factor, it will be ordered by the levels of the factor; else it
#'   will be ordered alphabetically.
#' @param reorder_within_groups Boolean determining whether to reorder the
#'   individuals within each group based on their ancestry proportion (note that
#'   this is not advised if you are making multiple plots, as you would get a
#'   different order for each plot!). If TRUE, `annotate_group` must also be
#'   TRUE.
#' @param ... not currently used.
#' @returns a barplot of individuals, coloured by ancestry proportion
#' @name autoplot_q_matrix
#' @export
#' @examples
#' # Read example gt_admix obejct
#' admix_obj <-
#'   readRDS(system.file("extdata", "anolis", "anole_adm_k3.rds",
#'     package = "tidypopgen"
#'   ))
#'
#' # Extract a Q matrix
#' q_mat_k3 <- get_q_matrix(admix_obj, k = 3, run = 1)
#'
#' # Basic autoplot
#' autoplot(q_mat_k3, annotate_group = FALSE, arrange_by_group = FALSE)
#'
#' # To arrange individuals by group and by Q proportion
#' autoplot(q_mat_k3,
#'   annotate_group = TRUE, arrange_by_group = TRUE,
#'   arrange_by_indiv = TRUE, reorder_within_groups = TRUE
#' )
#'
autoplot.q_matrix <- function(
    object,
    data = NULL,
    annotate_group = TRUE,
    arrange_by_group = TRUE,
    arrange_by_indiv = TRUE,
    reorder_within_groups = FALSE,
    ...) {
  rlang::check_dots_empty()

  # test that if reorder_within_groups is TRUE, annotate_group is also TRUE
  if (reorder_within_groups && !annotate_group) {
    stop("If reorder_within_groups is TRUE, annotate_group should also be TRUE")
  }

  K <- ncol(object) # nolint
  # create dataset if we don't have a gen_tibble
  if (is.null(data)) {
    q_tbl <- as.data.frame(object)
    # if the q_matrix has an id attribute, add it to the data
    if ("id" %in% names(attributes(object))) {
      q_tbl$id <- attr(object, "id")
    } else {
      q_tbl$id <- seq_len(nrow(q_tbl))
    }
    # if the q_matrix has a group attribute, add it to the data
    if ("group" %in% names(attributes(object))) {
      q_tbl$group <-
        rep(
          attr(object, "group"),
          each = nrow(q_tbl) / length(attr(object, "group"))
        )
    }
  } else {
    # if we have the info from the gen_tibble
    # if there is geometry, drop it
    data <- sf::st_drop_geometry(data)
    q_tbl <- augment(object, data)
  }
  # if we have a grouping variable and we plan to use it, then reorder by it
  q_tbl$id <- seq_len(nrow(q_tbl))
  if (("group" %in% names(q_tbl)) && annotate_group) {
    if (arrange_by_group && arrange_by_indiv) {
      q_tbl <- q_tbl %>%
        dplyr::arrange(.data$group, .data$id)
    } else if (arrange_by_group) {
      q_tbl <- q_tbl %>%
        dplyr::arrange(.data$group)
    } else if (arrange_by_indiv) {
      q_tbl <- q_tbl %>%
        dplyr::arrange(.data$group)
    }
  }
  # now reset the id to the new order
  q_tbl$id <- seq_len(nrow(q_tbl))
  q_tbl <- q_tbl %>%
    tidyr::pivot_longer(
      cols = dplyr::starts_with(".Q"),
      names_to = "q",
      values_to = "percentage"
    ) %>%
    dplyr::mutate(percentage = as.numeric(.data$percentage))

  # if we reorder within group
  if (("group" %in% names(q_tbl)) && reorder_within_groups) {
    q_tbl <- q_tbl %>%
      dplyr::group_by(.data$group, .data$id) %>%
      # only sort by group, so that we don't reorder individuals within groups
      dplyr::arrange(.data$group) %>%
      dplyr::mutate(
        q = factor(
          .data$q,
          levels = .data$q[order(.data$percentage, decreasing = FALSE)]
        )
      ) # nolint

    dominant_q <- q_tbl %>%
      dplyr::group_by(.data$id) %>%
      dplyr::summarize(dominant_q = max(.data$percentage), .groups = "drop")

    q_tbl <- q_tbl %>%
      dplyr::left_join(dominant_q, by = "id")

    q_tbl <- q_tbl %>%
      dplyr::group_by(.data$group) %>%
      dplyr::arrange(desc(.data$dominant_q), .by_group = TRUE)
  }

  levels_q <- unique(q_tbl$id)
  q_tbl <- q_tbl %>%
    dplyr::mutate(id = factor(.data$id, levels = levels_q))

  plt <- ggplot2::ggplot(
    q_tbl,
    ggplot2::aes(
      x = .data$id,
      y = .data$percentage,
      fill = .data$q
    )
  ) +
    ggplot2::geom_col(
      width = 1,
      position = "stack"
    ) +
    ggplot2::labs(y = paste("K = ", K)) +
    theme_distruct() +
    scale_fill_distruct()

  if (annotate_group) {
    if (!"group" %in% names(q_tbl)) {
      warning(paste(
        "no annotation possible if 'gen_tbl' is NULL and",
        "q_matrix does not contain group information"
      ))
    } else {
      plt <- plt + annotate_group_info(q_tbl, plt) # nolint
    }
  }
  plt
}
# thin vertical lines show when plot is saved as .pdf and opened with certain
# viewers, this is a product of the specific viewer (seen on unix and mac),
# knitting to html instead fixes, or choosing a different output (not pdf)

Try the tidypopgen package in your browser

Any scripts or data that you put into this service are public.

tidypopgen documentation built on Aug. 28, 2025, 1:08 a.m.