R/gt_admixture.R

Defines functions gt_admixture

Documented in gt_admixture

#' Run ADMIXTURE from R
#'
#' This function runs ADMIXTURE, taking either a `gen_tibble` or a file as an
#' input. This is a wrapper that runs ADMIXTURE from the command line, and reads
#' the output into R. It can run multiple values of `k` and multiple repeats for
#' each `k`.
#'
#' @details This is a wrapper for the command line program ADMIXTURE. It can
#'   either use a binary present in the main environment, or use a copy
#'   installed in a conda environment.
#' @param x a `gen_tibble` or a character giving the path of the input PLINK bed
#'   file
#' @param k an integer giving the number of clusters
#' @param n_runs the number of runs for each k value (defaults to 1)
#' @param crossval boolean, should cross validation be used to assess the fit
#'   (defaults to FALSE)
#' @param n_cores number of cores (defaults to 1)
#' @param seed the seed for the random number generator (defaults to NULL)
#' @param conda_env the name of the conda environment to use. "none" forces the
#'   use of a local copy, whilst any other string will direct the function to
#'   use a custom conda environment.
#' @return an object of class `gt_admix` consisting of a list with the following
#'   elements:
#' - `k` the number of clusters
#' - `Q` a matrix with the admixture proportions
#' - `P` a matrix with the allele frequencies
#' - `log` a log of the output generated by ADMIXTURE
#'    (usually printed on the screen when running from the command line)
#' - `cv` the cross validation error (if `crossval` is TRUE)
#' - `loglik` the log likelihood of the model
#' - `id` the id column of the input `gen_tibble` (if applicable)
#' - `group` the group column of the input `gen_tibble` (if applicable)
#' @export
#' @examples
#' # run the example only if we have the package installed
#' \dontrun{
#' bed_file <-
#'   system.file("extdata", "lobster", "lobster.bed", package = "tidypopgen")
#' lobsters <- gen_tibble(bed_file,
#'   backingfile = tempfile("lobsters"),
#'   quiet = TRUE
#' )
#' lobsters <- lobsters %>% group_by(population)
#' gt_admixture(lobsters,
#'   k = 2:3, seed = c(1, 2),
#'   n_runs = 2, crossval = TRUE
#' )
#' }
# If the package `tidygenclust` is installed, and its conda environment has been
# set up with `ADMIXTURE` in it (the default), it will automatically use that
# version unless you change `conda_env` to "none". If set to "auto", the
# default, a copy from `tidygenclust` will be preferred if available, otherwise
# a local copy will be used.
gt_admixture <- function(
    x,
    k,
    n_runs = 1,
    crossval = FALSE,
    n_cores = 1,
    seed = NULL,
    conda_env = "auto") {
  # check that we have the right number of repeats
  if (length(seed) != n_runs) {
    stop("'seed' should be a vector of length 'n_runs'")
  }

  # if x is a character, check that it is file that exists
  if (is.character(x)) {
    if (!file.exists(x)) {
      stop("The file ", x, " does not exist")
    }
    input_file <- x
  } else {
    # if x is a gen_tibble
    if (!inherits(x, "gen_tbl")) {
      stop("x must be a gen_tibble or a character")
    }
    # write the gen_tibble to a temp file
    input_file <- gt_as_plink(x, file = tempfile(), chromosomes_as_int = TRUE)
  }

  # expand path to file to be full path
  input_file <- normalizePath(input_file)

  # cast k as an integer
  k <- as.integer(k)

  # set and check the conda environment
  # if there is no reticulate
  if (!requireNamespace("reticulate", quietly = TRUE)) {
    # but we have not set conda_env to none
    if (conda_env != "none") {
      # if we have auto, set to none
      if (conda_env == "auto") {
        conda_env <- "none"
      } else {
        stop(paste(
          "The package reticulate is needed to use",
          "a conda environemnt in R."
        ))
      }
    }
  } else {
    # if reticulate is available
    # if we have "auto"
    if (conda_env == "auto") {
      conda_envs <- reticulate::conda_list()[["name"]]
      # if we are on Linux and have ctidygenclust installed
      if (
        (Sys.info()["sysname"] == "Linux") &&
          ("ctidygenclust" %in% conda_envs)
      ) {
        conda_env <- "ctidygenclust"
      } else if (
        (Sys.info()["sysname"] == "Darwin") &&
          ("cadmixture86" %in% conda_envs)
      ) {
        # else on osx use cadmixture86
        conda_env <- "cadmixture86"
      } else { # if we have no conda environment
        conda_env <- "none"
      }

      # check that the conda environment does exist
      if (
        (conda_env != "none") &&
          (!conda_env %in% reticulate::conda_list()[["name"]])
      ) {
        # nolint
        stop("The conda environment ", conda_env, " does not exist.")
      }
    }
  }

  # if conda is "none", check that we have admixture installed
  if (conda_env == "none") {
    if (system2("which", args = "admixture", stdout = NULL) != 0) {
      stop("ADMIXTURE is not installed in this path")
    }
  }

  # set the working path to the tempdir, where admixture will dump text files
  out <- tempdir()
  # store working directory
  wd <- getwd()
  # change to the directory of the input file
  setwd(out)
  on.exit(setwd(wd))

  # initialise list to store results
  adm_list <- list(
    k = integer(),
    Q = list(),
    P = list(),
    log = list(),
    loglik = numeric()
  )
  if (crossval) {
    adm_list$cv <- numeric()
  }
  class(adm_list) <- c("gt_admix", "list")

  # loop over values of k and number of repeats
  index <- 1
  for (this_k in as.integer(k)) {
    for (this_rep in seq_len(n_runs)) {
      # set the arguments for admixture
      admixture_args <- paste(input_file, this_k)
      if (crossval) {
        admixture_args <- paste(admixture_args, "--cv")
      }
      if (n_cores > 1) {
        admixture_args <- paste(admixture_args, paste0("-j", n_cores))
      }
      if (!is.null(seed)) {
        admixture_args <- paste(admixture_args, paste0("-s ", seed[index]))
      }

      # if we conda_env is none
      if (conda_env == "none") {
        adm_out <- system2("admixture", args = admixture_args, stdout = TRUE)
        # change back to the original working directory
      } else {
        adm_out <- reticulate::conda_run2(
          "admixture",
          args = admixture_args,
          envname = conda_env,
          intern = TRUE
        )
      }

      # build expected .Q filename
      q_file <- file.path(out, paste0(
        gsub(".bed$", "", basename(input_file)),
        ".", this_k, ".Q"
      ))
      # check if no .Q files were written and if adm_out contains "Error:"
      # stop and print adm_out if both are true
      if (
        !file.exists(q_file) &&
          length(grep("Error:", adm_out)) > 0
      ) {
        # nolint
        stop(adm_out)
      }

      # read the output
      output_prefix <- file.path(out, gsub(".bed", "", basename(input_file)))
      adm_list$k[index] <- this_k
      adm_list$Q[[index]] <-
        q_matrix(utils::read.table(
          paste(output_prefix, this_k, "Q", sep = "."),
          header = FALSE
        ))
      adm_list$P[[index]] <- utils::read.table(
        paste(output_prefix, this_k, "P", sep = "."),
        header = FALSE
      ) # nolint
      adm_list$loglik[index] <- as.numeric(strsplit(
        grep("^Loglikelihood", adm_out, value = TRUE),
        ":"
      )[[1]][2]) # nolint
      adm_list$log[[index]] <- adm_out

      if (crossval) {
        # extract value from line with CV error (number after :)
        adm_list$cv[index] <- as.numeric(strsplit(
          grep("CV error", adm_out, value = TRUE),
          ":"
        )[[1]][2]) # nolint
      }
      index <- index + 1
    }
  }

  # add metadata if x is a gen_tibble
  if (inherits(x, "gen_tbl")) {
    adm_list$id <- x$id
    # if it is grouped, add the group
    if (inherits(x, "grouped_gen_tbl")) {
      adm_list$group <- x[[dplyr::group_vars(x)]]
    }
  }

  # add info on algorithm
  adm_list$algorithm <- "ADMIXTURE"

  return(adm_list)
}

Try the tidypopgen package in your browser

Any scripts or data that you put into this service are public.

tidypopgen documentation built on Aug. 28, 2025, 1:08 a.m.